cpu: Add TraceCPU to playback elastic traces
[gem5.git] / src / cpu / trace / trace_cpu.hh
1 /*
2 * Copyright (c) 2013 - 2015 ARM Limited
3 * All rights reserved
4 *
5 * The license below extends only to copyright in the software and shall
6 * not be construed as granting a license to any other intellectual
7 * property including but not limited to intellectual property relating
8 * to a hardware implementation of the functionality of the software
9 * licensed hereunder. You may use the software subject to the license
10 * terms below provided that you ensure that this notice is replicated
11 * unmodified and in its entirety in all distributions of the software,
12 * modified or unmodified, in source code or in binary form.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met: redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer;
18 * redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution;
21 * neither the name of the copyright holders nor the names of its
22 * contributors may be used to endorse or promote products derived from
23 * this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 *
37 * Authors: Radhika Jagtap
38 * Andreas Hansson
39 * Thomas Grass
40 */
41
42 #ifndef __CPU_TRACE_TRACE_CPU_HH__
43 #define __CPU_TRACE_TRACE_CPU_HH__
44
45 #include <array>
46 #include <cstdint>
47 #include <queue>
48 #include <set>
49 #include <unordered_map>
50
51 #include "arch/registers.hh"
52 #include "base/statistics.hh"
53 #include "cpu/base.hh"
54 #include "debug/TraceCPUData.hh"
55 #include "debug/TraceCPUInst.hh"
56 #include "params/TraceCPU.hh"
57 #include "proto/inst_dep_record.pb.h"
58 #include "proto/packet.pb.h"
59 #include "proto/protoio.hh"
60 #include "sim/sim_events.hh"
61
62 /**
63 * The trace cpu replays traces generated using the elastic trace probe
64 * attached to the O3 CPU model. The elastic trace is an execution trace with
65 * register data dependencies and ordering dependencies annotated to it. The
66 * trace cpu also replays a fixed-timestamp fetch trace that is generated by
67 * the elastic trace probe. This trace cpu model aims to achieve faster
68 * simulation than the detailed cpu model and good correlation when the
69 * same trace is played back on different memory sub-systems.
70 *
71 * The TraceCPU inherits from BaseCPU so some virtual methods need to be
72 * defined. It has two port subclasses inherited from MasterPort for the
73 * instruction and data ports. It issues memory requests, deducing their
74 * timing from the trace, without actually executing the micro-ops. As
75 * soon as the last dependency for an instruction is complete, its
76 * computational delay, also provided in the input trace, is added. The
77 * dependency-free nodes are maintained in a list, called 'ReadyList', ordered
78 * by ready time. Instructions which depend on a load stall until the
79 * responses for the read requests are received, thus achieving elastic
80 * replay. If a dependency is not found when adding a new node, it is assumed
81 * complete. Thus, if a node is found to be completely dependency-free, its
82 * issue time is calculated and it is added to the ready list immediately.
83 * This is encapsulated in the subclass ElasticDataGen.
84 *
85 * If ready nodes are issued in an unconstrained way, there can be more nodes
86 * outstanding, which results in divergence in timing compared to the O3CPU.
87 * Therefore, the Trace CPU also models hardware resources. A sub-class to
88 * model hardware resources contains the maximum sizes of the load buffer,
89 * store buffer and ROB. If resources are not available, the node is not
90 * issued. Such pending nodes are held in the 'depFreeQueue' structure.
91 *
92 * Of all the modelled resources, the ROB size is arguably the most important
93 * limitation. The ROB occupancy is estimated using the newly added field
94 * 'robNum'. We need to use the ROB number because the sequence number is at
95 * times much higher due to squashing, and trace replay is focused on
96 * correct path modeling.
97 *
98 * A map called 'inFlightNodes' is added to track not only the nodes in the
99 * readyList but also the load nodes that have been executed (and thus removed
100 * from the readyList) but are not yet complete. The readyList determines what
101 * to execute next and when, while inFlightNodes is used for resource
102 * modelling. The oldest ROB number is updated when any node occupies the ROB
103 * or when an entry in the ROB is released. The ROB occupancy is equal to the
104 * difference between the ROB number of the newly dependency-free node and
105 * the oldest ROB number in flight.
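*
* As an illustration, the occupancy estimate above amounts to a check along
* the lines of the sketch below (illustrative names, not the exact code of
* HardwareResource::isAvailable()):
*
* @code
* // Would issuing a node with ROB number 'new_rob_num' overflow the ROB,
* // given the oldest ROB number still in flight?
* bool robHasSpace(uint64_t new_rob_num, uint64_t oldest_in_flight_rob_num,
*                  uint16_t size_rob)
* {
*     // Occupancy is the distance between the new node's ROB number and
*     // the oldest in-flight ROB number.
*     return (new_rob_num - oldest_in_flight_rob_num) < size_rob;
* }
* @endcode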
106 *
107 * If no node depends on a non-load/store node then there is no reason to
108 * track it in the dependency graph. We filter out such nodes but count them
109 * and add a weight field to the subsequent node that we do include in the
110 * trace. The weight field is used to model ROB occupancy during replay.
111 *
112 * The depFreeQueue is chosen to be FIFO so that child nodes which are in
113 * program order get pushed into it in that order and are thus issued in
114 * program order, like in the O3CPU. This is also why the dependents container
115 * was changed from std::set to std::vector. We only check the head of the
116 * depFreeQueue, as nodes are issued in order, and blocking on the head models
117 * that better than looping over the entire queue. An alternative would be to
118 * inspect the top N pending nodes, where N is the issue width. This is left
119 * for future work as the timing correlation looks good as it is.
120 *
121 * At the start of an execution event, we first attempt to issue such pending
122 * nodes by checking if the required resources have become available. If so,
123 * we compute their execute tick with respect to the current time. Then we
124 * proceed to complete nodes from the readyList.
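*
* A simplified sketch of that event follows (the names are members of the
* ElasticDataGen class below; the real execute() additionally refills the
* read window, releases dependents and reschedules itself):
*
* @code
* // 1. Retry nodes that were stalled on hardware resources, oldest first.
* while (!depFreeQueue.empty() &&
*        hwResource.isAvailable(depFreeQueue.front())) {
*     checkAndIssue(depFreeQueue.front(), false);
*     depFreeQueue.pop();
* }
* // 2. Execute ready nodes whose execute tick is due.
* auto itr = readyList.begin();
* while (itr != readyList.end() && itr->execTick <= curTick()) {
*     GraphNode *node = depGraph[itr->seqNum];
*     if (node->isLoad || node->isStore)
*         executeMemReq(node); // returns a packet to retry on failure
*     // comp nodes complete here; dependents get released (not shown)
*     itr = readyList.erase(itr);
* }
* @endcode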
125 *
126 * When a read response is received, sometimes a dependency on it that was
127 * supposed to be released when it was issued is still not released. This
128 * occurs because the dependent gets added to the graph after the read was
129 * sent. So the check is made less strict and the dependency is marked complete
130 * on read response instead of insisting that it should have been removed on
131 * read sent.
132 *
133 * There is a check for requests spanning two cache lines, as this condition
134 * triggers an assertion failure in the L1 cache. If a request does, its size
135 * is truncated to access only up to the end of that line and the remainder is
136 * ignored. Strictly-ordered requests are skipped and the dependencies on such
137 * requests are handled by simply marking them complete immediately.
138 *
139 * The simulated seconds can be calculated as the difference between the
140 * final_tick stat and the tickOffset stat. A CountedExitEvent that contains a
141 * static int belonging to the Trace CPU class as a down counter is used to
142 * implement multi Trace CPU simulation exit.
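*
* For example, with gem5's default tick resolution of 1 tick = 1 ps, a replay
* whose stats dump shows final_tick = 2,500,000,000 and tickOffset =
* 500,000,000 covers (2.5e9 - 0.5e9) ticks = 2e9 ps = 2 ms of simulated time
* (these numbers are made up purely for illustration).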
143 */
144
145 class TraceCPU : public BaseCPU
146 {
147
148 public:
149 TraceCPU(TraceCPUParams *params);
150 ~TraceCPU();
151
152 void init();
153
154 /**
155 * This is a pure virtual function in BaseCPU. As we don't know how many
156 * insts are in the trace but only how many micro-ops there are, we
157 * cannot count this stat.
158 *
159 * @return 0
160 */
161 Counter totalInsts() const
162 {
163 return 0;
164 }
165
166 /**
167 * Return totalOps as the number of committed micro-ops plus the
168 * speculatively issued loads that are modelled in the TraceCPU replay.
169 *
170 * @return number of micro-ops i.e. nodes in the elastic data generator
171 */
172 Counter totalOps() const
173 {
174 return dcacheGen.getMicroOpCount();
175 }
176
177 /* Pure virtual function in BaseCPU. Do nothing. */
178 void wakeup(ThreadID tid = 0)
179 {
180 return;
181 }
182
183 /*
184 * When resuming from checkpoint in FS mode, the TraceCPU takes over from
185 * the old cpu. This function overrides the takeOverFrom() function in the
186 * BaseCPU. It unbinds the ports of the old CPU and binds the ports of the
187 * TraceCPU.
188 */
189 void takeOverFrom(BaseCPU *oldCPU);
190
191 /**
192 * When instruction cache port receives a retry, schedule event
193 * icacheNextEvent.
194 */
195 void icacheRetryRecvd();
196
197 /**
198 * When data cache port receives a retry, schedule event
199 * dcacheNextEvent.
200 */
201 void dcacheRetryRecvd();
202
203 /**
204 * When the data cache port receives a response, this calls the dcache
205 * generator's method that handles completing the load writeback.
206 *
207 * @param pkt Pointer to packet received
208 */
209 void dcacheRecvTimingResp(PacketPtr pkt);
210
211 /**
212 * Schedule event dcacheNextEvent at the given tick
213 *
214 * @param when Tick at which to schedule event
215 */
216 void schedDcacheNextEvent(Tick when);
217
218 protected:
219
220 /**
221 * IcachePort class that interfaces with L1 Instruction Cache.
222 */
223 class IcachePort : public MasterPort
224 {
225 public:
226 /** Constructor. */
227 IcachePort(TraceCPU* _cpu)
228 : MasterPort(_cpu->name() + ".icache_port", _cpu),
229 owner(_cpu)
230 { }
231
232 public:
233 /**
234 * Receive the timing response and simply delete the packet since
235 * instruction fetch requests are issued as per the timing in the trace
236 * and responses are ignored.
237 *
238 * @param pkt Pointer to packet received
239 * @return true
240 */
241 bool recvTimingResp(PacketPtr pkt);
242
243 /**
244 * Required by the port interface but do nothing.
245 *
246 * @param pkt Pointer to packet received
247 */
248 void recvTimingSnoopReq(PacketPtr pkt) { }
249
250 /**
251 * Handle a retry signalled by the cache if instruction read failed in
252 * the first attempt.
253 */
254 void recvReqRetry();
255
256 private:
257 TraceCPU* owner;
258 };
259
260 /**
261 * DcachePort class that interfaces with L1 Data Cache.
262 */
263 class DcachePort : public MasterPort
264 {
265
266 public:
267 /** Constructor. */
268 DcachePort(TraceCPU* _cpu)
269 : MasterPort(_cpu->name() + ".dcache_port", _cpu),
270 owner(_cpu)
271 { }
272
273 public:
274
275 /**
276 * Receive the timing response and call the owner's dcacheRecvTimingResp()
277 * method to handle completing the load.
278 *
279 * @param pkt Pointer to packet received
280 * @return true
281 */
282 bool recvTimingResp(PacketPtr pkt);
283
284 /**
285 * Required by the port interface but do nothing.
286 *
287 * @param pkt Pointer to packet received
288 */
289 void recvTimingSnoopReq(PacketPtr pkt)
290 { }
291
292 /**
293 * Required by the port interface but do nothing.
294 *
295 * @param pkt Pointer to packet received
296 */
297 void recvFunctionalSnoop(PacketPtr pkt)
298 { }
299
300 /**
301 * Handle a retry signalled by the cache if data access failed in the
302 * first attempt.
303 */
304 void recvReqRetry();
305
306 /**
307 * Required by the port interface.
308 *
309 * @return true since we have to snoop
310 */
311 bool isSnooping() const { return true; }
312
313 private:
314 TraceCPU* owner;
315 };
316
317 /** Port to connect to L1 instruction cache. */
318 IcachePort icachePort;
319
320 /** Port to connect to L1 data cache. */
321 DcachePort dcachePort;
322
323 /** Master id for instruction read requests. */
324 const MasterID instMasterID;
325
326 /** Master id for data read and write requests. */
327 const MasterID dataMasterID;
328
329 /** File names for input instruction and data traces. */
330 std::string instTraceFile, dataTraceFile;
331
332 /**
333 * Generator to read protobuf trace containing memory requests at fixed
334 * timestamps, perform flow control and issue memory requests. If the L1
335 * cache port sends the packet successfully, determine the tick to send the
336 * next packet; else wait for a retry from the cache.
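*
* The intended interplay with the owning TraceCPU is roughly the following
* (an illustrative sketch of what schedIcacheNext() describes, not the
* literal code):
*
* @code
* // On icacheNextEvent, or when a retry is received from the cache:
* if (icacheGen.tryNext()) {
*     // Packet accepted: schedule the next fetch at the traced timestamp.
*     schedule(icacheNextEvent, curTick() + icacheGen.tickDelta());
* }
* // Otherwise wait: the cache will call recvReqRetry(), which leads to
* // icacheRetryRecvd() rescheduling icacheNextEvent.
* @endcode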
337 */
338 class FixedRetryGen
339 {
340
341 private:
342
343 /**
344 * This struct stores a line in the trace file.
345 */
346 struct TraceElement {
347
348 /** Specifies if the request is to be a read or a write */
349 MemCmd cmd;
350
351 /** The address for the request */
352 Addr addr;
353
354 /** The size of the access for the request */
355 Addr blocksize;
356
357 /** The time at which the request should be sent */
358 Tick tick;
359
360 /** Potential request flags to use */
361 Request::FlagsType flags;
362
363 /** Instruction PC */
364 Addr pc;
365
366 /**
367 * Check validity of this element.
368 *
369 * @return if this element is valid
370 */
371 bool isValid() const {
372 return cmd != MemCmd::InvalidCmd;
373 }
374
375 /**
376 * Make this element invalid.
377 */
378 void clear() {
379 cmd = MemCmd::InvalidCmd;
380 }
381 };
382
383 /**
384 * The InputStream encapsulates a trace file and the
385 * internal buffers and populates TraceElements based on
386 * the input.
387 */
388 class InputStream
389 {
390
391 private:
392
393 /** Input file stream for the protobuf trace */
394 ProtoInputStream trace;
395
396 public:
397
398 /**
399 * Create a trace input stream for a given file name.
400 *
401 * @param filename Path to the file to read from
402 */
403 InputStream(const std::string& filename);
404
405 /**
406 * Reset the stream such that it can be played once
407 * again.
408 */
409 void reset();
410
411 /**
412 * Attempt to read a trace element from the stream,
413 * and also notify the caller if the end of the file
414 * was reached.
415 *
416 * @param element Trace element to populate
417 * @return True if an element could be read successfully
418 */
419 bool read(TraceElement* element);
420 };
421
422 public:
423 /* Constructor */
424 FixedRetryGen(TraceCPU& _owner, const std::string& _name,
425 MasterPort& _port, MasterID master_id,
426 const std::string& trace_file)
427 : owner(_owner),
428 port(_port),
429 masterID(master_id),
430 trace(trace_file),
431 genName(owner.name() + ".fixedretry" + _name),
432 retryPkt(nullptr),
433 delta(0),
434 traceComplete(false)
435 {
436 }
437
438 /**
439 * Called from TraceCPU init(). Reads the first message from the
440 * input trace file and returns the send tick.
441 *
442 * @return Tick when first packet must be sent
443 */
444 Tick init();
445
446 /**
447 * This tries to send the current or retry packet and returns true if
448 * successful. It calls nextExecute() to read the next message.
449 *
450 * @return bool true if packet is sent successfully
451 */
452 bool tryNext();
453
454 /** Returns name of the FixedRetryGen instance. */
455 const std::string& name() const { return genName; }
456
457 /**
458 * Creates a new request assigning the request parameters passed as
459 * arguments. Calls the port's sendTimingReq() and returns true if
460 * the packet was sent successfully. It is called by tryNext().
461 *
462 * @param addr address of request
463 * @param size size of request
464 * @param cmd if it is a read or write request
465 * @param flags associated request flags
466 * @param pc instruction PC that generated the request
467 *
468 * @return true if packet was sent successfully
469 */
470 bool send(Addr addr, unsigned size, const MemCmd& cmd,
471 Request::FlagsType flags, Addr pc);
472
473 /** Exit the FixedRetryGen. */
474 void exit();
475
476 /**
477 * Reads a line of the trace file and stores the parameters of the
478 * next request to be sent. If the end of the file has been
479 * reached, it returns false.
480 *
481 * @return bool false if end of file has been reached
482 */
483 bool nextExecute();
484
485 /**
486 * Returns the traceComplete variable which is set when end of the
487 * input trace file is reached.
488 *
489 * @return bool true if traceComplete is set, false otherwise.
490 */
491 bool isTraceComplete() { return traceComplete; }
492
493 int64_t tickDelta() { return delta; }
494
495 void regStats();
496
497 private:
498
499 /** Reference of the TraceCPU. */
500 TraceCPU& owner;
501
502 /** Reference of the port to be used to issue memory requests. */
503 MasterPort& port;
504
505 /** MasterID used for the requests being sent. */
506 const MasterID masterID;
507
508 /** Input stream used for reading the input trace file. */
509 InputStream trace;
510
511 /** String to store the name of the FixedRetryGen. */
512 std::string genName;
513
514 /** PacketPtr used to store the packet to retry. */
515 PacketPtr retryPkt;
516
517 /**
518 * Stores the difference in the send ticks of the current and last
519 * packets. Keeping this signed to check overflow to a negative value
520 * which will be caught by assert(delta > 0)
521 */
522 int64_t delta;
523
524 /**
525 * Set to true when end of trace is reached.
526 */
527 bool traceComplete;
528
529 /** Store an element read from the trace to send as the next packet. */
530 TraceElement currElement;
531
532 /** Stats for instruction accesses replayed. */
533 Stats::Scalar numSendAttempted;
534 Stats::Scalar numSendSucceeded;
535 Stats::Scalar numSendFailed;
536 Stats::Scalar numRetrySucceeded;
537 /** Last simulated tick by the FixedRetryGen */
538 Stats::Scalar instLastTick;
539
540 };
541
542 /**
543 * The elastic data memory request generator to read protobuf trace
544 * containing execution trace annotated with data and ordering
545 * dependencies. It deduces the time at which to send a load/store request
546 * by tracking the dependencies. It attempts to send a memory request for a
547 * load/store without performing real execution of micro-ops. If L1 cache
548 * port sends the packet successfully, the generator checks which instructions
549 * became dependency free as a result of this and schedules an event
550 * accordingly. If it fails to send the packet, it waits for a retry from
551 * the cache.
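*
* The elastic part boils down to releasing dependents when a node completes,
* sketched below (simplified; the real code also tracks hardware resources
* and the read window):
*
* @code
* // 'done' has just completed, e.g. its load response arrived.
* for (GraphNode *child : done->dependents) {
*     child->removeDepOnInst(done->seqNum);
*     if (child->numRegDep == 0 && child->numRobDep == 0)
*         checkAndIssue(child); // readyList if resources allow, else queued
* }
* @endcode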
552 */
553 class ElasticDataGen
554 {
555
556 private:
557
558 /** Node sequence number type. */
559 typedef uint64_t NodeSeqNum;
560
561 /** Node ROB number type. */
562 typedef uint64_t NodeRobNum;
563
564 /**
565 * The struct GraphNode stores an instruction in the trace file. The
566 * format of the trace file favours constructing a dependency graph of
567 * the execution and this struct is used to encapsulate the request
568 * data as well as pointers to its dependent GraphNodes.
569 */
570 class GraphNode {
571
572 public:
573 /**
574 * The maximum number of ROB dependencies. At most two order
575 * dependencies can exist, in the case of a store. For a load
576 * or comp node there can be at most one order dependency.
577 */
578 static const uint8_t maxRobDep = 2;
579
580 /** Typedef for the array containing the ROB dependencies */
581 typedef std::array<NodeSeqNum, maxRobDep> RobDepArray;
582
583 /** Typedef for the array containing the register dependencies */
584 typedef std::array<NodeSeqNum, TheISA::MaxInstSrcRegs> RegDepArray;
585
586 /** Instruction sequence number */
587 NodeSeqNum seqNum;
588
589 /** ROB occupancy number */
590 NodeRobNum robNum;
591
592 /** If instruction is a load */
593 bool isLoad;
594
595 /** If instruction is a store */
596 bool isStore;
597
598 /** The address for the request if any */
599 Addr addr;
600
601 /** Size of request if any */
602 uint32_t size;
603
604 /** Request flags if any */
605 Request::Flags flags;
606
607 /** Instruction PC */
608 Addr pc;
609
610 /** Array of order dependencies. */
611 RobDepArray robDep;
612
613 /** Number of order dependencies */
614 uint8_t numRobDep;
615
616 /** Computational delay */
617 uint64_t compDelay;
618
619 /**
620 * Array of register dependencies (incoming) if any. Maximum number
621 * of source registers used to set maximum size of the array
622 */
623 RegDepArray regDep;
624
625 /** Number of register dependencies */
626 uint8_t numRegDep;
627
628 /**
629 * A vector of nodes dependent (outgoing) on this node. A
630 * sequential container is chosen because when dependents become
631 * free, they attempt to issue in program order.
632 */
633 std::vector<GraphNode *> dependents;
634
635 /** Initialize register dependency array to all zeroes */
636 void clearRegDep();
637
638 /** Initialize ROB dependency array to all zeroes */
639 void clearRobDep();
640
641 /** Remove completed instruction from register dependency array */
642 bool removeRegDep(NodeSeqNum reg_dep);
643
644 /** Remove completed instruction from order dependency array */
645 bool removeRobDep(NodeSeqNum rob_dep);
646
647 /** Check for all dependencies on completed inst */
648 bool removeDepOnInst(NodeSeqNum done_seq_num);
649
650 /** Return true if node has a request which is strictly ordered */
651 bool isStrictlyOrdered() const {
652 return (flags.isSet(Request::STRICT_ORDER));
653 }
654 /**
655 * Write out element in trace-compatible format using debug flag
656 * TraceCPUData.
657 */
658 void writeElementAsTrace() const;
659 };
660
661 /** Struct to store a ready-to-execute node and its execution tick. */
662 struct ReadyNode
663 {
664 /** The sequence number of the ready node */
665 NodeSeqNum seqNum;
666
667 /** The tick at which the ready node must be executed */
668 Tick execTick;
669 };
670
671 /**
672 * The HardwareResource class models structures that hold the in-flight
673 * nodes. When a node becomes dependency free, first check if resources
674 * are available to issue it.
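*
* The expected usage pattern from the generator is roughly the following
* (an illustrative sketch, not the exact call sites):
*
* @code
* if (hwResource.isAvailable(node_ptr)) {
*     hwResource.occupy(node_ptr);   // claim ROB and load/store buffer entry
*     // ... issue the node ...
* } else {
*     depFreeQueue.push(node_ptr);   // stall until resources free up
* }
* // Later, when the node completes:
* hwResource.release(node_ptr);
* // And when a store actually writes back:
* hwResource.releaseStoreBuffer();
* @endcode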
675 */
676 class HardwareResource
677 {
678 public:
679 /**
680 * Constructor that initializes the sizes of the structures.
681 *
682 * @param max_rob size of the Reorder Buffer
683 * @param max_stores size of Store Buffer
684 * @param max_loads size of Load Buffer
685 */
686 HardwareResource(uint16_t max_rob, uint16_t max_stores,
687 uint16_t max_loads);
688
689 /**
690 * Occupy appropriate structures for an issued node.
691 *
692 * @param new_node pointer to the issued node
693 */
694 void occupy(const GraphNode* new_node);
695
696 /**
697 * Release appropriate structures for a completed node.
698 *
699 * @param done_node pointer to the completed node
700 */
701 void release(const GraphNode* done_node);
702
703 /** Release store buffer entry for a completed store */
704 void releaseStoreBuffer();
705
706 /**
707 * Check if structures required to issue a node are free.
708 *
709 * @param new_node pointer to the node ready to issue
710 * @return true if resources are available
711 */
712 bool isAvailable(const GraphNode* new_node) const;
713
714 /**
715 * Check if there are any outstanding requests, i.e. requests for
716 * which we are yet to receive a response.
717 *
718 * @return true if there is at least one read or write request
719 * outstanding
720 */
721 bool awaitingResponse() const;
722
723 /** Print resource occupancy for debugging */
724 void printOccupancy();
725
726 private:
727 /**
728 * The size of the ROB used to throttle the max. number of in-flight
729 * nodes.
730 */
731 const uint16_t sizeROB;
732
733 /**
734 * The size of store buffer. This is used to throttle the max. number
735 * of in-flight stores.
736 */
737 const uint16_t sizeStoreBuffer;
738
739 /**
740 * The size of load buffer. This is used to throttle the max. number
741 * of in-flight loads.
742 */
743 const uint16_t sizeLoadBuffer;
744
745 /**
746 * A map from the sequence number to the ROB number of the in-
747 * flight nodes. This includes all nodes that are in the readyList
748 * plus the loads for which a request has been sent which are not
749 * present in the readyList. But such loads are not yet complete
750 * and thus occupy resources. We need to query the oldest in-flight
751 * node and since a map container keeps all its keys sorted using
752 * the less than criterion, the first element is the in-flight node
753 * with the least sequence number, i.e. the oldest in-flight node.
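*
* For example, the oldest in-flight ROB number can be refreshed with a
* one-liner along these lines (a sketch of the idea only):
*
* @code
* if (!inFlightNodes.empty())
*     oldestInFlightRobNum = inFlightNodes.begin()->second;
* @endcode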
754 */
755 std::map<NodeSeqNum, NodeRobNum> inFlightNodes;
756
757 /** The ROB number of the oldest in-flight node */
758 NodeRobNum oldestInFlightRobNum;
759
760 /** Number of ready loads for which request may or may not be sent */
761 uint16_t numInFlightLoads;
762
763 /** Number of ready stores for which request may or may not be sent */
764 uint16_t numInFlightStores;
765 };
766
767 /**
768 * The InputStream encapsulates a trace file and the
769 * internal buffers and populates GraphNodes based on
770 * the input.
771 */
772 class InputStream
773 {
774
775 private:
776
777 /** Input file stream for the protobuf trace */
778 ProtoInputStream trace;
779
780 /** Count of committed ops read from trace plus the filtered ops */
781 uint64_t microOpCount;
782
783 /**
784 * The window size that is read from the header of the protobuf
785 * trace and used to process the dependency trace
786 */
787 uint32_t windowSize;
788 public:
789
790 /**
791 * Create a trace input stream for a given file name.
792 *
793 * @param filename Path to the file to read from
794 */
795 InputStream(const std::string& filename);
796
797 /**
798 * Reset the stream such that it can be played once
799 * again.
800 */
801 void reset();
802
803 /**
804 * Attempt to read a trace element from the stream,
805 * and also notify the caller if the end of the file
806 * was reached.
807 *
808 * @param element Trace element (GraphNode) to populate
810 * @return True if an element could be read successfully
811 */
812 bool read(GraphNode* element);
813
814 /** Get window size from trace */
815 uint32_t getWindowSize() const { return windowSize; }
816
817 /** Get number of micro-ops modelled in the TraceCPU replay */
818 uint64_t getMicroOpCount() const { return microOpCount; }
819 };
820
821 public:
822 /* Constructor */
823 ElasticDataGen(TraceCPU& _owner, const std::string& _name,
824 MasterPort& _port, MasterID master_id,
825 const std::string& trace_file, uint16_t max_rob,
826 uint16_t max_stores, uint16_t max_loads)
827 : owner(_owner),
828 port(_port),
829 masterID(master_id),
830 trace(trace_file),
831 genName(owner.name() + ".elastic" + _name),
832 retryPkt(nullptr),
833 traceComplete(false),
834 nextRead(false),
835 execComplete(false),
836 windowSize(trace.getWindowSize()),
837 hwResource(max_rob, max_stores, max_loads)
838 {
839 DPRINTF(TraceCPUData, "Window size in the trace is %d.\n",
840 windowSize);
841 }
842
843 /**
844 * Called from TraceCPU init(). Reads the first message from the
845 * input trace file and returns the send tick.
846 *
847 * @return Tick when first packet must be sent
848 */
849 Tick init();
850
851 /** Returns name of the ElasticDataGen instance. */
852 const std::string& name() const { return genName; }
853
854 /** Exit the ElasticDataGen. */
855 void exit();
856
857 /**
858 * Reads a window of entries from the trace file and adds them to
859 * the dependency graph. If the end of the file has been
860 * reached, it returns false.
861 *
862 * @return bool false if end of file has been reached else true
863 */
864 bool readNextWindow();
865
866 /**
867 * Iterate over the dependencies of a new node and add the new node
868 * to the list of dependents of the parent node.
869 *
870 * @param new_node new node to add to the graph
871 * @param dep_array the dependency array, of ROB or register type,
872 * that is to be iterated and may get modified
873 * @param num_dep the number of dependencies set in the array
874 * which may get modified during iteration
875 */
876 template<typename T> void addDepsOnParent(GraphNode *new_node,
877 T& dep_array,
878 uint8_t& num_dep);
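
/*
 * For illustration, a newly read node would typically be linked to both
 * kinds of parents like this (a sketch of the intended use, not the
 * literal call sites in the trace-reading code):
 *
 *   addDepsOnParent(new_node, new_node->robDep, new_node->numRobDep);
 *   addDepsOnParent(new_node, new_node->regDep, new_node->numRegDep);
 */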
879
880 /**
881 * This is the main execute function which consumes nodes from the
882 * sorted readyList. First attempt to issue the pending dependency-free
883 * nodes held in the depFreeQueue. Insert the ready-to-issue nodes into
884 * the readyList. Then iterate through the readyList and when a node
885 * has its execute tick equal to curTick(), execute it. If the node is
886 * a load or a store call executeMemReq() and if it is neither, simply
887 * mark it complete.
888 */
889 void execute();
890
891 /**
892 * Creates a new request for a load or store assigning the request
893 * parameters. Calls the port's sendTimingReq() and returns a packet
894 * if the send failed so that it can be saved for a retry.
895 *
896 * @param node_ptr pointer to the load or store node to be executed
897 *
898 * @return packet pointer if the request failed and nullptr if it was
899 * sent successfully
900 */
901 PacketPtr executeMemReq(GraphNode* node_ptr);
902
903 /**
904 * Add a ready node to the readyList. When inserting, ensure the nodes
905 * are sorted in ascending order of their execute ticks.
906 *
907 * @param seq_num seq. num of ready node
908 * @param exec_tick the execute tick of the ready node
909 */
910 void addToSortedReadyList(NodeSeqNum seq_num, Tick exec_tick);
911
912 /** Print readyList for debugging using debug flag TraceCPUData. */
913 void printReadyList();
914
915 /**
916 * When a load writeback is received, that is when the load completes,
917 * release the dependents on it. This is called from the dcache port
918 * recvTimingResp().
919 */
920 void completeMemAccess(PacketPtr pkt);
921
922 /**
923 * Returns the execComplete variable which is set when the last
924 * node is executed.
925 *
926 * @return bool true if execComplete is set, false otherwise.
927 */
928 bool isExecComplete() const { return execComplete; }
929
930 /**
931 * Attempts to issue a node once the node's source dependencies are
932 * complete. If resources are available then add it to the readyList,
933 * otherwise the node is not issued and is stored in depFreeQueue
934 * until resources become available.
935 *
936 * @param node_ptr pointer to node to be issued
937 * @param first true if this is the first attempt to issue this node
938 * @return true if node was added to readyList
939 */
940 bool checkAndIssue(const GraphNode* node_ptr, bool first = true);
941
942 /** Get number of micro-ops modelled in the TraceCPU replay */
943 uint64_t getMicroOpCount() const { return trace.getMicroOpCount(); }
944
945 void regStats();
946
947 private:
948
949 /** Reference of the TraceCPU. */
950 TraceCPU& owner;
951
952 /** Reference of the port to be used to issue memory requests. */
953 MasterPort& port;
954
955 /** MasterID used for the requests being sent. */
956 const MasterID masterID;
957
958 /** Input stream used for reading the input trace file. */
959 InputStream trace;
960
962 /** String to store the name of the ElasticDataGen. */
962 std::string genName;
963
964 /** PacketPtr used to store the packet to retry. */
965 PacketPtr retryPkt;
966
967 /** Set to true when end of trace is reached. */
968 bool traceComplete;
969
970 /** Set to true when the next window of instructions needs to be read */
971 bool nextRead;
972
973 /** Set true when execution of trace is complete */
974 bool execComplete;
975
976 /**
977 * Window size within which to check for dependencies. Its value is
978 * made equal to the window size used to generate the trace which is
979 * recorded in the trace header. The dependency graph must be
980 * populated enough such that when a node completes, its potential
981 * child node must be found and the dependency removed before the
982 * completed node itself is removed. Thus as soon as the graph shrinks
983 * to become smaller than this window, we read in the next window.
984 */
985 const uint32_t windowSize;
986
987 /**
988 * Hardware resources required to contain in-flight nodes and to
989 * throttle issuing of new nodes when resources are not available.
990 */
991 HardwareResource hwResource;
992
993 /** Store the depGraph of GraphNodes */
994 std::unordered_map<NodeSeqNum, GraphNode*> depGraph;
995
996 /**
997 * Queue of dependency-free nodes that are pending issue because
998 * resources are not available. This is chosen to be FIFO so that
999 * dependent nodes which become free in program order get pushed
1000 * into the queue in that order. Thus nodes are more likely to
1001 * issue in program order.
1002 */
1003 std::queue<const GraphNode*> depFreeQueue;
1004
1005 /** List of nodes that are ready to execute */
1006 std::list<ReadyNode> readyList;
1007
1008 /** Stats for data memory accesses replayed. */
1009 Stats::Scalar maxDependents;
1010 Stats::Scalar maxReadyListSize;
1011 Stats::Scalar numSendAttempted;
1012 Stats::Scalar numSendSucceeded;
1013 Stats::Scalar numSendFailed;
1014 Stats::Scalar numRetrySucceeded;
1015 Stats::Scalar numSplitReqs;
1016 Stats::Scalar numSOLoads;
1017 Stats::Scalar numSOStores;
1018 /** Tick when ElasticDataGen completes execution */
1019 Stats::Scalar dataLastTick;
1020 };
1021
1022 /** Instance of FixedRetryGen to replay instruction read requests. */
1023 FixedRetryGen icacheGen;
1024
1025 /** Instance of ElasticDataGen to replay data read and write requests. */
1026 ElasticDataGen dcacheGen;
1027
1028 /**
1029 * This is the control flow that uses the functionality of the icacheGen to
1030 * replay the trace. It calls tryNext(). If it returns true then next event
1031 * is scheduled at curTick() plus delta. If it returns false then delta is
1032 * ignored and control is brought back via recvRetry().
1033 */
1034 void schedIcacheNext();
1035
1036 /**
1037 * This is the control flow that uses the functionality of the dcacheGen to
1038 * replay the trace. It calls execute(). It checks if execution is complete
1039 * and schedules an event to exit simulation accordingly.
1040 */
1041 void schedDcacheNext();
1042
1043 /** Event for the control flow method schedIcacheNext() */
1044 EventWrapper<TraceCPU, &TraceCPU::schedIcacheNext> icacheNextEvent;
1045
1046 /** Event for the control flow method schedDcacheNext() */
1047 EventWrapper<TraceCPU, &TraceCPU::schedDcacheNext> dcacheNextEvent;
1048
1049 /** This is called when either generator finishes executing from the trace */
1050 void checkAndSchedExitEvent();
1051
1052 /** Set to true when one of the generators finishes replaying its trace. */
1053 bool oneTraceComplete;
1054
1055 /**
1056 * This stores the tick of the first instruction fetch request
1057 * which is later used for dumping the tickOffset stat.
1058 */
1059 Tick firstFetchTick;
1060
1061 /**
1062 * Number of Trace CPUs in the system used as a shared variable and passed
1063 * to the CountedExitEvent event used for counting down exit events. It is
1064 * incremented in the constructor call so that the total is arrived at
1065 * automatically.
1066 */
1067 static int numTraceCPUs;
1068
1069 /**
1070 * A CountedExitEvent which when serviced decrements the counter. A sim
1071 * exit event is scheduled when the counter equals zero, that is all
1072 * instances of Trace CPU have had their execCompleteEvent serviced.
1073 */
1074 CountedExitEvent *execCompleteEvent;
1075
1076 Stats::Scalar numSchedDcacheEvent;
1077 Stats::Scalar numSchedIcacheEvent;
1078
1079 /** Stat for number of simulated micro-ops. */
1080 Stats::Scalar numOps;
1081 /** Stat for the CPI. This is really cycles per micro-op and not inst. */
1082 Stats::Formula cpi;
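
/*
 * A sketch of how this formula could be bound in regStats(), assuming the
 * numCycles stat inherited from BaseCPU (illustrative, not necessarily the
 * exact expression used):
 *
 *   cpi = numCycles / numOps;
 */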
1083
1084 /**
1085 * The first execution tick is dumped as a stat so that the simulated
1086 * seconds for a trace replay can be calculated as a difference between the
1087 * final_tick stat and the tickOffset stat
1088 */
1089 Stats::Scalar tickOffset;
1090
1091 public:
1092
1093 /** Used to get a reference to the icache port. */
1094 MasterPort &getInstPort() { return icachePort; }
1095
1096 /** Used to get a reference to the dcache port. */
1097 MasterPort &getDataPort() { return dcachePort; }
1098
1099 void regStats();
1100 };
1101 #endif // __CPU_TRACE_TRACE_CPU_HH__