cpu: Factor MaxInst(SrcDest)Regs out of the trace CPU.
[gem5.git] / src / cpu / trace / trace_cpu.hh
1 /*
2 * Copyright (c) 2013 - 2016 ARM Limited
3 * All rights reserved
4 *
5 * The license below extends only to copyright in the software and shall
6 * not be construed as granting a license to any other intellectual
7 * property including but not limited to intellectual property relating
8 * to a hardware implementation of the functionality of the software
9 * licensed hereunder. You may use the software subject to the license
10 * terms below provided that you ensure that this notice is replicated
11 * unmodified and in its entirety in all distributions of the software,
12 * modified or unmodified, in source code or in binary form.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met: redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer;
18 * redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution;
21 * neither the name of the copyright holders nor the names of its
22 * contributors may be used to endorse or promote products derived from
23 * this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 #ifndef __CPU_TRACE_TRACE_CPU_HH__
39 #define __CPU_TRACE_TRACE_CPU_HH__
40
41 #include <cstdint>
42 #include <list>
43 #include <queue>
44 #include <set>
45 #include <unordered_map>
46
47 #include "arch/registers.hh"
48 #include "base/statistics.hh"
49 #include "cpu/base.hh"
50 #include "debug/TraceCPUData.hh"
51 #include "debug/TraceCPUInst.hh"
52 #include "params/TraceCPU.hh"
53 #include "proto/inst_dep_record.pb.h"
54 #include "proto/packet.pb.h"
55 #include "proto/protoio.hh"
56 #include "sim/sim_events.hh"
57
58 /**
59 * The trace cpu replays traces generated using the elastic trace probe
60 * attached to the O3 CPU model. The elastic trace is an execution trace with
61 * register data dependencies and ordering dependencies annotated to it. The
62 * trace cpu also replays a fixed timestamp fetch trace that is also generated
63 * by the elastic trace probe. This trace cpu model aims at achieving faster
64 * simulation compared to the detailed cpu model and good correlation when the
65 * same trace is used for playback on different memory sub-systems.
66 *
67 * The TraceCPU inherits from BaseCPU so some virtual methods need to be
68 * defined. It has two port subclasses inherited from RequestPort for
69 * instruction and data ports. It issues the memory requests deducing the
70 * timing from the trace and without performing real execution of micro-ops. As
71 * soon as the last dependency for an instruction is complete, its
72 * computational delay, also provided in the input trace is added. The
73 * dependency-free nodes are maintained in a list, called 'ReadyList', ordered
74 * by ready time. Instructions which depend on load stall until the responses
75 * for read requests are received thus achieving elastic replay. If the
76 * dependency is not found when adding a new node, it is assumed complete.
77 * Thus, if this node is found to be completely dependency-free its issue time
78 * is calculated and it is added to the ready list immediately. This is
79 * encapsulated in the subclass ElasticDataGen.
80 *
81 * If ready nodes are issued in an unconstrained way there can be more nodes
82 * outstanding which results in divergence in timing compared to the O3CPU.
83 * Therefore, the Trace CPU also models hardware resources. A sub-class to
84 * model hardware resources contains the maximum sizes of load buffer, store
85 * buffer and ROB. If resources are not available, the node is not issued. Such
86 * nodes that are pending issue are held in the 'depFreeQueue' structure.
87 *
88 * Modeling the ROB size in the Trace CPU as a resource limitation is arguably
89 * the most important parameter of all resources. The ROB occupancy is
90 * estimated using the newly added field 'robNum'. We need to use ROB number as
91 * sequence number is at times much higher due to squashing and trace replay is
92 * focused on correct path modeling.
93 *
94 * A map called 'inFlightNodes' is added to track nodes that are not only in
95 * the readyList but also load nodes that are executed (and thus removed from
96 * readyList) but are not complete. ReadyList handles what and when to execute
97 * next node while the inFlightNodes is used for resource modelling. The oldest
98 * ROB number is updated when any node occupies the ROB or when an entry in the
99 * ROB is released. The ROB occupancy is equal to the difference in the ROB
100 * number of the newly dependency-free node and the oldest ROB number in
101 * flight.
102 *
103 * If no node depends on a non load/store node then there is no reason to
104 * track it in the dependency graph. We filter out such nodes but count them
105 * and add a weight field to the subsequent node that we do include in the
106 * trace. The weight field is used to model ROB occupancy during replay.
107 *
108 * The depFreeQueue is chosen to be FIFO so that child nodes which are in
109 * program order get pushed into it in that order and thus issued in program
110 * order, like in the O3CPU. This is also why the dependents is made a
111 * sequential container, std::set to std::vector. We only check head of the
112 * depFreeQueue as nodes are issued in order and blocking on head models that
113 * better than looping the entire queue. An alternative choice would be to
114 * inspect top N pending nodes where N is the issue-width. This is left for
115 * future as the timing correlation looks good as it is.
116 *
117 * At the start of an execution event, first we attempt to issue such pending
118 * nodes by checking if appropriate resources have become available. If yes, we
119 * compute the execute tick with respect to the time then. Then we proceed to
120 * complete nodes from the readyList.
121 *
122 * When a read response is received, sometimes a dependency on it that was
123 * supposed to be released when it was issued is still not released. This
124 * occurs because the dependent gets added to the graph after the read was
125 * sent. So the check is made less strict and the dependency is marked complete
126 * on read response instead of insisting that it should have been removed on
127 * read sent.
128 *
129 * There is a check for requests spanning two cache lines as this condition
130 * triggers an assert fail in the L1 cache. If it does then truncate the size
131 * to access only until the end of that line and ignore the remainder.
132 * Strictly-ordered requests are skipped and the dependencies on such requests
133 * are handled by simply marking them complete immediately.
134 *
135 * A CountedExitEvent that contains a static int belonging to the Trace CPU
136 * class as a down counter is used to implement multi Trace CPU simulation
137 * exit.
138 */
139
140 class TraceCPU : public BaseCPU
141 {
142
143 public:
    /** Construct the TraceCPU from its parameter struct. */
    TraceCPU(const TraceCPUParams &params);

    /**
     * Called at simulation start; reads the first messages from both input
     * traces and schedules the first fetch and data events accordingly.
     */
    void init();

    /**
     * This is a pure virtual function in BaseCPU. As we don't know how many
     * insts are in the trace but only know how many micro-ops are, we
     * cannot count this stat.
     *
     * @return 0
     */
    Counter totalInsts() const { return 0; }

    /**
     * Return totalOps as the number of committed micro-ops plus the
     * speculatively issued loads that are modelled in the TraceCPU replay.
     *
     * @return number of micro-ops i.e. nodes in the elastic data generator
     */
    Counter totalOps() const { return traceStats.numOps.value(); }

    /**
     * Set the no. of ops when the elastic data generator completes
     * executing a node.
     *
     * @param rob_num ROB number of the node that completed execution
     */
    void updateNumOps(uint64_t rob_num);

    /** Pure virtual function in BaseCPU. Do nothing. */
    void wakeup(ThreadID tid=0) { return; }

    /**
     * When resuming from checkpoint in FS mode, the TraceCPU takes over
     * from the old cpu. This function overrides the takeOverFrom() function
     * in the BaseCPU. It unbinds the ports of the old CPU and binds the
     * ports of the TraceCPU.
     *
     * @param oldCPU the CPU being replaced by this TraceCPU
     */
    void takeOverFrom(BaseCPU *oldCPU);

    /**
     * When instruction cache port receives a retry, schedule event
     * icacheNextEvent.
     */
    void icacheRetryRecvd();

    /**
     * When data cache port receives a retry, schedule event
     * dcacheNextEvent.
     */
    void dcacheRetryRecvd();

    /**
     * When data cache port receives a response, this calls the dcache
     * generator method handle to complete the load writeback.
     *
     * @param pkt Pointer to packet received
     */
    void dcacheRecvTimingResp(PacketPtr pkt);

    /**
     * Schedule event dcacheNextEvent at the given tick
     *
     * @param when Tick at which to schedule event
     */
    void schedDcacheNextEvent(Tick when);
208
209 protected:
210
    /**
     * IcachePort class that interfaces with L1 Instruction Cache.
     */
    class IcachePort : public RequestPort
    {
      public:
        /** Default constructor. */
        IcachePort(TraceCPU* _cpu) :
            RequestPort(_cpu->name() + ".icache_port", _cpu), owner(_cpu)
        {}

      public:
        /**
         * Receive the timing response and simply delete the packet since
         * instruction fetch requests are issued as per the timing in the
         * trace and responses are ignored.
         *
         * @param pkt Pointer to packet received
         * @return true
         */
        bool recvTimingResp(PacketPtr pkt);

        /**
         * Required functionally but do nothing.
         *
         * @param pkt Pointer to packet received
         */
        void recvTimingSnoopReq(PacketPtr pkt) {}

        /**
         * Handle a retry signalled by the cache if instruction read failed
         * in the first attempt.
         */
        void recvReqRetry();

      private:
        /** Back-pointer to the owning TraceCPU (not owned by this port). */
        TraceCPU* owner;
    };
249
    /**
     * DcachePort class that interfaces with L1 Data Cache.
     */
    class DcachePort : public RequestPort
    {

      public:
        /** Default constructor. */
        DcachePort(TraceCPU* _cpu) :
            RequestPort(_cpu->name() + ".dcache_port", _cpu), owner(_cpu)
        {}

      public:

        /**
         * Receive the timing response and call dcacheRecvTimingResp()
         * method of the dcacheGen to handle completing the load
         *
         * @param pkt Pointer to packet received
         * @return true
         */
        bool recvTimingResp(PacketPtr pkt);

        /**
         * Required functionally but do nothing.
         *
         * @param pkt Pointer to packet received
         */
        void recvTimingSnoopReq(PacketPtr pkt) {}

        /**
         * Required functionally but do nothing.
         *
         * @param pkt Pointer to packet received
         */
        void recvFunctionalSnoop(PacketPtr pkt) {}

        /**
         * Handle a retry signalled by the cache if data access failed in
         * the first attempt.
         */
        void recvReqRetry();

        /**
         * Required functionally.
         *
         * @return true since we have to snoop
         */
        bool isSnooping() const { return true; }

      private:
        /** Back-pointer to the owning TraceCPU (not owned by this port). */
        TraceCPU* owner;
    };
303
    /** Port to connect to L1 instruction cache. */
    IcachePort icachePort;

    /** Port to connect to L1 data cache. */
    DcachePort dcachePort;

    /** Requestor id for instruction read requests. */
    const RequestorID instRequestorID;

    /** Requestor id for data read and write requests. */
    const RequestorID dataRequestorID;

    /** File names for input instruction and data traces. */
    std::string instTraceFile, dataTraceFile;
318
319 /**
320 * Generator to read protobuf trace containing memory requests at fixed
321 * timestamps, perform flow control and issue memory requests. If L1 cache
322 * port sends packet succesfully, determine the tick to send the next
323 * packet else wait for retry from cache.
324 */
325 class FixedRetryGen
326 {
327
328 private:
329
330 /**
331 * This struct stores a line in the trace file.
332 */
333 struct TraceElement
334 {
335
336 /** Specifies if the request is to be a read or a write */
337 MemCmd cmd;
338
339 /** The address for the request */
340 Addr addr;
341
342 /** The size of the access for the request */
343 Addr blocksize;
344
345 /** The time at which the request should be sent */
346 Tick tick;
347
348 /** Potential request flags to use */
349 Request::FlagsType flags;
350
351 /** Instruction PC */
352 Addr pc;
353
354 /**
355 * Check validity of this element.
356 *
357 * @return if this element is valid
358 */
359 bool isValid() const { return cmd != MemCmd::InvalidCmd; }
360
361 /**
362 * Make this element invalid.
363 */
364 void clear() { cmd = MemCmd::InvalidCmd; }
365 };
366
367 /**
368 * The InputStream encapsulates a trace file and the
369 * internal buffers and populates TraceElements based on
370 * the input.
371 */
372 class InputStream
373 {
374 private:
375 // Input file stream for the protobuf trace
376 ProtoInputStream trace;
377
378 public:
379 /**
380 * Create a trace input stream for a given file name.
381 *
382 * @param filename Path to the file to read from
383 */
384 InputStream(const std::string& filename);
385
386 /**
387 * Reset the stream such that it can be played once
388 * again.
389 */
390 void reset();
391
392 /**
393 * Attempt to read a trace element from the stream,
394 * and also notify the caller if the end of the file
395 * was reached.
396 *
397 * @param element Trace element to populate
398 * @return True if an element could be read successfully
399 */
400 bool read(TraceElement* element);
401 };
402
403 public:
404 /* Constructor */
405 FixedRetryGen(TraceCPU& _owner, const std::string& _name,
406 RequestPort& _port, RequestorID requestor_id,
407 const std::string& trace_file) :
408 owner(_owner),
409 port(_port),
410 requestorId(requestor_id),
411 trace(trace_file),
412 genName(owner.name() + ".fixedretry." + _name),
413 retryPkt(nullptr),
414 delta(0),
415 traceComplete(false), fixedStats(&_owner, _name)
416 {
417 }
418
419 /**
420 * Called from TraceCPU init(). Reads the first message from the
421 * input trace file and returns the send tick.
422 *
423 * @return Tick when first packet must be sent
424 */
425 Tick init();
426
427 /**
428 * This tries to send current or retry packet and returns true if
429 * successfull. It calls nextExecute() to read next message.
430 *
431 * @return bool true if packet is sent successfully
432 */
433 bool tryNext();
434
435 /** Returns name of the FixedRetryGen instance. */
436 const std::string& name() const { return genName; }
437
438 /**
439 * Creates a new request assigning the request parameters passed by the
440 * arguments. Calls the port's sendTimingReq() and returns true if
441 * the packet was sent succesfully. It is called by tryNext()
442 *
443 * @param addr address of request
444 * @param size size of request
445 * @param cmd if it is a read or write request
446 * @param flags associated request flags
447 * @param pc instruction PC that generated the request
448 *
449 * @return true if packet was sent successfully
450 */
451 bool send(Addr addr, unsigned size, const MemCmd& cmd,
452 Request::FlagsType flags, Addr pc);
453
454 /** Exit the FixedRetryGen. */
455 void exit();
456
457 /**
458 * Reads a line of the trace file. Returns the tick
459 * when the next request should be generated. If the end
460 * of the file has been reached, it returns false.
461 *
462 * @return bool false id end of file has been reached
463 */
464 bool nextExecute();
465
466 /**
467 * Returns the traceComplete variable which is set when end of the
468 * input trace file is reached.
469 *
470 * @return bool true if traceComplete is set, false otherwise.
471 */
472 bool isTraceComplete() { return traceComplete; }
473
474 int64_t tickDelta() { return delta; }
475
476 private:
477 /** Reference of the TraceCPU. */
478 TraceCPU& owner;
479
480 /** Reference of the port to be used to issue memory requests. */
481 RequestPort& port;
482
483 /** RequestorID used for the requests being sent. */
484 const RequestorID requestorId;
485
486 /** Input stream used for reading the input trace file. */
487 InputStream trace;
488
489 /** String to store the name of the FixedRetryGen. */
490 std::string genName;
491
492 /** PacketPtr used to store the packet to retry. */
493 PacketPtr retryPkt;
494
495 /**
496 * Stores the difference in the send ticks of the current and last
497 * packets. Keeping this signed to check overflow to a negative value
498 * which will be caught by assert(delta > 0)
499 */
500 int64_t delta;
501
502 /**
503 * Set to true when end of trace is reached.
504 */
505 bool traceComplete;
506
507 /** Store an element read from the trace to send as the next packet. */
508 TraceElement currElement;
509 protected:
510 struct FixedRetryGenStatGroup : public Stats::Group
511 {
512 /** name is the extension to the name for these stats */
513 FixedRetryGenStatGroup(Stats::Group *parent,
514 const std::string& _name);
515 /** Stats for instruction accesses replayed. */
516 Stats::Scalar numSendAttempted;
517 Stats::Scalar numSendSucceeded;
518 Stats::Scalar numSendFailed;
519 Stats::Scalar numRetrySucceeded;
520 /** Last simulated tick by the FixedRetryGen */
521 Stats::Scalar instLastTick;
522 } fixedStats;
523
524 };
525
526 /**
527 * The elastic data memory request generator to read protobuf trace
528 * containing execution trace annotated with data and ordering
529 * dependencies. It deduces the time at which to send a load/store request
530 * by tracking the dependencies. It attempts to send a memory request for a
531 * load/store without performing real execution of micro-ops. If L1 cache
532 * port sends packet succesfully, the generator checks which instructions
533 * became dependency free as a result of this and schedules an event
534 * accordingly. If it fails to send the packet, it waits for a retry from
535 * the cache.
536 */
537 class ElasticDataGen
538 {
539 private:
540 /** Node sequence number type. */
541 typedef uint64_t NodeSeqNum;
542
543 /** Node ROB number type. */
544 typedef uint64_t NodeRobNum;
545
546 typedef ProtoMessage::InstDepRecord::RecordType RecordType;
547 typedef ProtoMessage::InstDepRecord Record;
548
        /**
         * The struct GraphNode stores an instruction in the trace file. The
         * format of the trace file favours constructing a dependency graph
         * of the execution and this struct is used to encapsulate the
         * request data as well as pointers to its dependent GraphNodes.
         */
        class GraphNode
        {
          public:
            /** Typedef for the list containing the ROB dependencies */
            typedef std::list<NodeSeqNum> RobDepList;

            /** Typedef for the list containing the register dependencies */
            typedef std::list<NodeSeqNum> RegDepList;

            /** Instruction sequence number */
            NodeSeqNum seqNum;

            /** ROB occupancy number */
            NodeRobNum robNum;

            /**
             * Type of the node corresponding to the instruction modeled by
             * it.
             */
            RecordType type;

            /** The address for the request if any */
            Addr physAddr;

            /** The virtual address for the request if any */
            Addr virtAddr;

            /** Size of request if any */
            uint32_t size;

            /** Request flags if any */
            Request::Flags flags;

            /** Instruction PC */
            Addr pc;

            /** List of order (ROB) dependencies. */
            RobDepList robDep;

            /** Computational delay */
            uint64_t compDelay;

            /** List of register dependencies (incoming) if any. */
            RegDepList regDep;

            /**
             * A vector of nodes dependent (outgoing) on this node. A
             * sequential container is chosen because when dependents become
             * free, they attempt to issue in program order.
             */
            std::vector<GraphNode *> dependents;

            /** Is the node a load */
            bool isLoad() const { return (type == Record::LOAD); }

            /** Is the node a store */
            bool isStore() const { return (type == Record::STORE); }

            /** Is the node a compute (non load/store) node */
            bool isComp() const { return (type == Record::COMP); }

            /** Remove completed instruction from register dependency list */
            bool removeRegDep(NodeSeqNum reg_dep);

            /** Remove completed instruction from order dependency list */
            bool removeRobDep(NodeSeqNum rob_dep);

            /** Check for all dependencies on completed inst */
            bool removeDepOnInst(NodeSeqNum done_seq_num);

            /** Return true if node has a request which is strictly ordered */
            bool
            isStrictlyOrdered() const
            {
                return (flags.isSet(Request::STRICT_ORDER));
            }
            /**
             * Write out element in trace-compatible format using debug flag
             * TraceCPUData.
             */
            void writeElementAsTrace() const;

            /** Return string specifying the type of the node */
            std::string typeToStr() const;
        };
643
        /** Struct to store a ready-to-execute node and its execution tick. */
        struct ReadyNode
        {
            /** The sequence number of the ready node */
            NodeSeqNum seqNum;

            /** The tick at which the ready node must be executed */
            Tick execTick;
        };
653
        /**
         * The HardwareResource class models structures that hold the
         * in-flight nodes. When a node becomes dependency free, first check
         * if resources are available to issue it.
         */
        class HardwareResource
        {
          public:
            /**
             * Constructor that initializes the sizes of the structures.
             *
             * @param max_rob size of the Reorder Buffer
             * @param max_stores size of Store Buffer
             * @param max_loads size of Load Buffer
             */
            HardwareResource(uint16_t max_rob, uint16_t max_stores,
                             uint16_t max_loads);

            /**
             * Occupy appropriate structures for an issued node.
             *
             * @param new_node pointer to the issued node
             */
            void occupy(const GraphNode* new_node);

            /**
             * Release appropriate structures for a completed node.
             *
             * @param done_node pointer to the completed node
             */
            void release(const GraphNode* done_node);

            /** Release store buffer entry for a completed store */
            void releaseStoreBuffer();

            /**
             * Check if structures required to issue a node are free.
             *
             * @param new_node pointer to the node ready to issue
             * @return true if resources are available
             */
            bool isAvailable(const GraphNode* new_node) const;

            /**
             * Check if there are any outstanding requests, i.e. requests
             * for which we are yet to receive a response.
             *
             * @return true if there is at least one read or write request
             *         outstanding
             */
            bool awaitingResponse() const;

            /** Print resource occupancy for debugging. */
            void printOccupancy();

          private:
            /**
             * The size of the ROB used to throttle the max. number of
             * in-flight nodes.
             */
            const uint16_t sizeROB;

            /**
             * The size of store buffer. This is used to throttle the max.
             * number of in-flight stores.
             */
            const uint16_t sizeStoreBuffer;

            /**
             * The size of load buffer. This is used to throttle the max.
             * number of in-flight loads.
             */
            const uint16_t sizeLoadBuffer;

            /**
             * A map from the sequence number to the ROB number of the in-
             * flight nodes. This includes all nodes that are in the
             * readyList plus the loads for which a request has been sent
             * which are not present in the readyList. But such loads are
             * not yet complete and thus occupy resources. We need to query
             * the oldest in-flight node and since a map container keeps all
             * its keys sorted using the less than criterion, the first
             * element is the in-flight node with the least sequence number,
             * i.e. the oldest in-flight node.
             */
            std::map<NodeSeqNum, NodeRobNum> inFlightNodes;

            /** The ROB number of the oldest in-flight node */
            NodeRobNum oldestInFlightRobNum;

            /** Number of ready loads for which request may or may not be
             * sent.
             */
            uint16_t numInFlightLoads;

            /** Number of ready stores for which request may or may not be
             * sent.
             */
            uint16_t numInFlightStores;
        };
753
        /**
         * The InputStream encapsulates a trace file and the
         * internal buffers and populates GraphNodes based on
         * the input.
         */
        class InputStream
        {
          private:
            /** Input file stream for the protobuf trace */
            ProtoInputStream trace;

            /**
             * A multiplier for the compute delays in the trace to modulate
             * the Trace CPU frequency either up or down. The Trace CPU's
             * clock domain frequency must also be set to match the expected
             * result of frequency scaling.
             */
            const double timeMultiplier;

            /** Count of committed ops read from trace plus the filtered ops */
            uint64_t microOpCount;

            /**
             * The window size that is read from the header of the protobuf
             * trace and used to process the dependency trace
             */
            uint32_t windowSize;

          public:
            /**
             * Create a trace input stream for a given file name.
             *
             * @param filename Path to the file to read from
             * @param time_multiplier used to scale the compute delays
             */
            InputStream(const std::string& filename,
                        const double time_multiplier);

            /**
             * Reset the stream such that it can be played once
             * again.
             */
            void reset();

            /**
             * Attempt to read a trace element from the stream,
             * and also notify the caller if the end of the file
             * was reached.
             *
             * @param element Trace element (graph node) to populate
             * @return True if an element could be read successfully
             */
            bool read(GraphNode* element);

            /** Get window size from trace */
            uint32_t getWindowSize() const { return windowSize; }

            /** Get number of micro-ops modelled in the TraceCPU replay */
            uint64_t getMicroOpCount() const { return microOpCount; }
        };
815
816 public:
        /**
         * Construct an elastic data generator.
         *
         * @param _owner TraceCPU that owns this generator
         * @param _name name extension for this generator and its stats
         * @param _port port used to issue the memory requests
         * @param requestor_id RequestorID set on the requests being sent
         * @param trace_file path to the protobuf dependency trace
         * @param params TraceCPU parameters providing freqMultiplier and
         *               the hardware resource sizes
         */
        ElasticDataGen(TraceCPU& _owner, const std::string& _name,
                       RequestPort& _port, RequestorID requestor_id,
                       const std::string& trace_file,
                       const TraceCPUParams &params) :
            owner(_owner),
            port(_port),
            requestorId(requestor_id),
            // Compute delays in the trace are scaled by the inverse of the
            // frequency multiplier to speed the replay up or down.
            trace(trace_file, 1.0 / params.freqMultiplier),
            genName(owner.name() + ".elastic." + _name),
            retryPkt(nullptr),
            traceComplete(false),
            nextRead(false),
            execComplete(false),
            windowSize(trace.getWindowSize()),
            hwResource(params.sizeROB, params.sizeStoreBuffer,
                       params.sizeLoadBuffer), elasticStats(&_owner, _name)
        {
            DPRINTF(TraceCPUData, "Window size in the trace is %d.\n",
                    windowSize);
        }
838
839 /**
840 * Called from TraceCPU init(). Reads the first message from the
841 * input trace file and returns the send tick.
842 *
843 * @return Tick when first packet must be sent
844 */
845 Tick init();
846
847 /**
848 * Adjust traceOffset based on what TraceCPU init() determines on
849 * comparing the offsets in the fetch request and elastic traces.
850 *
851 * @param trace_offset trace offset set by comparing both traces
852 */
853 void adjustInitTraceOffset(Tick& offset);
854
855 /** Returns name of the ElasticDataGen instance. */
856 const std::string& name() const { return genName; }
857
858 /** Exit the ElasticDataGen. */
859 void exit();
860
861 /**
862 * Reads a line of the trace file. Returns the tick when the next
863 * request should be generated. If the end of the file has been
864 * reached, it returns false.
865 *
866 * @return bool false if end of file has been reached else true
867 */
868 bool readNextWindow();
869
        /**
         * Iterate over the dependencies of a new node and add the new node
         * to the list of dependents of the parent node.
         *
         * @tparam T type of the dependency list (ROB or register), a
         *           container of NodeSeqNum
         * @param new_node new node to add to the graph
         * @param dep_list the dependency list that is to be iterated, and
         *                 may get modified
         */
        template<typename T>
        void addDepsOnParent(GraphNode *new_node, T& dep_list);
880
881 /**
882 * This is the main execute function which consumes nodes from the
883 * sorted readyList. First attempt to issue the pending dependency-free
884 * nodes held in the depFreeQueue. Insert the ready-to-issue nodes into
885 * the readyList. Then iterate through the readyList and when a node
886 * has its execute tick equal to curTick(), execute it. If the node is
887 * a load or a store call executeMemReq() and if it is neither, simply
888 * mark it complete.
889 */
890 void execute();
891
892 /**
893 * Creates a new request for a load or store assigning the request
894 * parameters. Calls the port's sendTimingReq() and returns a packet
895 * if the send failed so that it can be saved for a retry.
896 *
897 * @param node_ptr pointer to the load or store node to be executed
898 *
899 * @return packet pointer if the request failed and nullptr if it was
900 * sent successfully
901 */
902 PacketPtr executeMemReq(GraphNode* node_ptr);
903
904 /**
905 * Add a ready node to the readyList. When inserting, ensure the nodes
906 * are sorted in ascending order of their execute ticks.
907 *
908 * @param seq_num seq. num of ready node
909 * @param exec_tick the execute tick of the ready node
910 */
911 void addToSortedReadyList(NodeSeqNum seq_num, Tick exec_tick);
912
913 /** Print readyList for debugging using debug flag TraceCPUData. */
914 void printReadyList();
915
916 /**
917 * When a load writeback is received, that is when the load completes,
918 * release the dependents on it. This is called from the dcache port
919 * recvTimingResp().
920 */
921 void completeMemAccess(PacketPtr pkt);
922
923 /**
924 * Returns the execComplete variable which is set when the last
925 * node is executed.
926 *
927 * @return bool true if execComplete is set, false otherwise.
928 */
929 bool isExecComplete() const { return execComplete; }
930
931 /**
932 * Attempts to issue a node once the node's source dependencies are
933 * complete. If resources are available then add it to the readyList,
934 * otherwise the node is not issued and is stored in depFreeQueue
935 * until resources become available.
936 *
937 * @param node_ptr pointer to node to be issued
938 * @param first true if this is the first attempt to issue this node
939 * @return true if node was added to readyList
940 */
941 bool checkAndIssue(const GraphNode* node_ptr, bool first=true);
942
943 /** Get number of micro-ops modelled in the TraceCPU replay */
944 uint64_t getMicroOpCount() const { return trace.getMicroOpCount(); }
945
      private:
        /** Reference of the TraceCPU. */
        TraceCPU& owner;

        /** Reference of the port to be used to issue memory requests. */
        RequestPort& port;

        /** RequestorID used for the requests being sent. */
        const RequestorID requestorId;

        /** Input stream used for reading the input trace file. */
        InputStream trace;

        /** String to store the name of the ElasticDataGen. */
        std::string genName;

        /** PacketPtr used to store the packet to retry. */
        PacketPtr retryPkt;

        /** Set to true when end of trace is reached. */
        bool traceComplete;

        /** Set to true when the next window of instructions need to be read */
        bool nextRead;

        /** Set true when execution of trace is complete */
        bool execComplete;

        /**
         * Window size within which to check for dependencies. Its value is
         * made equal to the window size used to generate the trace which is
         * recorded in the trace header. The dependency graph must be
         * populated enough such that when a node completes, its potential
         * child node must be found and the dependency removed before the
         * completed node itself is removed. Thus as soon as the graph
         * shrinks to become smaller than this window, we read in the next
         * window.
         */
        const uint32_t windowSize;

        /**
         * Hardware resources required to contain in-flight nodes and to
         * throttle issuing of new nodes when resources are not available.
         */
        HardwareResource hwResource;

        /** Store the depGraph of GraphNodes */
        std::unordered_map<NodeSeqNum, GraphNode*> depGraph;

        /**
         * Queue of dependency-free nodes that are pending issue because
         * resources are not available. This is chosen to be FIFO so that
         * dependent nodes which become free in program order get pushed
         * into the queue in that order. Thus nodes are more likely to
         * issue in program order.
         */
        std::queue<const GraphNode*> depFreeQueue;

        /** List of nodes that are ready to execute */
        std::list<ReadyNode> readyList;
1005
      protected:
        /** Stat group collecting the ElasticDataGen replay statistics. */
        struct ElasticDataGenStatGroup : public Stats::Group
        {
            /** name is the extension to the name for these stats */
            ElasticDataGenStatGroup(Stats::Group *parent,
                                    const std::string& _name);
            /** Stats for data memory accesses replayed. */
            /** Largest dependent count observed on any single node. */
            Stats::Scalar maxDependents;
            /** Largest size the readyList reached during replay. */
            Stats::Scalar maxReadyListSize;
            /** Counters for packet send attempts and their outcomes. */
            Stats::Scalar numSendAttempted;
            Stats::Scalar numSendSucceeded;
            Stats::Scalar numSendFailed;
            Stats::Scalar numRetrySucceeded;
            /** Number of requests split across a boundary. */
            Stats::Scalar numSplitReqs;
            // NOTE(review): 'SO' presumably means strictly-ordered
            // loads/stores — confirm against the stat registration.
            Stats::Scalar numSOLoads;
            Stats::Scalar numSOStores;
            /** Tick when ElasticDataGen completes execution */
            Stats::Scalar dataLastTick;
        } elasticStats;
    };
1026 };
1027
    /** Instance of FixedRetryGen to replay instruction read requests. */
    FixedRetryGen icacheGen;

    /** Instance of ElasticDataGen to replay data read and write requests. */
    ElasticDataGen dcacheGen;

    /**
     * This is the control flow that uses the functionality of the icacheGen to
     * replay the trace. It calls tryNext(). If it returns true then next event
     * is scheduled at curTick() plus delta. If it returns false then delta is
     * ignored and control is brought back via recvRetry().
     */
    void schedIcacheNext();

    /**
     * This is the control flow that uses the functionality of the dcacheGen to
     * replay the trace. It calls execute(). It checks if execution is complete
     * and schedules an event to exit simulation accordingly.
     */
    void schedDcacheNext();

    /** Event for the control flow method schedIcacheNext() */
    EventFunctionWrapper icacheNextEvent;

    /** Event for the control flow method schedDcacheNext() */
    EventFunctionWrapper dcacheNextEvent;

    /**
     * This is called when either generator finishes executing from the
     * trace.
     */
    void checkAndSchedExitEvent();

    /** Set to true when one of the generators finishes replaying its trace. */
    bool oneTraceComplete;

    /**
     * This stores the time offset in the trace, which is taken away from
     * the ready times of requests. This is specially useful because the time
     * offset can be very large if the traces are generated from the middle of
     * a program.
     */
    Tick traceOffset;

    /**
     * Number of Trace CPUs in the system used as a shared variable and passed
     * to the CountedExitEvent event used for counting down exit events. It is
     * incremented in the constructor call so that the total is arrived at
     * automatically.
     */
    static int numTraceCPUs;

    /**
     * A CountedExitEvent which when serviced decrements the counter. A sim
     * exit event is scheduled when the counter equals zero, that is all
     * instances of Trace CPU have had their execCompleteEvent serviced.
     */
    CountedExitEvent *execCompleteEvent;

    /**
     * Exit when any one Trace CPU completes its execution. If this is
     * configured true then the execCompleteEvent is not scheduled.
     */
    const bool enableEarlyExit;

    /**
     * Interval of committed instructions specified by the user at which a
     * progress info message is printed
     */
    const uint64_t progressMsgInterval;

    /**
     * The progress msg threshold is kept updated to the next multiple of the
     * progress msg interval. As soon as the threshold is reached, an info
     * message is printed.
     */
    uint64_t progressMsgThreshold;
    /** Stat group collecting the TraceCPU top-level statistics. */
    struct TraceStats : public Stats::Group
    {
        /** @param trace parent TraceCPU the stats are registered under */
        TraceStats(TraceCPU *trace);
        /** Counts of dcache/icache replay events scheduled. */
        Stats::Scalar numSchedDcacheEvent;
        Stats::Scalar numSchedIcacheEvent;

        /** Stat for number of simulated micro-ops. */
        Stats::Scalar numOps;
        /** Stat for the CPI. This is really cycles per
         * micro-op and not inst. */
        Stats::Formula cpi;
    } traceStats;
1117
  public:

    /** Used to get a reference to the icache port. */
    Port &getInstPort() { return icachePort; }

    /** Used to get a reference to the dcache port. */
    Port &getDataPort() { return dcachePort; }
1125
1126 };
1127 #endif // __CPU_TRACE_TRACE_CPU_HH__