2 * Copyright (c) 1999-2008 Mark D. Hill and David A. Wood
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are
7 * met: redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer;
9 * redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution;
12 * neither the name of the copyright holders nor the names of its
13 * contributors may be used to endorse or promote products derived from
14 * this software without specific prior written permission.
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 This file has been modified by Kevin Moore and Dan Nussbaum of the
31 Scalable Systems Research Group at Sun Microsystems Laboratories
32 (http://research.sun.com/scalable/) to support the Adaptive
33 Transactional Memory Test Platform (ATMTP).
35 Please send email to atmtp-interest@sun.com with feedback, questions, or
36 to request future announcements about ATMTP.
38 ----------------------------------------------------------------------
40 File modification date: 2008-02-23
42 ----------------------------------------------------------------------
45 // Allows use of times() library call, which determines virtual runtime
46 #include <sys/resource.h>
47 #include <sys/times.h>
51 #include "base/stl_helpers.hh"
52 #include "base/str.hh"
53 #include "mem/gems_common/PrioHeap.hh"
54 #include "mem/protocol/CacheMsg.hh"
55 #include "mem/protocol/MachineType.hh"
56 #include "mem/protocol/Protocol.hh"
57 #include "mem/ruby/common/Debug.hh"
58 #include "mem/ruby/network/Network.hh"
59 #include "mem/ruby/profiler/AddressProfiler.hh"
60 #include "mem/ruby/profiler/Profiler.hh"
61 #include "mem/ruby/system/System.hh"
62 #include "mem/ruby/system/System.hh"
65 using m5::stl_helpers::operator<<;
67 extern ostream
* debug_cout_ptr
;
69 static double process_memory_total();
70 static double process_memory_resident();
72 Profiler::Profiler(const Params
*p
)
75 m_inst_profiler_ptr
= NULL
;
76 m_address_profiler_ptr
= NULL
;
78 m_real_time_start_time
= time(NULL
); // Not reset in clearStats()
79 m_stats_period
= 1000000; // Default
80 m_periodic_output_file_ptr
= &cerr
;
82 m_hot_lines
= p
->hot_lines
;
83 m_all_instructions
= p
->all_instructions
;
85 m_num_of_sequencers
= p
->num_of_sequencers
;
88 m_all_instructions
= false;
90 m_address_profiler_ptr
= new AddressProfiler(m_num_of_sequencers
);
91 m_address_profiler_ptr
->setHotLines(m_hot_lines
);
92 m_address_profiler_ptr
->setAllInstructions(m_all_instructions
);
94 if (m_all_instructions
) {
95 m_inst_profiler_ptr
= new AddressProfiler(m_num_of_sequencers
);
96 m_inst_profiler_ptr
->setHotLines(m_hot_lines
);
97 m_inst_profiler_ptr
->setAllInstructions(m_all_instructions
);
101 Profiler::~Profiler()
103 if (m_periodic_output_file_ptr
!= &cerr
) {
104 delete m_periodic_output_file_ptr
;
111 // FIXME - avoid the repeated code
113 vector
<integer_t
> perProcCycleCount(m_num_of_sequencers
);
115 for (int i
= 0; i
< m_num_of_sequencers
; i
++) {
116 perProcCycleCount
[i
] =
117 g_system_ptr
->getCycleCount(i
) - m_cycles_executed_at_start
[i
] + 1;
118 // The +1 allows us to avoid division by zero
121 ostream
&out
= *m_periodic_output_file_ptr
;
123 out
<< "ruby_cycles: " << g_eventQueue_ptr
->getTime()-m_ruby_start
<< endl
124 << "mbytes_resident: " << process_memory_resident() << endl
125 << "mbytes_total: " << process_memory_total() << endl
;
127 if (process_memory_total() > 0) {
128 out
<< "resident_ratio: "
129 << process_memory_resident() / process_memory_total() << endl
;
132 out
<< "miss_latency: " << m_allMissLatencyHistogram
<< endl
;
136 if (m_all_instructions
) {
137 m_inst_profiler_ptr
->printStats(out
);
140 //g_system_ptr->getNetwork()->printStats(out);
141 g_eventQueue_ptr
->scheduleEvent(this, m_stats_period
);
145 Profiler::setPeriodicStatsFile(const string
& filename
)
147 cout
<< "Recording periodic statistics to file '" << filename
<< "' every "
148 << m_stats_period
<< " Ruby cycles" << endl
;
150 if (m_periodic_output_file_ptr
!= &cerr
) {
151 delete m_periodic_output_file_ptr
;
154 m_periodic_output_file_ptr
= new ofstream(filename
.c_str());
155 g_eventQueue_ptr
->scheduleEvent(this, 1);
159 Profiler::setPeriodicStatsInterval(integer_t period
)
161 cout
<< "Recording periodic statistics every " << m_stats_period
162 << " Ruby cycles" << endl
;
164 m_stats_period
= period
;
165 g_eventQueue_ptr
->scheduleEvent(this, 1);
169 Profiler::printConfig(ostream
& out
) const
172 out
<< "Profiler Configuration" << endl
;
173 out
<< "----------------------" << endl
;
174 out
<< "periodic_stats_period: " << m_stats_period
<< endl
;
178 Profiler::print(ostream
& out
) const
184 Profiler::printStats(ostream
& out
, bool short_stats
)
190 out
<< "Profiler Stats" << endl
;
191 out
<< "--------------" << endl
;
193 time_t real_time_current
= time(NULL
);
194 double seconds
= difftime(real_time_current
, m_real_time_start_time
);
195 double minutes
= seconds
/ 60.0;
196 double hours
= minutes
/ 60.0;
197 double days
= hours
/ 24.0;
198 Time ruby_cycles
= g_eventQueue_ptr
->getTime()-m_ruby_start
;
201 out
<< "Elapsed_time_in_seconds: " << seconds
<< endl
;
202 out
<< "Elapsed_time_in_minutes: " << minutes
<< endl
;
203 out
<< "Elapsed_time_in_hours: " << hours
<< endl
;
204 out
<< "Elapsed_time_in_days: " << days
<< endl
;
208 // print the virtual runtimes as well
211 seconds
= (vtime
.tms_utime
+ vtime
.tms_stime
) / 100.0;
212 minutes
= seconds
/ 60.0;
213 hours
= minutes
/ 60.0;
215 out
<< "Virtual_time_in_seconds: " << seconds
<< endl
;
216 out
<< "Virtual_time_in_minutes: " << minutes
<< endl
;
217 out
<< "Virtual_time_in_hours: " << hours
<< endl
;
218 out
<< "Virtual_time_in_days: " << days
<< endl
;
221 out
<< "Ruby_current_time: " << g_eventQueue_ptr
->getTime() << endl
;
222 out
<< "Ruby_start_time: " << m_ruby_start
<< endl
;
223 out
<< "Ruby_cycles: " << ruby_cycles
<< endl
;
227 out
<< "mbytes_resident: " << process_memory_resident() << endl
;
228 out
<< "mbytes_total: " << process_memory_total() << endl
;
229 if (process_memory_total() > 0) {
230 out
<< "resident_ratio: "
231 << process_memory_resident()/process_memory_total() << endl
;
236 vector
<integer_t
> perProcCycleCount(m_num_of_sequencers
);
238 for (int i
= 0; i
< m_num_of_sequencers
; i
++) {
239 perProcCycleCount
[i
] =
240 g_system_ptr
->getCycleCount(i
) - m_cycles_executed_at_start
[i
] + 1;
241 // The +1 allows us to avoid division by zero
244 out
<< "ruby_cycles_executed: " << perProcCycleCount
<< endl
;
249 out
<< "Busy Controller Counts:" << endl
;
250 for (int i
= 0; i
< MachineType_NUM
; i
++) {
251 int size
= MachineType_base_count((MachineType
)i
);
252 for (int j
= 0; j
< size
; j
++) {
254 machID
.type
= (MachineType
)i
;
256 out
<< machID
<< ":" << m_busyControllerCount
[i
][j
] << " ";
257 if ((j
+ 1) % 8 == 0) {
265 out
<< "Busy Bank Count:" << m_busyBankCount
<< endl
;
268 out
<< "sequencer_requests_outstanding: "
269 << m_sequencer_requests
<< endl
;
274 out
<< "All Non-Zero Cycle Demand Cache Accesses" << endl
;
275 out
<< "----------------------------------------" << endl
;
276 out
<< "miss_latency: " << m_allMissLatencyHistogram
<< endl
;
277 for (int i
= 0; i
< m_missLatencyHistograms
.size(); i
++) {
278 if (m_missLatencyHistograms
[i
].size() > 0) {
279 out
<< "miss_latency_" << RubyRequestType(i
) << ": "
280 << m_missLatencyHistograms
[i
] << endl
;
283 for (int i
= 0; i
< m_machLatencyHistograms
.size(); i
++) {
284 if (m_machLatencyHistograms
[i
].size() > 0) {
285 out
<< "miss_latency_" << GenericMachineType(i
) << ": "
286 << m_machLatencyHistograms
[i
] << endl
;
292 out
<< "All Non-Zero Cycle SW Prefetch Requests" << endl
;
293 out
<< "------------------------------------" << endl
;
294 out
<< "prefetch_latency: " << m_allSWPrefetchLatencyHistogram
<< endl
;
295 for (int i
= 0; i
< m_SWPrefetchLatencyHistograms
.size(); i
++) {
296 if (m_SWPrefetchLatencyHistograms
[i
].size() > 0) {
297 out
<< "prefetch_latency_" << CacheRequestType(i
) << ": "
298 << m_SWPrefetchLatencyHistograms
[i
] << endl
;
301 for (int i
= 0; i
< m_SWPrefetchMachLatencyHistograms
.size(); i
++) {
302 if (m_SWPrefetchMachLatencyHistograms
[i
].size() > 0) {
303 out
<< "prefetch_latency_" << GenericMachineType(i
) << ": "
304 << m_SWPrefetchMachLatencyHistograms
[i
] << endl
;
307 out
<< "prefetch_latency_L2Miss:"
308 << m_SWPrefetchL2MissLatencyHistogram
<< endl
;
310 if (m_all_sharing_histogram
.size() > 0) {
311 out
<< "all_sharing: " << m_all_sharing_histogram
<< endl
;
312 out
<< "read_sharing: " << m_read_sharing_histogram
<< endl
;
313 out
<< "write_sharing: " << m_write_sharing_histogram
<< endl
;
315 out
<< "all_sharing_percent: ";
316 m_all_sharing_histogram
.printPercent(out
);
319 out
<< "read_sharing_percent: ";
320 m_read_sharing_histogram
.printPercent(out
);
323 out
<< "write_sharing_percent: ";
324 m_write_sharing_histogram
.printPercent(out
);
327 int64 total_miss
= m_cache_to_cache
+ m_memory_to_cache
;
328 out
<< "all_misses: " << total_miss
<< endl
;
329 out
<< "cache_to_cache_misses: " << m_cache_to_cache
<< endl
;
330 out
<< "memory_to_cache_misses: " << m_memory_to_cache
<< endl
;
331 out
<< "cache_to_cache_percent: "
332 << 100.0 * (double(m_cache_to_cache
) / double(total_miss
))
334 out
<< "memory_to_cache_percent: "
335 << 100.0 * (double(m_memory_to_cache
) / double(total_miss
))
340 if (m_outstanding_requests
.size() > 0) {
341 out
<< "outstanding_requests: ";
342 m_outstanding_requests
.printPercent(out
);
349 out
<< "Request vs. RubySystem State Profile" << endl
;
350 out
<< "--------------------------------" << endl
;
353 map
<string
, int>::const_iterator i
= m_requestProfileMap
.begin();
354 map
<string
, int>::const_iterator end
= m_requestProfileMap
.end();
355 for (; i
!= end
; ++i
) {
356 const string
&key
= i
->first
;
357 int count
= i
->second
;
359 double percent
= (100.0 * double(count
)) / double(m_requests
);
360 vector
<string
> items
;
361 tokenize(items
, key
, ':');
362 vector
<string
>::iterator j
= items
.begin();
363 vector
<string
>::iterator end
= items
.end();
364 for (; j
!= end
; ++i
)
365 out
<< setw(10) << *j
;
366 out
<< setw(11) << count
;
367 out
<< setw(14) << percent
<< endl
;
371 out
<< "filter_action: " << m_filter_action_histogram
<< endl
;
373 if (!m_all_instructions
) {
374 m_address_profiler_ptr
->printStats(out
);
377 if (m_all_instructions
) {
378 m_inst_profiler_ptr
->printStats(out
);
382 out
<< "Message Delayed Cycles" << endl
;
383 out
<< "----------------------" << endl
;
384 out
<< "Total_delay_cycles: " << m_delayedCyclesHistogram
<< endl
;
385 out
<< "Total_nonPF_delay_cycles: "
386 << m_delayedCyclesNonPFHistogram
<< endl
;
387 for (int i
= 0; i
< m_delayedCyclesVCHistograms
.size(); i
++) {
388 out
<< " virtual_network_" << i
<< "_delay_cycles: "
389 << m_delayedCyclesVCHistograms
[i
] << endl
;
392 printResourceUsage(out
);
397 Profiler::printResourceUsage(ostream
& out
) const
400 out
<< "Resource Usage" << endl
;
401 out
<< "--------------" << endl
;
403 integer_t pagesize
= getpagesize(); // page size in bytes
404 out
<< "page_size: " << pagesize
<< endl
;
407 getrusage (RUSAGE_SELF
, &usage
);
409 out
<< "user_time: " << usage
.ru_utime
.tv_sec
<< endl
;
410 out
<< "system_time: " << usage
.ru_stime
.tv_sec
<< endl
;
411 out
<< "page_reclaims: " << usage
.ru_minflt
<< endl
;
412 out
<< "page_faults: " << usage
.ru_majflt
<< endl
;
413 out
<< "swaps: " << usage
.ru_nswap
<< endl
;
414 out
<< "block_inputs: " << usage
.ru_inblock
<< endl
;
415 out
<< "block_outputs: " << usage
.ru_oublock
<< endl
;
419 Profiler::clearStats()
421 m_ruby_start
= g_eventQueue_ptr
->getTime();
423 m_cycles_executed_at_start
.resize(m_num_of_sequencers
);
424 for (int i
= 0; i
< m_num_of_sequencers
; i
++) {
425 if (g_system_ptr
== NULL
) {
426 m_cycles_executed_at_start
[i
] = 0;
428 m_cycles_executed_at_start
[i
] = g_system_ptr
->getCycleCount(i
);
432 m_busyControllerCount
.resize(MachineType_NUM
); // all machines
433 for (int i
= 0; i
< MachineType_NUM
; i
++) {
434 int size
= MachineType_base_count((MachineType
)i
);
435 m_busyControllerCount
[i
].resize(size
);
436 for (int j
= 0; j
< size
; j
++) {
437 m_busyControllerCount
[i
][j
] = 0;
442 m_delayedCyclesHistogram
.clear();
443 m_delayedCyclesNonPFHistogram
.clear();
444 int size
= RubySystem::getNetwork()->getNumberOfVirtualNetworks();
445 m_delayedCyclesVCHistograms
.resize(size
);
446 for (int i
= 0; i
< size
; i
++) {
447 m_delayedCyclesVCHistograms
[i
].clear();
450 m_missLatencyHistograms
.resize(RubyRequestType_NUM
);
451 for (int i
= 0; i
< m_missLatencyHistograms
.size(); i
++) {
452 m_missLatencyHistograms
[i
].clear(200);
454 m_machLatencyHistograms
.resize(GenericMachineType_NUM
+1);
455 for (int i
= 0; i
< m_machLatencyHistograms
.size(); i
++) {
456 m_machLatencyHistograms
[i
].clear(200);
458 m_allMissLatencyHistogram
.clear(200);
460 m_SWPrefetchLatencyHistograms
.resize(CacheRequestType_NUM
);
461 for (int i
= 0; i
< m_SWPrefetchLatencyHistograms
.size(); i
++) {
462 m_SWPrefetchLatencyHistograms
[i
].clear(200);
464 m_SWPrefetchMachLatencyHistograms
.resize(GenericMachineType_NUM
+1);
465 for (int i
= 0; i
< m_SWPrefetchMachLatencyHistograms
.size(); i
++) {
466 m_SWPrefetchMachLatencyHistograms
[i
].clear(200);
468 m_allSWPrefetchLatencyHistogram
.clear(200);
470 m_sequencer_requests
.clear();
471 m_read_sharing_histogram
.clear();
472 m_write_sharing_histogram
.clear();
473 m_all_sharing_histogram
.clear();
474 m_cache_to_cache
= 0;
475 m_memory_to_cache
= 0;
478 m_requestProfileMap
.clear();
480 // count requests profiled
483 m_outstanding_requests
.clear();
484 m_outstanding_persistent_requests
.clear();
486 // Flush the prefetches through the system - used so that there
487 // are no outstanding requests after stats are cleared
488 //g_eventQueue_ptr->triggerAllEvents();
490 // update the start time
491 m_ruby_start
= g_eventQueue_ptr
->getTime();
495 Profiler::addAddressTraceSample(const CacheMsg
& msg
, NodeID id
)
497 if (msg
.getType() != CacheRequestType_IFETCH
) {
498 // Note: The following line should be commented out if you
499 // want to use the special profiling that is part of the GS320
502 // NOTE: Unless PROFILE_HOT_LINES is enabled, nothing will be
503 // profiled by the AddressProfiler
504 m_address_profiler_ptr
->
505 addTraceSample(msg
.getLineAddress(), msg
.getProgramCounter(),
506 msg
.getType(), msg
.getAccessMode(), id
, false);
511 Profiler::profileSharing(const Address
& addr
, AccessType type
,
512 NodeID requestor
, const Set
& sharers
,
515 Set
set_contacted(owner
);
516 if (type
== AccessType_Write
) {
517 set_contacted
.addSet(sharers
);
519 set_contacted
.remove(requestor
);
520 int number_contacted
= set_contacted
.count();
522 if (type
== AccessType_Write
) {
523 m_write_sharing_histogram
.add(number_contacted
);
525 m_read_sharing_histogram
.add(number_contacted
);
527 m_all_sharing_histogram
.add(number_contacted
);
529 if (number_contacted
== 0) {
537 Profiler::profileMsgDelay(int virtualNetwork
, int delayCycles
)
539 assert(virtualNetwork
< m_delayedCyclesVCHistograms
.size());
540 m_delayedCyclesHistogram
.add(delayCycles
);
541 m_delayedCyclesVCHistograms
[virtualNetwork
].add(delayCycles
);
542 if (virtualNetwork
!= 0) {
543 m_delayedCyclesNonPFHistogram
.add(delayCycles
);
547 // profiles original cache requests including PUTs
549 Profiler::profileRequest(const string
& requestStr
)
553 // if it doesn't exist, conveniently, it will be created with the
554 // default value which is 0
555 m_requestProfileMap
[requestStr
]++;
559 Profiler::controllerBusy(MachineID machID
)
561 m_busyControllerCount
[(int)machID
.type
][(int)machID
.num
]++;
565 Profiler::profilePFWait(Time waitTime
)
567 m_prefetchWaitHistogram
.add(waitTime
);
576 // non-zero cycle demand request
578 Profiler::missLatency(Time t
, RubyRequestType type
)
580 m_allMissLatencyHistogram
.add(t
);
581 m_missLatencyHistograms
[type
].add(t
);
584 // non-zero cycle prefetch request
586 Profiler::swPrefetchLatency(Time t
, CacheRequestType type
,
587 GenericMachineType respondingMach
)
589 m_allSWPrefetchLatencyHistogram
.add(t
);
590 m_SWPrefetchLatencyHistograms
[type
].add(t
);
591 m_SWPrefetchMachLatencyHistograms
[respondingMach
].add(t
);
592 if (respondingMach
== GenericMachineType_Directory
||
593 respondingMach
== GenericMachineType_NUM
) {
594 m_SWPrefetchL2MissLatencyHistogram
.add(t
);
599 Profiler::profileTransition(const string
& component
, NodeID version
,
600 Address addr
, const string
& state
, const string
& event
,
601 const string
& next_state
, const string
& note
)
603 const int EVENT_SPACES
= 20;
604 const int ID_SPACES
= 3;
605 const int TIME_SPACES
= 7;
606 const int COMP_SPACES
= 10;
607 const int STATE_SPACES
= 6;
609 if (g_debug_ptr
->getDebugTime() <= 0 ||
610 g_eventQueue_ptr
->getTime() < g_debug_ptr
->getDebugTime())
613 ostream
&out
= *debug_cout_ptr
;
614 out
.flags(ios::right
);
615 out
<< setw(TIME_SPACES
) << g_eventQueue_ptr
->getTime() << " ";
616 out
<< setw(ID_SPACES
) << version
<< " ";
617 out
<< setw(COMP_SPACES
) << component
;
618 out
<< setw(EVENT_SPACES
) << event
<< " ";
620 out
.flags(ios::right
);
621 out
<< setw(STATE_SPACES
) << state
;
623 out
.flags(ios::left
);
624 out
<< setw(STATE_SPACES
) << next_state
;
626 out
<< " " << addr
<< " " << note
;
633 process_memory_total()
635 // 4kB page size, 1024*1024 bytes per MB,
636 const double MULTIPLIER
= 4096.0 / (1024.0 * 1024.0);
638 proc_file
.open("/proc/self/statm");
639 int total_size_in_pages
= 0;
640 int res_size_in_pages
= 0;
641 proc_file
>> total_size_in_pages
;
642 proc_file
>> res_size_in_pages
;
643 return double(total_size_in_pages
) * MULTIPLIER
; // size in megabytes
647 process_memory_resident()
649 // 4kB page size, 1024*1024 bytes per MB,
650 const double MULTIPLIER
= 4096.0 / (1024.0 * 1024.0);
652 proc_file
.open("/proc/self/statm");
653 int total_size_in_pages
= 0;
654 int res_size_in_pages
= 0;
655 proc_file
>> total_size_in_pages
;
656 proc_file
>> res_size_in_pages
;
657 return double(res_size_in_pages
) * MULTIPLIER
; // size in megabytes
661 Profiler::rubyWatch(int id
)
664 Address watch_address
= Address(tr
);
665 const int ID_SPACES
= 3;
666 const int TIME_SPACES
= 7;
668 ostream
&out
= *debug_cout_ptr
;
670 out
.flags(ios::right
);
671 out
<< setw(TIME_SPACES
) << g_eventQueue_ptr
->getTime() << " ";
672 out
<< setw(ID_SPACES
) << id
<< " "
673 << "RUBY WATCH " << watch_address
<< endl
;
675 // don't care about success or failure
676 m_watch_address_set
.insert(watch_address
);
680 Profiler::watchAddress(Address addr
)
682 return m_watch_address_set
.count(addr
) > 0;
686 RubyProfilerParams::create()
688 return new Profiler(this);