2 * Copyright (c) 1999-2008 Mark D. Hill and David A. Wood
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are
7 * met: redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer;
9 * redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution;
12 * neither the name of the copyright holders nor the names of its
13 * contributors may be used to endorse or promote products derived from
14 * this software without specific prior written permission.
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 This file has been modified by Kevin Moore and Dan Nussbaum of the
31 Scalable Systems Research Group at Sun Microsystems Laboratories
32 (http://research.sun.com/scalable/) to support the Adaptive
33 Transactional Memory Test Platform (ATMTP).
35 Please send email to atmtp-interest@sun.com with feedback, questions, or
36 to request future announcements about ATMTP.
38 ----------------------------------------------------------------------
40 File modification date: 2008-02-23
42 ----------------------------------------------------------------------
45 // Allows use of times() library call, which determines virtual runtime
46 #include <sys/resource.h>
47 #include <sys/times.h>
49 #include "mem/gems_common/Map.hh"
50 #include "mem/gems_common/PrioHeap.hh"
51 #include "mem/gems_common/util.hh"
52 #include "mem/protocol/CacheMsg.hh"
53 #include "mem/protocol/MachineType.hh"
54 #include "mem/protocol/Protocol.hh"
55 #include "mem/ruby/common/Debug.hh"
56 #include "mem/ruby/network/Network.hh"
57 #include "mem/ruby/profiler/AddressProfiler.hh"
58 #include "mem/ruby/profiler/Profiler.hh"
59 #include "mem/ruby/system/System.hh"
60 #include "mem/ruby/system/System.hh"
62 extern std::ostream
* debug_cout_ptr
;
64 static double process_memory_total();
65 static double process_memory_resident();
67 Profiler::Profiler(const Params
*p
)
70 m_requestProfileMap_ptr
= new Map
<string
, int>;
72 m_inst_profiler_ptr
= NULL
;
73 m_address_profiler_ptr
= NULL
;
75 m_real_time_start_time
= time(NULL
); // Not reset in clearStats()
76 m_stats_period
= 1000000; // Default
77 m_periodic_output_file_ptr
= &cerr
;
79 m_hot_lines
= p
->hot_lines
;
80 m_all_instructions
= p
->all_instructions
;
82 m_num_of_sequencers
= p
->num_of_sequencers
;
85 m_all_instructions
= false;
87 m_address_profiler_ptr
= new AddressProfiler(m_num_of_sequencers
);
88 m_address_profiler_ptr
->setHotLines(m_hot_lines
);
89 m_address_profiler_ptr
->setAllInstructions(m_all_instructions
);
91 if (m_all_instructions
) {
92 m_inst_profiler_ptr
= new AddressProfiler(m_num_of_sequencers
);
93 m_inst_profiler_ptr
->setHotLines(m_hot_lines
);
94 m_inst_profiler_ptr
->setAllInstructions(m_all_instructions
);
100 if (m_periodic_output_file_ptr
!= &cerr
) {
101 delete m_periodic_output_file_ptr
;
104 delete m_requestProfileMap_ptr
;
110 // FIXME - avoid the repeated code
112 Vector
<integer_t
> perProcCycleCount
;
113 perProcCycleCount
.setSize(m_num_of_sequencers
);
115 for (int i
= 0; i
< m_num_of_sequencers
; i
++) {
116 perProcCycleCount
[i
] =
117 g_system_ptr
->getCycleCount(i
) - m_cycles_executed_at_start
[i
] + 1;
118 // The +1 allows us to avoid division by zero
121 ostream
&out
= *m_periodic_output_file_ptr
;
123 out
<< "ruby_cycles: " << g_eventQueue_ptr
->getTime()-m_ruby_start
<< endl
124 << "mbytes_resident: " << process_memory_resident() << endl
125 << "mbytes_total: " << process_memory_total() << endl
;
127 if (process_memory_total() > 0) {
128 out
<< "resident_ratio: "
129 << process_memory_resident() / process_memory_total() << endl
;
132 out
<< "miss_latency: " << m_allMissLatencyHistogram
<< endl
;
136 if (m_all_instructions
) {
137 m_inst_profiler_ptr
->printStats(out
);
140 //g_system_ptr->getNetwork()->printStats(out);
141 g_eventQueue_ptr
->scheduleEvent(this, m_stats_period
);
145 Profiler::setPeriodicStatsFile(const string
& filename
)
147 cout
<< "Recording periodic statistics to file '" << filename
<< "' every "
148 << m_stats_period
<< " Ruby cycles" << endl
;
150 if (m_periodic_output_file_ptr
!= &cerr
) {
151 delete m_periodic_output_file_ptr
;
154 m_periodic_output_file_ptr
= new ofstream(filename
.c_str());
155 g_eventQueue_ptr
->scheduleEvent(this, 1);
159 Profiler::setPeriodicStatsInterval(integer_t period
)
161 cout
<< "Recording periodic statistics every " << m_stats_period
162 << " Ruby cycles" << endl
;
164 m_stats_period
= period
;
165 g_eventQueue_ptr
->scheduleEvent(this, 1);
169 Profiler::printConfig(ostream
& out
) const
172 out
<< "Profiler Configuration" << endl
;
173 out
<< "----------------------" << endl
;
174 out
<< "periodic_stats_period: " << m_stats_period
<< endl
;
178 Profiler::print(ostream
& out
) const
184 Profiler::printStats(ostream
& out
, bool short_stats
)
190 out
<< "Profiler Stats" << endl
;
191 out
<< "--------------" << endl
;
193 time_t real_time_current
= time(NULL
);
194 double seconds
= difftime(real_time_current
, m_real_time_start_time
);
195 double minutes
= seconds
/ 60.0;
196 double hours
= minutes
/ 60.0;
197 double days
= hours
/ 24.0;
198 Time ruby_cycles
= g_eventQueue_ptr
->getTime()-m_ruby_start
;
201 out
<< "Elapsed_time_in_seconds: " << seconds
<< endl
;
202 out
<< "Elapsed_time_in_minutes: " << minutes
<< endl
;
203 out
<< "Elapsed_time_in_hours: " << hours
<< endl
;
204 out
<< "Elapsed_time_in_days: " << days
<< endl
;
208 // print the virtual runtimes as well
211 seconds
= (vtime
.tms_utime
+ vtime
.tms_stime
) / 100.0;
212 minutes
= seconds
/ 60.0;
213 hours
= minutes
/ 60.0;
215 out
<< "Virtual_time_in_seconds: " << seconds
<< endl
;
216 out
<< "Virtual_time_in_minutes: " << minutes
<< endl
;
217 out
<< "Virtual_time_in_hours: " << hours
<< endl
;
218 out
<< "Virtual_time_in_days: " << days
<< endl
;
221 out
<< "Ruby_current_time: " << g_eventQueue_ptr
->getTime() << endl
;
222 out
<< "Ruby_start_time: " << m_ruby_start
<< endl
;
223 out
<< "Ruby_cycles: " << ruby_cycles
<< endl
;
227 out
<< "mbytes_resident: " << process_memory_resident() << endl
;
228 out
<< "mbytes_total: " << process_memory_total() << endl
;
229 if (process_memory_total() > 0) {
230 out
<< "resident_ratio: "
231 << process_memory_resident()/process_memory_total() << endl
;
236 Vector
<integer_t
> perProcCycleCount
;
237 perProcCycleCount
.setSize(m_num_of_sequencers
);
239 for (int i
= 0; i
< m_num_of_sequencers
; i
++) {
240 perProcCycleCount
[i
] =
241 g_system_ptr
->getCycleCount(i
) - m_cycles_executed_at_start
[i
] + 1;
242 // The +1 allows us to avoid division by zero
245 out
<< "ruby_cycles_executed: " << perProcCycleCount
<< endl
;
250 out
<< "Busy Controller Counts:" << endl
;
251 for (int i
= 0; i
< MachineType_NUM
; i
++) {
252 int size
= MachineType_base_count((MachineType
)i
);
253 for (int j
= 0; j
< size
; j
++) {
255 machID
.type
= (MachineType
)i
;
257 out
<< machID
<< ":" << m_busyControllerCount
[i
][j
] << " ";
258 if ((j
+ 1) % 8 == 0) {
266 out
<< "Busy Bank Count:" << m_busyBankCount
<< endl
;
269 out
<< "sequencer_requests_outstanding: "
270 << m_sequencer_requests
<< endl
;
275 out
<< "All Non-Zero Cycle Demand Cache Accesses" << endl
;
276 out
<< "----------------------------------------" << endl
;
277 out
<< "miss_latency: " << m_allMissLatencyHistogram
<< endl
;
278 for (int i
= 0; i
< m_missLatencyHistograms
.size(); i
++) {
279 if (m_missLatencyHistograms
[i
].size() > 0) {
280 out
<< "miss_latency_" << RubyRequestType(i
) << ": "
281 << m_missLatencyHistograms
[i
] << endl
;
284 for (int i
= 0; i
< m_machLatencyHistograms
.size(); i
++) {
285 if (m_machLatencyHistograms
[i
].size() > 0) {
286 out
<< "miss_latency_" << GenericMachineType(i
) << ": "
287 << m_machLatencyHistograms
[i
] << endl
;
293 out
<< "All Non-Zero Cycle SW Prefetch Requests" << endl
;
294 out
<< "------------------------------------" << endl
;
295 out
<< "prefetch_latency: " << m_allSWPrefetchLatencyHistogram
<< endl
;
296 for (int i
= 0; i
< m_SWPrefetchLatencyHistograms
.size(); i
++) {
297 if (m_SWPrefetchLatencyHistograms
[i
].size() > 0) {
298 out
<< "prefetch_latency_" << CacheRequestType(i
) << ": "
299 << m_SWPrefetchLatencyHistograms
[i
] << endl
;
302 for (int i
= 0; i
< m_SWPrefetchMachLatencyHistograms
.size(); i
++) {
303 if (m_SWPrefetchMachLatencyHistograms
[i
].size() > 0) {
304 out
<< "prefetch_latency_" << GenericMachineType(i
) << ": "
305 << m_SWPrefetchMachLatencyHistograms
[i
] << endl
;
308 out
<< "prefetch_latency_L2Miss:"
309 << m_SWPrefetchL2MissLatencyHistogram
<< endl
;
311 if (m_all_sharing_histogram
.size() > 0) {
312 out
<< "all_sharing: " << m_all_sharing_histogram
<< endl
;
313 out
<< "read_sharing: " << m_read_sharing_histogram
<< endl
;
314 out
<< "write_sharing: " << m_write_sharing_histogram
<< endl
;
316 out
<< "all_sharing_percent: ";
317 m_all_sharing_histogram
.printPercent(out
);
320 out
<< "read_sharing_percent: ";
321 m_read_sharing_histogram
.printPercent(out
);
324 out
<< "write_sharing_percent: ";
325 m_write_sharing_histogram
.printPercent(out
);
328 int64 total_miss
= m_cache_to_cache
+ m_memory_to_cache
;
329 out
<< "all_misses: " << total_miss
<< endl
;
330 out
<< "cache_to_cache_misses: " << m_cache_to_cache
<< endl
;
331 out
<< "memory_to_cache_misses: " << m_memory_to_cache
<< endl
;
332 out
<< "cache_to_cache_percent: "
333 << 100.0 * (double(m_cache_to_cache
) / double(total_miss
))
335 out
<< "memory_to_cache_percent: "
336 << 100.0 * (double(m_memory_to_cache
) / double(total_miss
))
341 if (m_outstanding_requests
.size() > 0) {
342 out
<< "outstanding_requests: ";
343 m_outstanding_requests
.printPercent(out
);
350 out
<< "Request vs. RubySystem State Profile" << endl
;
351 out
<< "--------------------------------" << endl
;
354 Vector
<string
> requestProfileKeys
= m_requestProfileMap_ptr
->keys();
355 requestProfileKeys
.sortVector();
357 for (int i
= 0; i
< requestProfileKeys
.size(); i
++) {
359 m_requestProfileMap_ptr
->lookup(requestProfileKeys
[i
]);
360 double percent
= (100.0 * double(temp_int
)) / double(m_requests
);
361 while (requestProfileKeys
[i
] != "") {
362 out
<< setw(10) << string_split(requestProfileKeys
[i
], ':');
364 out
<< setw(11) << temp_int
;
365 out
<< setw(14) << percent
<< endl
;
369 out
<< "filter_action: " << m_filter_action_histogram
<< endl
;
371 if (!m_all_instructions
) {
372 m_address_profiler_ptr
->printStats(out
);
375 if (m_all_instructions
) {
376 m_inst_profiler_ptr
->printStats(out
);
380 out
<< "Message Delayed Cycles" << endl
;
381 out
<< "----------------------" << endl
;
382 out
<< "Total_delay_cycles: " << m_delayedCyclesHistogram
<< endl
;
383 out
<< "Total_nonPF_delay_cycles: "
384 << m_delayedCyclesNonPFHistogram
<< endl
;
385 for (int i
= 0; i
< m_delayedCyclesVCHistograms
.size(); i
++) {
386 out
<< " virtual_network_" << i
<< "_delay_cycles: "
387 << m_delayedCyclesVCHistograms
[i
] << endl
;
390 printResourceUsage(out
);
395 Profiler::printResourceUsage(ostream
& out
) const
398 out
<< "Resource Usage" << endl
;
399 out
<< "--------------" << endl
;
401 integer_t pagesize
= getpagesize(); // page size in bytes
402 out
<< "page_size: " << pagesize
<< endl
;
405 getrusage (RUSAGE_SELF
, &usage
);
407 out
<< "user_time: " << usage
.ru_utime
.tv_sec
<< endl
;
408 out
<< "system_time: " << usage
.ru_stime
.tv_sec
<< endl
;
409 out
<< "page_reclaims: " << usage
.ru_minflt
<< endl
;
410 out
<< "page_faults: " << usage
.ru_majflt
<< endl
;
411 out
<< "swaps: " << usage
.ru_nswap
<< endl
;
412 out
<< "block_inputs: " << usage
.ru_inblock
<< endl
;
413 out
<< "block_outputs: " << usage
.ru_oublock
<< endl
;
417 Profiler::clearStats()
419 m_ruby_start
= g_eventQueue_ptr
->getTime();
421 m_cycles_executed_at_start
.setSize(m_num_of_sequencers
);
422 for (int i
= 0; i
< m_num_of_sequencers
; i
++) {
423 if (g_system_ptr
== NULL
) {
424 m_cycles_executed_at_start
[i
] = 0;
426 m_cycles_executed_at_start
[i
] = g_system_ptr
->getCycleCount(i
);
430 m_busyControllerCount
.setSize(MachineType_NUM
); // all machines
431 for (int i
= 0; i
< MachineType_NUM
; i
++) {
432 int size
= MachineType_base_count((MachineType
)i
);
433 m_busyControllerCount
[i
].setSize(size
);
434 for (int j
= 0; j
< size
; j
++) {
435 m_busyControllerCount
[i
][j
] = 0;
440 m_delayedCyclesHistogram
.clear();
441 m_delayedCyclesNonPFHistogram
.clear();
442 int size
= RubySystem::getNetwork()->getNumberOfVirtualNetworks();
443 m_delayedCyclesVCHistograms
.setSize(size
);
444 for (int i
= 0; i
< size
; i
++) {
445 m_delayedCyclesVCHistograms
[i
].clear();
448 m_missLatencyHistograms
.setSize(RubyRequestType_NUM
);
449 for (int i
= 0; i
< m_missLatencyHistograms
.size(); i
++) {
450 m_missLatencyHistograms
[i
].clear(200);
452 m_machLatencyHistograms
.setSize(GenericMachineType_NUM
+1);
453 for (int i
= 0; i
< m_machLatencyHistograms
.size(); i
++) {
454 m_machLatencyHistograms
[i
].clear(200);
456 m_allMissLatencyHistogram
.clear(200);
458 m_SWPrefetchLatencyHistograms
.setSize(CacheRequestType_NUM
);
459 for (int i
= 0; i
< m_SWPrefetchLatencyHistograms
.size(); i
++) {
460 m_SWPrefetchLatencyHistograms
[i
].clear(200);
462 m_SWPrefetchMachLatencyHistograms
.setSize(GenericMachineType_NUM
+1);
463 for (int i
= 0; i
< m_SWPrefetchMachLatencyHistograms
.size(); i
++) {
464 m_SWPrefetchMachLatencyHistograms
[i
].clear(200);
466 m_allSWPrefetchLatencyHistogram
.clear(200);
468 m_sequencer_requests
.clear();
469 m_read_sharing_histogram
.clear();
470 m_write_sharing_histogram
.clear();
471 m_all_sharing_histogram
.clear();
472 m_cache_to_cache
= 0;
473 m_memory_to_cache
= 0;
476 m_requestProfileMap_ptr
->clear();
478 // count requests profiled
481 m_outstanding_requests
.clear();
482 m_outstanding_persistent_requests
.clear();
484 // Flush the prefetches through the system - used so that there
485 // are no outstanding requests after stats are cleared
486 //g_eventQueue_ptr->triggerAllEvents();
488 // update the start time
489 m_ruby_start
= g_eventQueue_ptr
->getTime();
493 Profiler::addAddressTraceSample(const CacheMsg
& msg
, NodeID id
)
495 if (msg
.getType() != CacheRequestType_IFETCH
) {
496 // Note: The following line should be commented out if you
497 // want to use the special profiling that is part of the GS320
500 // NOTE: Unless PROFILE_HOT_LINES is enabled, nothing will be
501 // profiled by the AddressProfiler
502 m_address_profiler_ptr
->
503 addTraceSample(msg
.getLineAddress(), msg
.getProgramCounter(),
504 msg
.getType(), msg
.getAccessMode(), id
, false);
509 Profiler::profileSharing(const Address
& addr
, AccessType type
,
510 NodeID requestor
, const Set
& sharers
,
513 Set
set_contacted(owner
);
514 if (type
== AccessType_Write
) {
515 set_contacted
.addSet(sharers
);
517 set_contacted
.remove(requestor
);
518 int number_contacted
= set_contacted
.count();
520 if (type
== AccessType_Write
) {
521 m_write_sharing_histogram
.add(number_contacted
);
523 m_read_sharing_histogram
.add(number_contacted
);
525 m_all_sharing_histogram
.add(number_contacted
);
527 if (number_contacted
== 0) {
535 Profiler::profileMsgDelay(int virtualNetwork
, int delayCycles
)
537 assert(virtualNetwork
< m_delayedCyclesVCHistograms
.size());
538 m_delayedCyclesHistogram
.add(delayCycles
);
539 m_delayedCyclesVCHistograms
[virtualNetwork
].add(delayCycles
);
540 if (virtualNetwork
!= 0) {
541 m_delayedCyclesNonPFHistogram
.add(delayCycles
);
545 // profiles original cache requests including PUTs
547 Profiler::profileRequest(const string
& requestStr
)
551 if (m_requestProfileMap_ptr
->exist(requestStr
)) {
552 (m_requestProfileMap_ptr
->lookup(requestStr
))++;
554 m_requestProfileMap_ptr
->add(requestStr
, 1);
559 Profiler::controllerBusy(MachineID machID
)
561 m_busyControllerCount
[(int)machID
.type
][(int)machID
.num
]++;
565 Profiler::profilePFWait(Time waitTime
)
567 m_prefetchWaitHistogram
.add(waitTime
);
576 // non-zero cycle demand request
578 Profiler::missLatency(Time t
, RubyRequestType type
)
580 m_allMissLatencyHistogram
.add(t
);
581 m_missLatencyHistograms
[type
].add(t
);
584 // non-zero cycle prefetch request
586 Profiler::swPrefetchLatency(Time t
, CacheRequestType type
,
587 GenericMachineType respondingMach
)
589 m_allSWPrefetchLatencyHistogram
.add(t
);
590 m_SWPrefetchLatencyHistograms
[type
].add(t
);
591 m_SWPrefetchMachLatencyHistograms
[respondingMach
].add(t
);
592 if (respondingMach
== GenericMachineType_Directory
||
593 respondingMach
== GenericMachineType_NUM
) {
594 m_SWPrefetchL2MissLatencyHistogram
.add(t
);
599 Profiler::profileTransition(const string
& component
, NodeID version
,
600 Address addr
, const string
& state
, const string
& event
,
601 const string
& next_state
, const string
& note
)
603 const int EVENT_SPACES
= 20;
604 const int ID_SPACES
= 3;
605 const int TIME_SPACES
= 7;
606 const int COMP_SPACES
= 10;
607 const int STATE_SPACES
= 6;
609 if (g_debug_ptr
->getDebugTime() <= 0 ||
610 g_eventQueue_ptr
->getTime() < g_debug_ptr
->getDebugTime())
613 ostream
&out
= *debug_cout_ptr
;
614 out
.flags(ios::right
);
615 out
<< setw(TIME_SPACES
) << g_eventQueue_ptr
->getTime() << " ";
616 out
<< setw(ID_SPACES
) << version
<< " ";
617 out
<< setw(COMP_SPACES
) << component
;
618 out
<< setw(EVENT_SPACES
) << event
<< " ";
620 out
.flags(ios::right
);
621 out
<< setw(STATE_SPACES
) << state
;
623 out
.flags(ios::left
);
624 out
<< setw(STATE_SPACES
) << next_state
;
626 out
<< " " << addr
<< " " << note
;
633 process_memory_total()
635 // 4kB page size, 1024*1024 bytes per MB,
636 const double MULTIPLIER
= 4096.0 / (1024.0 * 1024.0);
638 proc_file
.open("/proc/self/statm");
639 int total_size_in_pages
= 0;
640 int res_size_in_pages
= 0;
641 proc_file
>> total_size_in_pages
;
642 proc_file
>> res_size_in_pages
;
643 return double(total_size_in_pages
) * MULTIPLIER
; // size in megabytes
647 process_memory_resident()
649 // 4kB page size, 1024*1024 bytes per MB,
650 const double MULTIPLIER
= 4096.0 / (1024.0 * 1024.0);
652 proc_file
.open("/proc/self/statm");
653 int total_size_in_pages
= 0;
654 int res_size_in_pages
= 0;
655 proc_file
>> total_size_in_pages
;
656 proc_file
>> res_size_in_pages
;
657 return double(res_size_in_pages
) * MULTIPLIER
; // size in megabytes
661 Profiler::rubyWatch(int id
)
664 Address watch_address
= Address(tr
);
665 const int ID_SPACES
= 3;
666 const int TIME_SPACES
= 7;
668 ostream
&out
= *debug_cout_ptr
;
670 out
.flags(ios::right
);
671 out
<< setw(TIME_SPACES
) << g_eventQueue_ptr
->getTime() << " ";
672 out
<< setw(ID_SPACES
) << id
<< " "
673 << "RUBY WATCH " << watch_address
<< endl
;
675 if (!m_watch_address_list_ptr
->exist(watch_address
)) {
676 m_watch_address_list_ptr
->add(watch_address
, 1);
681 Profiler::watchAddress(Address addr
)
683 if (m_watch_address_list_ptr
->exist(addr
))
690 RubyProfilerParams::create()
692 return new Profiler(this);