2 * Copyright (c) 1999-2008 Mark D. Hill and David A. Wood
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are
7 * met: redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer;
9 * redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution;
12 * neither the name of the copyright holders nor the names of its
13 * contributors may be used to endorse or promote products derived from
14 * this software without specific prior written permission.
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 This file has been modified by Kevin Moore and Dan Nussbaum of the
31 Scalable Systems Research Group at Sun Microsystems Laboratories
32 (http://research.sun.com/scalable/) to support the Adaptive
33 Transactional Memory Test Platform (ATMTP).
35 Please send email to atmtp-interest@sun.com with feedback, questions, or
36 to request future announcements about ATMTP.
38 ----------------------------------------------------------------------
40 File modification date: 2008-02-23
42 ----------------------------------------------------------------------
48 * Description: See Profiler.h
54 #include "mem/ruby/profiler/Profiler.hh"
55 #include "mem/ruby/profiler/CacheProfiler.hh"
56 #include "mem/ruby/profiler/AddressProfiler.hh"
57 #include "mem/ruby/system/System.hh"
58 #include "mem/ruby/network/Network.hh"
59 #include "mem/gems_common/PrioHeap.hh"
60 #include "mem/protocol/CacheMsg.hh"
61 #include "mem/ruby/common/Driver.hh"
62 #include "mem/protocol/Protocol.hh"
63 #include "mem/gems_common/util.hh"
64 #include "mem/gems_common/Map.hh"
65 #include "mem/ruby/common/Debug.hh"
66 #include "mem/protocol/MachineType.hh"
68 // Allows use of times() library call, which determines virtual runtime
69 #include <sys/times.h>
71 extern std::ostream
* debug_cout_ptr
;
73 static double process_memory_total();
74 static double process_memory_resident();
77 : m_conflicting_histogram(-1)
79 m_requestProfileMap_ptr
= new Map
<string
, int>;
80 m_L1D_cache_profiler_ptr
= new CacheProfiler("L1D_cache");
81 m_L1I_cache_profiler_ptr
= new CacheProfiler("L1I_cache");
83 m_L2_cache_profiler_ptr
= new CacheProfiler("L2_cache");
85 m_address_profiler_ptr
= new AddressProfiler
;
86 m_inst_profiler_ptr
= NULL
;
87 if (PROFILE_ALL_INSTRUCTIONS
) {
88 m_inst_profiler_ptr
= new AddressProfiler
;
91 m_conflicting_map_ptr
= new Map
<Address
, Time
>;
93 m_real_time_start_time
= time(NULL
); // Not reset in clearStats()
94 m_stats_period
= 1000000; // Default
95 m_periodic_output_file_ptr
= &cerr
;
101 m_memReadWriteBusy
= 0;
102 m_memDataBusBusy
= 0;
115 int totalBanks
= RubyConfig::banksPerRank()
116 * RubyConfig::ranksPerDimm()
117 * RubyConfig::dimmsPerChannel();
118 m_memBankCount
.setSize(totalBanks
);
123 Profiler::~Profiler()
125 if (m_periodic_output_file_ptr
!= &cerr
) {
126 delete m_periodic_output_file_ptr
;
128 delete m_address_profiler_ptr
;
129 delete m_L1D_cache_profiler_ptr
;
130 delete m_L1I_cache_profiler_ptr
;
131 delete m_L2_cache_profiler_ptr
;
132 delete m_requestProfileMap_ptr
;
133 delete m_conflicting_map_ptr
;
136 void Profiler::wakeup()
138 // FIXME - avoid the repeated code
140 Vector
<integer_t
> perProcInstructionCount
;
141 perProcInstructionCount
.setSize(RubyConfig::numberOfProcessors());
143 Vector
<integer_t
> perProcCycleCount
;
144 perProcCycleCount
.setSize(RubyConfig::numberOfProcessors());
146 for(int i
=0; i
< RubyConfig::numberOfProcessors(); i
++) {
147 perProcInstructionCount
[i
] = g_system_ptr
->getDriver()->getInstructionCount(i
) - m_instructions_executed_at_start
[i
] + 1;
148 perProcCycleCount
[i
] = g_system_ptr
->getDriver()->getCycleCount(i
) - m_cycles_executed_at_start
[i
] + 1;
149 // The +1 allows us to avoid division by zero
152 integer_t total_misses
= m_perProcTotalMisses
.sum();
153 integer_t instruction_executed
= perProcInstructionCount
.sum();
154 integer_t cycles_executed
= perProcCycleCount
.sum();
155 integer_t transactions_started
= m_perProcStartTransaction
.sum();
156 integer_t transactions_ended
= m_perProcEndTransaction
.sum();
158 (*m_periodic_output_file_ptr
) << "ruby_cycles: " << g_eventQueue_ptr
->getTime()-m_ruby_start
<< endl
;
159 (*m_periodic_output_file_ptr
) << "total_misses: " << total_misses
<< " " << m_perProcTotalMisses
<< endl
;
160 (*m_periodic_output_file_ptr
) << "instruction_executed: " << instruction_executed
<< " " << perProcInstructionCount
<< endl
;
161 (*m_periodic_output_file_ptr
) << "cycles_executed: " << cycles_executed
<< " " << perProcCycleCount
<< endl
;
162 (*m_periodic_output_file_ptr
) << "transactions_started: " << transactions_started
<< " " << m_perProcStartTransaction
<< endl
;
163 (*m_periodic_output_file_ptr
) << "transactions_ended: " << transactions_ended
<< " " << m_perProcEndTransaction
<< endl
;
164 (*m_periodic_output_file_ptr
) << "L1TBE_usage: " << m_L1tbeProfile
<< endl
;
165 (*m_periodic_output_file_ptr
) << "L2TBE_usage: " << m_L2tbeProfile
<< endl
;
166 (*m_periodic_output_file_ptr
) << "mbytes_resident: " << process_memory_resident() << endl
;
167 (*m_periodic_output_file_ptr
) << "mbytes_total: " << process_memory_total() << endl
;
168 if (process_memory_total() > 0) {
169 (*m_periodic_output_file_ptr
) << "resident_ratio: " << process_memory_resident()/process_memory_total() << endl
;
171 (*m_periodic_output_file_ptr
) << "miss_latency: " << m_allMissLatencyHistogram
<< endl
;
173 *m_periodic_output_file_ptr
<< endl
;
175 if (PROFILE_ALL_INSTRUCTIONS
) {
176 m_inst_profiler_ptr
->printStats(*m_periodic_output_file_ptr
);
179 //g_system_ptr->getNetwork()->printStats(*m_periodic_output_file_ptr);
180 g_eventQueue_ptr
->scheduleEvent(this, m_stats_period
);
183 void Profiler::setPeriodicStatsFile(const string
& filename
)
185 cout
<< "Recording periodic statistics to file '" << filename
<< "' every "
186 << m_stats_period
<< " Ruby cycles" << endl
;
188 if (m_periodic_output_file_ptr
!= &cerr
) {
189 delete m_periodic_output_file_ptr
;
192 m_periodic_output_file_ptr
= new ofstream(filename
.c_str());
193 g_eventQueue_ptr
->scheduleEvent(this, 1);
196 void Profiler::setPeriodicStatsInterval(integer_t period
)
198 cout
<< "Recording periodic statistics every " << m_stats_period
<< " Ruby cycles" << endl
;
199 m_stats_period
= period
;
200 g_eventQueue_ptr
->scheduleEvent(this, 1);
203 void Profiler::printConfig(ostream
& out
) const
206 out
<< "Profiler Configuration" << endl
;
207 out
<< "----------------------" << endl
;
208 out
<< "periodic_stats_period: " << m_stats_period
<< endl
;
211 void Profiler::print(ostream
& out
) const
216 void Profiler::printStats(ostream
& out
, bool short_stats
)
222 out
<< "Profiler Stats" << endl
;
223 out
<< "--------------" << endl
;
225 time_t real_time_current
= time(NULL
);
226 double seconds
= difftime(real_time_current
, m_real_time_start_time
);
227 double minutes
= seconds
/60.0;
228 double hours
= minutes
/60.0;
229 double days
= hours
/24.0;
230 Time ruby_cycles
= g_eventQueue_ptr
->getTime()-m_ruby_start
;
233 out
<< "Elapsed_time_in_seconds: " << seconds
<< endl
;
234 out
<< "Elapsed_time_in_minutes: " << minutes
<< endl
;
235 out
<< "Elapsed_time_in_hours: " << hours
<< endl
;
236 out
<< "Elapsed_time_in_days: " << days
<< endl
;
240 // print the virtual runtimes as well
243 seconds
= (vtime
.tms_utime
+ vtime
.tms_stime
) / 100.0;
244 minutes
= seconds
/ 60.0;
245 hours
= minutes
/ 60.0;
247 out
<< "Virtual_time_in_seconds: " << seconds
<< endl
;
248 out
<< "Virtual_time_in_minutes: " << minutes
<< endl
;
249 out
<< "Virtual_time_in_hours: " << hours
<< endl
;
250 out
<< "Virtual_time_in_days: " << hours
<< endl
;
253 out
<< "Ruby_current_time: " << g_eventQueue_ptr
->getTime() << endl
;
254 out
<< "Ruby_start_time: " << m_ruby_start
<< endl
;
255 out
<< "Ruby_cycles: " << ruby_cycles
<< endl
;
259 out
<< "mbytes_resident: " << process_memory_resident() << endl
;
260 out
<< "mbytes_total: " << process_memory_total() << endl
;
261 if (process_memory_total() > 0) {
262 out
<< "resident_ratio: " << process_memory_resident()/process_memory_total() << endl
;
266 if(m_num_BA_broadcasts
+ m_num_BA_unicasts
!= 0){
268 out
<< "Broadcast_percent: " << (float)m_num_BA_broadcasts
/(m_num_BA_broadcasts
+m_num_BA_unicasts
) << endl
;
272 Vector
<integer_t
> perProcInstructionCount
;
273 Vector
<integer_t
> perProcCycleCount
;
274 Vector
<double> perProcCPI
;
275 Vector
<double> perProcMissesPerInsn
;
276 Vector
<double> perProcInsnPerTrans
;
277 Vector
<double> perProcCyclesPerTrans
;
278 Vector
<double> perProcMissesPerTrans
;
280 perProcInstructionCount
.setSize(RubyConfig::numberOfProcessors());
281 perProcCycleCount
.setSize(RubyConfig::numberOfProcessors());
282 perProcCPI
.setSize(RubyConfig::numberOfProcessors());
283 perProcMissesPerInsn
.setSize(RubyConfig::numberOfProcessors());
285 perProcInsnPerTrans
.setSize(RubyConfig::numberOfProcessors());
286 perProcCyclesPerTrans
.setSize(RubyConfig::numberOfProcessors());
287 perProcMissesPerTrans
.setSize(RubyConfig::numberOfProcessors());
289 for(int i
=0; i
< RubyConfig::numberOfProcessors(); i
++) {
290 perProcInstructionCount
[i
] = g_system_ptr
->getDriver()->getInstructionCount(i
) - m_instructions_executed_at_start
[i
] + 1;
291 perProcCycleCount
[i
] = g_system_ptr
->getDriver()->getCycleCount(i
) - m_cycles_executed_at_start
[i
] + 1;
292 // The +1 allows us to avoid division by zero
293 perProcCPI
[i
] = double(ruby_cycles
)/perProcInstructionCount
[i
];
294 perProcMissesPerInsn
[i
] = 1000.0 * (double(m_perProcTotalMisses
[i
]) / double(perProcInstructionCount
[i
]));
296 int trans
= m_perProcEndTransaction
[i
];
298 perProcInsnPerTrans
[i
] = 0;
299 perProcCyclesPerTrans
[i
] = 0;
300 perProcMissesPerTrans
[i
] = 0;
302 perProcInsnPerTrans
[i
] = perProcInstructionCount
[i
] / double(trans
);
303 perProcCyclesPerTrans
[i
] = ruby_cycles
/ double(trans
);
304 perProcMissesPerTrans
[i
] = m_perProcTotalMisses
[i
] / double(trans
);
308 integer_t total_misses
= m_perProcTotalMisses
.sum();
309 integer_t user_misses
= m_perProcUserMisses
.sum();
310 integer_t supervisor_misses
= m_perProcSupervisorMisses
.sum();
311 integer_t instruction_executed
= perProcInstructionCount
.sum();
312 integer_t cycles_executed
= perProcCycleCount
.sum();
313 integer_t transactions_started
= m_perProcStartTransaction
.sum();
314 integer_t transactions_ended
= m_perProcEndTransaction
.sum();
316 double instructions_per_transaction
= (transactions_ended
!= 0) ? double(instruction_executed
) / double(transactions_ended
) : 0;
317 double cycles_per_transaction
= (transactions_ended
!= 0) ? (RubyConfig::numberOfProcessors() * double(ruby_cycles
)) / double(transactions_ended
) : 0;
318 double misses_per_transaction
= (transactions_ended
!= 0) ? double(total_misses
) / double(transactions_ended
) : 0;
320 out
<< "Total_misses: " << total_misses
<< endl
;
321 out
<< "total_misses: " << total_misses
<< " " << m_perProcTotalMisses
<< endl
;
322 out
<< "user_misses: " << user_misses
<< " " << m_perProcUserMisses
<< endl
;
323 out
<< "supervisor_misses: " << supervisor_misses
<< " " << m_perProcSupervisorMisses
<< endl
;
325 out
<< "instruction_executed: " << instruction_executed
<< " " << perProcInstructionCount
<< endl
;
326 out
<< "cycles_executed: " << cycles_executed
<< " " << perProcCycleCount
<< endl
;
327 out
<< "cycles_per_instruction: " << (RubyConfig::numberOfProcessors()*double(ruby_cycles
))/double(instruction_executed
) << " " << perProcCPI
<< endl
;
328 out
<< "misses_per_thousand_instructions: " << 1000.0 * (double(total_misses
) / double(instruction_executed
)) << " " << perProcMissesPerInsn
<< endl
;
330 out
<< "transactions_started: " << transactions_started
<< " " << m_perProcStartTransaction
<< endl
;
331 out
<< "transactions_ended: " << transactions_ended
<< " " << m_perProcEndTransaction
<< endl
;
332 out
<< "instructions_per_transaction: " << instructions_per_transaction
<< " " << perProcInsnPerTrans
<< endl
;
333 out
<< "cycles_per_transaction: " << cycles_per_transaction
<< " " << perProcCyclesPerTrans
<< endl
;
334 out
<< "misses_per_transaction: " << misses_per_transaction
<< " " << perProcMissesPerTrans
<< endl
;
338 m_L1D_cache_profiler_ptr
->printStats(out
);
339 m_L1I_cache_profiler_ptr
->printStats(out
);
340 m_L2_cache_profiler_ptr
->printStats(out
);
344 if (m_memReq
|| m_memRefresh
) { // if there's a memory controller at all
345 long long int total_stalls
= m_memInputQ
+ m_memBankQ
+ m_memWaitCycles
;
346 double stallsPerReq
= total_stalls
* 1.0 / m_memReq
;
347 out
<< "Memory control:" << endl
;
348 out
<< " memory_total_requests: " << m_memReq
<< endl
; // does not include refreshes
349 out
<< " memory_reads: " << m_memRead
<< endl
;
350 out
<< " memory_writes: " << m_memWrite
<< endl
;
351 out
<< " memory_refreshes: " << m_memRefresh
<< endl
;
352 out
<< " memory_total_request_delays: " << total_stalls
<< endl
;
353 out
<< " memory_delays_per_request: " << stallsPerReq
<< endl
;
354 out
<< " memory_delays_in_input_queue: " << m_memInputQ
<< endl
;
355 out
<< " memory_delays_behind_head_of_bank_queue: " << m_memBankQ
<< endl
;
356 out
<< " memory_delays_stalled_at_head_of_bank_queue: " << m_memWaitCycles
<< endl
;
357 // Note: The following "memory stalls" entries are a breakdown of the
358 // cycles which already showed up in m_memWaitCycles. The order is
359 // significant; it is the priority of attributing the cycles.
360 // For example, bank_busy is before arbitration because if the bank was
361 // busy, we didn't even check arbitration.
362 // Note: "not old enough" means that since we grouped waiting heads-of-queues
363 // into batches to avoid starvation, a request in a newer batch
364 // didn't try to arbitrate yet because there are older requests waiting.
365 out
<< " memory_stalls_for_bank_busy: " << m_memBankBusy
<< endl
;
366 out
<< " memory_stalls_for_random_busy: " << m_memRandBusy
<< endl
;
367 out
<< " memory_stalls_for_anti_starvation: " << m_memNotOld
<< endl
;
368 out
<< " memory_stalls_for_arbitration: " << m_memArbWait
<< endl
;
369 out
<< " memory_stalls_for_bus: " << m_memBusBusy
<< endl
;
370 out
<< " memory_stalls_for_tfaw: " << m_memTfawBusy
<< endl
;
371 out
<< " memory_stalls_for_read_write_turnaround: " << m_memReadWriteBusy
<< endl
;
372 out
<< " memory_stalls_for_read_read_turnaround: " << m_memDataBusBusy
<< endl
;
373 out
<< " accesses_per_bank: ";
374 for (int bank
=0; bank
< m_memBankCount
.size(); bank
++) {
375 out
<< m_memBankCount
[bank
] << " ";
376 //if ((bank % 8) == 7) out << " " << endl;
383 out
<< "Busy Controller Counts:" << endl
;
384 for(int i
=0; i
< MachineType_NUM
; i
++) {
385 for(int j
=0; j
< MachineType_base_count((MachineType
)i
); j
++) {
387 machID
.type
= (MachineType
)i
;
389 out
<< machID
<< ":" << m_busyControllerCount
[i
][j
] << " ";
398 out
<< "Busy Bank Count:" << m_busyBankCount
<< endl
;
401 out
<< "L1TBE_usage: " << m_L1tbeProfile
<< endl
;
402 out
<< "L2TBE_usage: " << m_L2tbeProfile
<< endl
;
403 out
<< "StopTable_usage: " << m_stopTableProfile
<< endl
;
404 out
<< "sequencer_requests_outstanding: " << m_sequencer_requests
<< endl
;
405 out
<< "store_buffer_size: " << m_store_buffer_size
<< endl
;
406 out
<< "unique_blocks_in_store_buffer: " << m_store_buffer_blocks
<< endl
;
411 out
<< "All Non-Zero Cycle Demand Cache Accesses" << endl
;
412 out
<< "----------------------------------------" << endl
;
413 out
<< "miss_latency: " << m_allMissLatencyHistogram
<< endl
;
414 for(int i
=0; i
<m_missLatencyHistograms
.size(); i
++) {
415 if (m_missLatencyHistograms
[i
].size() > 0) {
416 out
<< "miss_latency_" << CacheRequestType(i
) << ": " << m_missLatencyHistograms
[i
] << endl
;
419 for(int i
=0; i
<m_machLatencyHistograms
.size(); i
++) {
420 if (m_machLatencyHistograms
[i
].size() > 0) {
421 out
<< "miss_latency_" << GenericMachineType(i
) << ": " << m_machLatencyHistograms
[i
] << endl
;
424 out
<< "miss_latency_L2Miss: " << m_L2MissLatencyHistogram
<< endl
;
428 out
<< "All Non-Zero Cycle SW Prefetch Requests" << endl
;
429 out
<< "------------------------------------" << endl
;
430 out
<< "prefetch_latency: " << m_allSWPrefetchLatencyHistogram
<< endl
;
431 for(int i
=0; i
<m_SWPrefetchLatencyHistograms
.size(); i
++) {
432 if (m_SWPrefetchLatencyHistograms
[i
].size() > 0) {
433 out
<< "prefetch_latency_" << CacheRequestType(i
) << ": " << m_SWPrefetchLatencyHistograms
[i
] << endl
;
436 for(int i
=0; i
<m_SWPrefetchMachLatencyHistograms
.size(); i
++) {
437 if (m_SWPrefetchMachLatencyHistograms
[i
].size() > 0) {
438 out
<< "prefetch_latency_" << GenericMachineType(i
) << ": " << m_SWPrefetchMachLatencyHistograms
[i
] << endl
;
441 out
<< "prefetch_latency_L2Miss:" << m_SWPrefetchL2MissLatencyHistogram
<< endl
;
443 out
<< "multicast_retries: " << m_multicast_retry_histogram
<< endl
;
444 out
<< "gets_mask_prediction_count: " << m_gets_mask_prediction
<< endl
;
445 out
<< "getx_mask_prediction_count: " << m_getx_mask_prediction
<< endl
;
446 out
<< "explicit_training_mask: " << m_explicit_training_mask
<< endl
;
449 if (m_all_sharing_histogram
.size() > 0) {
450 out
<< "all_sharing: " << m_all_sharing_histogram
<< endl
;
451 out
<< "read_sharing: " << m_read_sharing_histogram
<< endl
;
452 out
<< "write_sharing: " << m_write_sharing_histogram
<< endl
;
454 out
<< "all_sharing_percent: "; m_all_sharing_histogram
.printPercent(out
); out
<< endl
;
455 out
<< "read_sharing_percent: "; m_read_sharing_histogram
.printPercent(out
); out
<< endl
;
456 out
<< "write_sharing_percent: "; m_write_sharing_histogram
.printPercent(out
); out
<< endl
;
458 int64 total_miss
= m_cache_to_cache
+ m_memory_to_cache
;
459 out
<< "all_misses: " << total_miss
<< endl
;
460 out
<< "cache_to_cache_misses: " << m_cache_to_cache
<< endl
;
461 out
<< "memory_to_cache_misses: " << m_memory_to_cache
<< endl
;
462 out
<< "cache_to_cache_percent: " << 100.0 * (double(m_cache_to_cache
) / double(total_miss
)) << endl
;
463 out
<< "memory_to_cache_percent: " << 100.0 * (double(m_memory_to_cache
) / double(total_miss
)) << endl
;
467 if (m_conflicting_histogram
.size() > 0) {
468 out
<< "conflicting_histogram: " << m_conflicting_histogram
<< endl
;
469 out
<< "conflicting_histogram_percent: "; m_conflicting_histogram
.printPercent(out
); out
<< endl
;
473 if (m_outstanding_requests
.size() > 0) {
474 out
<< "outstanding_requests: "; m_outstanding_requests
.printPercent(out
); out
<< endl
;
475 if (m_outstanding_persistent_requests
.size() > 0) {
476 out
<< "outstanding_persistent_requests: "; m_outstanding_persistent_requests
.printPercent(out
); out
<< endl
;
483 out
<< "Request vs. RubySystem State Profile" << endl
;
484 out
<< "--------------------------------" << endl
;
487 Vector
<string
> requestProfileKeys
= m_requestProfileMap_ptr
->keys();
488 requestProfileKeys
.sortVector();
490 for(int i
=0; i
<requestProfileKeys
.size(); i
++) {
491 int temp_int
= m_requestProfileMap_ptr
->lookup(requestProfileKeys
[i
]);
492 double percent
= (100.0*double(temp_int
))/double(m_requests
);
493 while (requestProfileKeys
[i
] != "") {
494 out
<< setw(10) << string_split(requestProfileKeys
[i
], ':');
496 out
<< setw(11) << temp_int
;
497 out
<< setw(14) << percent
<< endl
;
501 out
<< "filter_action: " << m_filter_action_histogram
<< endl
;
503 if (!PROFILE_ALL_INSTRUCTIONS
) {
504 m_address_profiler_ptr
->printStats(out
);
507 if (PROFILE_ALL_INSTRUCTIONS
) {
508 m_inst_profiler_ptr
->printStats(out
);
512 out
<< "Message Delayed Cycles" << endl
;
513 out
<< "----------------------" << endl
;
514 out
<< "Total_delay_cycles: " << m_delayedCyclesHistogram
<< endl
;
515 out
<< "Total_nonPF_delay_cycles: " << m_delayedCyclesNonPFHistogram
<< endl
;
516 for (int i
= 0; i
< m_delayedCyclesVCHistograms
.size(); i
++) {
517 out
<< " virtual_network_" << i
<< "_delay_cycles: " << m_delayedCyclesVCHistograms
[i
] << endl
;
520 printResourceUsage(out
);
525 void Profiler::printResourceUsage(ostream
& out
) const
528 out
<< "Resource Usage" << endl
;
529 out
<< "--------------" << endl
;
531 integer_t pagesize
= getpagesize(); // page size in bytes
532 out
<< "page_size: " << pagesize
<< endl
;
535 getrusage (RUSAGE_SELF
, &usage
);
537 out
<< "user_time: " << usage
.ru_utime
.tv_sec
<< endl
;
538 out
<< "system_time: " << usage
.ru_stime
.tv_sec
<< endl
;
539 out
<< "page_reclaims: " << usage
.ru_minflt
<< endl
;
540 out
<< "page_faults: " << usage
.ru_majflt
<< endl
;
541 out
<< "swaps: " << usage
.ru_nswap
<< endl
;
542 out
<< "block_inputs: " << usage
.ru_inblock
<< endl
;
543 out
<< "block_outputs: " << usage
.ru_oublock
<< endl
;
546 void Profiler::clearStats()
548 m_num_BA_unicasts
= 0;
549 m_num_BA_broadcasts
= 0;
551 m_ruby_start
= g_eventQueue_ptr
->getTime();
553 m_instructions_executed_at_start
.setSize(RubyConfig::numberOfProcessors());
554 m_cycles_executed_at_start
.setSize(RubyConfig::numberOfProcessors());
555 for (int i
=0; i
< RubyConfig::numberOfProcessors(); i
++) {
556 if (g_system_ptr
== NULL
) {
557 m_instructions_executed_at_start
[i
] = 0;
558 m_cycles_executed_at_start
[i
] = 0;
560 m_instructions_executed_at_start
[i
] = g_system_ptr
->getDriver()->getInstructionCount(i
);
561 m_cycles_executed_at_start
[i
] = g_system_ptr
->getDriver()->getCycleCount(i
);
565 m_perProcTotalMisses
.setSize(RubyConfig::numberOfProcessors());
566 m_perProcUserMisses
.setSize(RubyConfig::numberOfProcessors());
567 m_perProcSupervisorMisses
.setSize(RubyConfig::numberOfProcessors());
568 m_perProcStartTransaction
.setSize(RubyConfig::numberOfProcessors());
569 m_perProcEndTransaction
.setSize(RubyConfig::numberOfProcessors());
571 for(int i
=0; i
< RubyConfig::numberOfProcessors(); i
++) {
572 m_perProcTotalMisses
[i
] = 0;
573 m_perProcUserMisses
[i
] = 0;
574 m_perProcSupervisorMisses
[i
] = 0;
575 m_perProcStartTransaction
[i
] = 0;
576 m_perProcEndTransaction
[i
] = 0;
579 m_busyControllerCount
.setSize(MachineType_NUM
); // all machines
580 for(int i
=0; i
< MachineType_NUM
; i
++) {
581 m_busyControllerCount
[i
].setSize(MachineType_base_count((MachineType
)i
));
582 for(int j
=0; j
< MachineType_base_count((MachineType
)i
); j
++) {
583 m_busyControllerCount
[i
][j
] = 0;
588 m_delayedCyclesHistogram
.clear();
589 m_delayedCyclesNonPFHistogram
.clear();
590 m_delayedCyclesVCHistograms
.setSize(NUMBER_OF_VIRTUAL_NETWORKS
);
591 for (int i
= 0; i
< NUMBER_OF_VIRTUAL_NETWORKS
; i
++) {
592 m_delayedCyclesVCHistograms
[i
].clear();
595 m_gets_mask_prediction
.clear();
596 m_getx_mask_prediction
.clear();
597 m_explicit_training_mask
.clear();
599 m_missLatencyHistograms
.setSize(CacheRequestType_NUM
);
600 for(int i
=0; i
<m_missLatencyHistograms
.size(); i
++) {
601 m_missLatencyHistograms
[i
].clear(200);
603 m_machLatencyHistograms
.setSize(GenericMachineType_NUM
+1);
604 for(int i
=0; i
<m_machLatencyHistograms
.size(); i
++) {
605 m_machLatencyHistograms
[i
].clear(200);
607 m_allMissLatencyHistogram
.clear(200);
608 m_L2MissLatencyHistogram
.clear(200);
610 m_SWPrefetchLatencyHistograms
.setSize(CacheRequestType_NUM
);
611 for(int i
=0; i
<m_SWPrefetchLatencyHistograms
.size(); i
++) {
612 m_SWPrefetchLatencyHistograms
[i
].clear(200);
614 m_SWPrefetchMachLatencyHistograms
.setSize(GenericMachineType_NUM
+1);
615 for(int i
=0; i
<m_SWPrefetchMachLatencyHistograms
.size(); i
++) {
616 m_SWPrefetchMachLatencyHistograms
[i
].clear(200);
618 m_allSWPrefetchLatencyHistogram
.clear(200);
619 m_SWPrefetchL2MissLatencyHistogram
.clear(200);
621 m_multicast_retry_histogram
.clear();
623 m_L1tbeProfile
.clear();
624 m_L2tbeProfile
.clear();
625 m_stopTableProfile
.clear();
626 m_filter_action_histogram
.clear();
628 m_sequencer_requests
.clear();
629 m_store_buffer_size
.clear();
630 m_store_buffer_blocks
.clear();
631 m_read_sharing_histogram
.clear();
632 m_write_sharing_histogram
.clear();
633 m_all_sharing_histogram
.clear();
634 m_cache_to_cache
= 0;
635 m_memory_to_cache
= 0;
638 m_predictionOpportunities
= 0;
639 m_goodPredictions
= 0;
642 m_requestProfileMap_ptr
->clear();
644 // count requests profiled
647 // Conflicting requests
648 m_conflicting_map_ptr
->clear();
649 m_conflicting_histogram
.clear();
651 m_outstanding_requests
.clear();
652 m_outstanding_persistent_requests
.clear();
654 m_L1D_cache_profiler_ptr
->clearStats();
655 m_L1I_cache_profiler_ptr
->clearStats();
656 m_L2_cache_profiler_ptr
->clearStats();
658 // for MemoryControl:
663 m_memReadWriteBusy
= 0;
664 m_memDataBusBusy
= 0;
675 for (int bank
=0; bank
< m_memBankCount
.size(); bank
++) {
676 m_memBankCount
[bank
] = 0;
679 // Flush the prefetches through the system - used so that there are no outstanding requests after stats are cleared
680 //g_eventQueue_ptr->triggerAllEvents();
682 // update the start time
683 m_ruby_start
= g_eventQueue_ptr
->getTime();
686 void Profiler::addPrimaryStatSample(const CacheMsg
& msg
, NodeID id
)
688 if (Protocol::m_TwoLevelCache
) {
689 if (msg
.getType() == CacheRequestType_IFETCH
) {
690 addL1IStatSample(msg
, id
);
692 addL1DStatSample(msg
, id
);
694 // profile the address after an L1 miss (outside of the processor for CMP)
695 if (Protocol::m_CMP
) {
696 addAddressTraceSample(msg
, id
);
699 addL2StatSample(CacheRequestType_to_GenericRequestType(msg
.getType()),
700 msg
.getAccessMode(), msg
.getSize(), msg
.getPrefetch(), id
);
701 addAddressTraceSample(msg
, id
);
705 void Profiler::profileConflictingRequests(const Address
& addr
)
707 assert(addr
== line_address(addr
));
708 Time last_time
= m_ruby_start
;
709 if (m_conflicting_map_ptr
->exist(addr
)) {
710 last_time
= m_conflicting_map_ptr
->lookup(addr
);
712 Time current_time
= g_eventQueue_ptr
->getTime();
713 assert (current_time
- last_time
> 0);
714 m_conflicting_histogram
.add(current_time
- last_time
);
715 m_conflicting_map_ptr
->add(addr
, current_time
);
718 void Profiler::addSecondaryStatSample(CacheRequestType requestType
, AccessModeType type
, int msgSize
, PrefetchBit pfBit
, NodeID id
)
720 addSecondaryStatSample(CacheRequestType_to_GenericRequestType(requestType
), type
, msgSize
, pfBit
, id
);
723 void Profiler::addSecondaryStatSample(GenericRequestType requestType
, AccessModeType type
, int msgSize
, PrefetchBit pfBit
, NodeID id
)
725 addL2StatSample(requestType
, type
, msgSize
, pfBit
, id
);
728 void Profiler::addL2StatSample(GenericRequestType requestType
, AccessModeType type
, int msgSize
, PrefetchBit pfBit
, NodeID id
)
730 m_perProcTotalMisses
[id
]++;
731 if (type
== AccessModeType_SupervisorMode
) {
732 m_perProcSupervisorMisses
[id
]++;
734 m_perProcUserMisses
[id
]++;
736 m_L2_cache_profiler_ptr
->addStatSample(requestType
, type
, msgSize
, pfBit
);
739 void Profiler::addL1DStatSample(const CacheMsg
& msg
, NodeID id
)
741 m_L1D_cache_profiler_ptr
->addStatSample(CacheRequestType_to_GenericRequestType(msg
.getType()),
742 msg
.getAccessMode(), msg
.getSize(), msg
.getPrefetch());
745 void Profiler::addL1IStatSample(const CacheMsg
& msg
, NodeID id
)
747 m_L1I_cache_profiler_ptr
->addStatSample(CacheRequestType_to_GenericRequestType(msg
.getType()),
748 msg
.getAccessMode(), msg
.getSize(), msg
.getPrefetch());
751 void Profiler::addAddressTraceSample(const CacheMsg
& msg
, NodeID id
)
753 if (msg
.getType() != CacheRequestType_IFETCH
) {
755 // Note: The following line should be commented out if you want to
756 // use the special profiling that is part of the GS320 protocol
758 // NOTE: Unless PROFILE_HOT_LINES or PROFILE_ALL_INSTRUCTIONS are enabled, nothing will be profiled by the AddressProfiler
759 m_address_profiler_ptr
->addTraceSample(msg
.getAddress(), msg
.getProgramCounter(), msg
.getType(), msg
.getAccessMode(), id
, false);
763 void Profiler::profileSharing(const Address
& addr
, AccessType type
, NodeID requestor
, const Set
& sharers
, const Set
& owner
)
765 Set
set_contacted(owner
);
766 if (type
== AccessType_Write
) {
767 set_contacted
.addSet(sharers
);
769 set_contacted
.remove(requestor
);
770 int number_contacted
= set_contacted
.count();
772 if (type
== AccessType_Write
) {
773 m_write_sharing_histogram
.add(number_contacted
);
775 m_read_sharing_histogram
.add(number_contacted
);
777 m_all_sharing_histogram
.add(number_contacted
);
779 if (number_contacted
== 0) {
787 void Profiler::profileMsgDelay(int virtualNetwork
, int delayCycles
) {
788 assert(virtualNetwork
< m_delayedCyclesVCHistograms
.size());
789 m_delayedCyclesHistogram
.add(delayCycles
);
790 m_delayedCyclesVCHistograms
[virtualNetwork
].add(delayCycles
);
791 if (virtualNetwork
!= 0) {
792 m_delayedCyclesNonPFHistogram
.add(delayCycles
);
796 // profiles original cache requests including PUTs
797 void Profiler::profileRequest(const string
& requestStr
)
801 if (m_requestProfileMap_ptr
->exist(requestStr
)) {
802 (m_requestProfileMap_ptr
->lookup(requestStr
))++;
804 m_requestProfileMap_ptr
->add(requestStr
, 1);
808 void Profiler::recordPrediction(bool wasGood
, bool wasPredicted
)
810 m_predictionOpportunities
++;
819 void Profiler::profileFilterAction(int action
)
821 m_filter_action_histogram
.add(action
);
824 void Profiler::profileMulticastRetry(const Address
& addr
, int count
)
826 m_multicast_retry_histogram
.add(count
);
829 void Profiler::startTransaction(int cpu
)
831 m_perProcStartTransaction
[cpu
]++;
834 void Profiler::endTransaction(int cpu
)
836 m_perProcEndTransaction
[cpu
]++;
839 void Profiler::controllerBusy(MachineID machID
)
841 m_busyControllerCount
[(int)machID
.type
][(int)machID
.num
]++;
844 void Profiler::profilePFWait(Time waitTime
)
846 m_prefetchWaitHistogram
.add(waitTime
);
849 void Profiler::bankBusy()
854 // non-zero cycle demand request
855 void Profiler::missLatency(Time t
, CacheRequestType type
, GenericMachineType respondingMach
)
857 m_allMissLatencyHistogram
.add(t
);
858 m_missLatencyHistograms
[type
].add(t
);
859 m_machLatencyHistograms
[respondingMach
].add(t
);
860 if(respondingMach
== GenericMachineType_Directory
|| respondingMach
== GenericMachineType_NUM
) {
861 m_L2MissLatencyHistogram
.add(t
);
865 // non-zero cycle prefetch request
866 void Profiler::swPrefetchLatency(Time t
, CacheRequestType type
, GenericMachineType respondingMach
)
868 m_allSWPrefetchLatencyHistogram
.add(t
);
869 m_SWPrefetchLatencyHistograms
[type
].add(t
);
870 m_SWPrefetchMachLatencyHistograms
[respondingMach
].add(t
);
871 if(respondingMach
== GenericMachineType_Directory
|| respondingMach
== GenericMachineType_NUM
) {
872 m_SWPrefetchL2MissLatencyHistogram
.add(t
);
876 void Profiler::profileTransition(const string
& component
, NodeID id
, NodeID version
, Address addr
,
877 const string
& state
, const string
& event
,
878 const string
& next_state
, const string
& note
)
880 const int EVENT_SPACES
= 20;
881 const int ID_SPACES
= 3;
882 const int TIME_SPACES
= 7;
883 const int COMP_SPACES
= 10;
884 const int STATE_SPACES
= 6;
886 if ((g_debug_ptr
->getDebugTime() > 0) &&
887 (g_eventQueue_ptr
->getTime() >= g_debug_ptr
->getDebugTime())) {
888 (* debug_cout_ptr
).flags(ios::right
);
889 (* debug_cout_ptr
) << setw(TIME_SPACES
) << g_eventQueue_ptr
->getTime() << " ";
890 (* debug_cout_ptr
) << setw(ID_SPACES
) << id
<< " ";
891 (* debug_cout_ptr
) << setw(ID_SPACES
) << version
<< " ";
892 (* debug_cout_ptr
) << setw(COMP_SPACES
) << component
;
893 (* debug_cout_ptr
) << setw(EVENT_SPACES
) << event
<< " ";
894 for (int i
=0; i
< RubyConfig::numberOfProcessors(); i
++) {
897 (* debug_cout_ptr
).flags(ios::right
);
898 (* debug_cout_ptr
) << setw(STATE_SPACES
) << state
;
899 (* debug_cout_ptr
) << ">";
900 (* debug_cout_ptr
).flags(ios::left
);
901 (* debug_cout_ptr
) << setw(STATE_SPACES
) << next_state
;
903 // cout << setw(STATE_SPACES) << " " << " " << setw(STATE_SPACES) << " ";
906 (* debug_cout_ptr
) << " " << addr
<< " " << note
;
908 (* debug_cout_ptr
) << endl
;
913 static double process_memory_total()
915 const double MULTIPLIER
= 4096.0/(1024.0*1024.0); // 4kB page size, 1024*1024 bytes per MB,
917 proc_file
.open("/proc/self/statm");
918 int total_size_in_pages
= 0;
919 int res_size_in_pages
= 0;
920 proc_file
>> total_size_in_pages
;
921 proc_file
>> res_size_in_pages
;
922 return double(total_size_in_pages
)*MULTIPLIER
; // size in megabytes
925 static double process_memory_resident()
927 const double MULTIPLIER
= 4096.0/(1024.0*1024.0); // 4kB page size, 1024*1024 bytes per MB,
929 proc_file
.open("/proc/self/statm");
930 int total_size_in_pages
= 0;
931 int res_size_in_pages
= 0;
932 proc_file
>> total_size_in_pages
;
933 proc_file
>> res_size_in_pages
;
934 return double(res_size_in_pages
)*MULTIPLIER
; // size in megabytes
937 void Profiler::profileGetXMaskPrediction(const Set
& pred_set
)
939 m_getx_mask_prediction
.add(pred_set
.count());
942 void Profiler::profileGetSMaskPrediction(const Set
& pred_set
)
944 m_gets_mask_prediction
.add(pred_set
.count());
947 void Profiler::profileTrainingMask(const Set
& pred_set
)
949 m_explicit_training_mask
.add(pred_set
.count());
952 // For MemoryControl:
953 void Profiler::profileMemReq(int bank
) {
955 m_memBankCount
[bank
]++;
958 void Profiler::profileMemBankBusy() { m_memBankBusy
++; }
959 void Profiler::profileMemBusBusy() { m_memBusBusy
++; }
960 void Profiler::profileMemReadWriteBusy() { m_memReadWriteBusy
++; }
961 void Profiler::profileMemDataBusBusy() { m_memDataBusBusy
++; }
962 void Profiler::profileMemTfawBusy() { m_memTfawBusy
++; }
963 void Profiler::profileMemRefresh() { m_memRefresh
++; }
964 void Profiler::profileMemRead() { m_memRead
++; }
965 void Profiler::profileMemWrite() { m_memWrite
++; }
966 void Profiler::profileMemWaitCycles(int cycles
) { m_memWaitCycles
+= cycles
; }
967 void Profiler::profileMemInputQ(int cycles
) { m_memInputQ
+= cycles
; }
968 void Profiler::profileMemBankQ(int cycles
) { m_memBankQ
+= cycles
; }
969 void Profiler::profileMemArbWait(int cycles
) { m_memArbWait
+= cycles
; }
970 void Profiler::profileMemRandBusy() { m_memRandBusy
++; }
971 void Profiler::profileMemNotOld() { m_memNotOld
++; }
973 int64
Profiler::getTotalInstructionsExecuted() const
975 int64 sum
= 1; // Starting at 1 allows us to avoid division by zero
976 for(int i
=0; i
< RubyConfig::numberOfProcessors(); i
++) {
977 sum
+= (g_system_ptr
->getDriver()->getInstructionCount(i
) - m_instructions_executed_at_start
[i
]);
982 int64
Profiler::getTotalTransactionsExecuted() const
984 int64 sum
= m_perProcEndTransaction
.sum();
988 return 1; // Avoid division by zero errors
993 // The following case statement converts CacheRequestTypes to GenericRequestTypes
994 // allowing all profiling to be done with a single enum type instead of slow strings
995 GenericRequestType
Profiler::CacheRequestType_to_GenericRequestType(const CacheRequestType
& type
) {
997 case CacheRequestType_LD
:
998 return GenericRequestType_LD
;
1000 case CacheRequestType_ST
:
1001 return GenericRequestType_ST
;
1003 case CacheRequestType_ATOMIC
:
1004 return GenericRequestType_ATOMIC
;
1006 case CacheRequestType_IFETCH
:
1007 return GenericRequestType_IFETCH
;
1009 case CacheRequestType_NULL
:
1010 return GenericRequestType_NULL
;
1013 ERROR_MSG("Unexpected cache request type");