This patch changes the way Ruby handles atomic RMW instructions. Unlike the prior implementation, this one is protocol-independent. It works by having the sequencer lock an address immediately after the read portion of an RMW completes. While that address is locked, the coherence controller services requests for it only from a single designated port (e.g., the mandatory queue) and delays all others. After the write portion completes, the line is unlocked. This should also work with multi-line atomics, as long as the lines are always acquired in the same order.
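
In outline (a condensed, illustrative sketch, not the literal code: addr and thisQueue stand in for the generated in_msg address field and the port's message buffer; the real hooks appear in the Sequencer and generated-controller hunks below):

    // Sequencer side: bracket the RMW with a controller lock/unlock.
    if (request->ruby_request.type == RubyRequestType_RMW_Read)
        m_controller->blockOnQueue(address, m_mandatory_q_ptr); // lock line
    else if (request->ruby_request.type == RubyRequestType_RMW_Write)
        m_controller->unblock(address);                         // unlock line

    // Controller side, emitted by SLICC for every peek() carrying block_on:
    if (m_is_blocking && m_block_map.count(addr) == 1 &&
        m_block_map[addr] != &thisQueue) {
        thisQueue.delayHead(); // postpone; only the locked port may proceed
        continue;              // move on to the next message
    }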
// Response IntraChip L1 Network - response msg to this L1 cache
in_port(responseIntraChipL1Network_in, ResponseMsg, responseToL1Cache) {
if (responseIntraChipL1Network_in.isReady()) {
- peek(responseIntraChipL1Network_in, ResponseMsg) {
+ peek(responseIntraChipL1Network_in, ResponseMsg, block_on="Address") {
assert(in_msg.Destination.isElement(machineID));
if(in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE) {
trigger(Event:Data_Exclusive, in_msg.Address);
// Request IntraChip L1 Network - request msg to this L1 cache (e.g., INV from the shared L2)
in_port(requestIntraChipL1Network_in, RequestMsg, requestToL1Cache) {
if(requestIntraChipL1Network_in.isReady()) {
- peek(requestIntraChipL1Network_in, RequestMsg) {
+ peek(requestIntraChipL1Network_in, RequestMsg, block_on="Address") {
assert(in_msg.Destination.isElement(machineID));
if (in_msg.Type == CoherenceRequestType:INV) {
trigger(Event:Inv, in_msg.Address);
// Mandatory Queue between the Node's CPU and its L1 caches
in_port(mandatoryQueue_in, CacheMsg, mandatoryQueue, desc="...") {
if (mandatoryQueue_in.isReady()) {
- peek(mandatoryQueue_in, CacheMsg) {
+ peek(mandatoryQueue_in, CacheMsg, block_on="LineAddress") {
// Check for data accesses to blocks in the I-cache and ifetches to blocks in the D-cache
in_port(forwardRequestNetwork_in, RequestMsg, forwardToCache) {
if (forwardRequestNetwork_in.isReady()) {
- peek(forwardRequestNetwork_in, RequestMsg) {
+ peek(forwardRequestNetwork_in, RequestMsg, block_on="Address") {
if (in_msg.Type == CoherenceRequestType:GETX) {
trigger(Event:Fwd_GETX, in_msg.Address);
}
in_port(responseNetwork_in, ResponseMsg, responseToCache) {
if (responseNetwork_in.isReady()) {
- peek(responseNetwork_in, ResponseMsg) {
+ peek(responseNetwork_in, ResponseMsg, block_on="Address") {
if (in_msg.Type == CoherenceResponseType:DATA) {
trigger(Event:Data, in_msg.Address);
}
// Mandatory Queue
in_port(mandatoryQueue_in, CacheMsg, mandatoryQueue, desc="...") {
if (mandatoryQueue_in.isReady()) {
- peek(mandatoryQueue_in, CacheMsg) {
+ peek(mandatoryQueue_in, CacheMsg, block_on="LineAddress") {
if (cacheMemory.isTagPresent(in_msg.LineAddress) == false &&
// Request Network
in_port(requestNetwork_in, RequestMsg, requestToL1Cache) {
if (requestNetwork_in.isReady()) {
- peek(requestNetwork_in, RequestMsg) {
+ peek(requestNetwork_in, RequestMsg, block_on="Address") {
assert(in_msg.Destination.isElement(machineID));
DEBUG_EXPR("MRM_DEBUG: L1 received");
DEBUG_EXPR(in_msg.Type);
// Response Network
in_port(responseToL1Cache_in, ResponseMsg, responseToL1Cache) {
if (responseToL1Cache_in.isReady()) {
- peek(responseToL1Cache_in, ResponseMsg) {
+ peek(responseToL1Cache_in, ResponseMsg, block_on="Address") {
if (in_msg.Type == CoherenceResponseType:ACK) {
trigger(Event:Ack, in_msg.Address);
} else if (in_msg.Type == CoherenceResponseType:DATA) {
// Mandatory Queue between the Node's CPU and its L1 caches
in_port(mandatoryQueue_in, CacheMsg, mandatoryQueue, desc="...") {
if (mandatoryQueue_in.isReady()) {
- peek(mandatoryQueue_in, CacheMsg) {
+ peek(mandatoryQueue_in, CacheMsg, block_on="LineAddress") {
// Check for data accesses to blocks in the I-cache and ifetches to blocks in the D-cache
(m_prio_heap.peekMin().m_time <= g_eventQueue_ptr->getTime()));
}
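+ // Pop the head message and re-enqueue it with a one-cycle delay; used to
+ // retry a message whose address is currently blocked on another port.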
+ void delayHead() {
+ MessageBufferNode node = m_prio_heap.extractMin();
+ enqueue(node.m_msgptr, 1);
+ }
+
bool areNSlotsAvailable(int n);
int getPriority() { return m_priority_rank; }
void setPriority(int rank) { m_priority_rank = rank; }
# default values
num_cores = 2
-l1_cache_size_kb = 32768
+l1_cache_size_bytes = 32768
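+# note: this size is now specified in bytes (32768 B = 32 KB)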
l1_cache_assoc = 8
l1_cache_latency = 1
num_memories = 2
require protocol+".rb"
num_cores.times { |n|
- icache = SetAssociativeCache.new("l1i_"+n.to_s, l1_icache_size_kb, l1_icache_latency, l1_icache_assoc, "PSEUDO_LRU")
- dcache = SetAssociativeCache.new("l1d_"+n.to_s, l1_dcache_size_kb, l1_dcache_latency, l1_dcache_assoc, "PSEUDO_LRU")
+ icache = SetAssociativeCache.new("l1i_"+n.to_s, l1_icache_size_kb*1024, l1_icache_latency, l1_icache_assoc, "PSEUDO_LRU")
+ dcache = SetAssociativeCache.new("l1d_"+n.to_s, l1_dcache_size_kb*1024, l1_dcache_latency, l1_dcache_assoc, "PSEUDO_LRU")
sequencer = Sequencer.new("Sequencer_"+n.to_s, icache, dcache)
iface_ports << sequencer
if protocol == "MOESI_CMP_directory"
end
}
num_l2_banks.times { |n|
- cache = SetAssociativeCache.new("l2u_"+n.to_s, l2_cache_size_kb/num_l2_banks, l2_cache_latency, l2_cache_assoc, "PSEUDO_LRU")
+ cache = SetAssociativeCache.new("l2u_"+n.to_s, (l2_cache_size_kb*1024)/num_l2_banks, l2_cache_latency, l2_cache_assoc, "PSEUDO_LRU")
if protocol == "MOESI_CMP_directory"
net_ports << MOESI_CMP_directory_L2CacheController.new("L2CacheController_"+n.to_s,
"L2Cache",
end
class Cache < LibRubyObject
- param :size_kb, Integer
+ param :size, Integer
param :latency, Integer
param :controller, NetPort
- def initialize(obj_name, size_kb, latency)
+ def initialize(obj_name, size, latency)
super(obj_name)
- self.size_kb = size_kb
+ self.size = size
self.latency = latency
# controller must be set manually by the configuration script
# because there is a cyclic dependence
# when an integer, it represents the number of cycles for a hit
# when a float, it represents the cache access time in ns
# when set to "auto", libruby will attempt to find a realistic latency by running CACTI
- def initialize(obj_name, size_kb, latency, assoc, replacement_policy)
- super(obj_name, size_kb, latency)
+ def initialize(obj_name, size, latency, assoc, replacement_policy)
+ super(obj_name, size, latency)
self.assoc = assoc
self.replacement_policy = replacement_policy
end
def calculateLatency()
if self.latency == "auto"
cacti_args = Array.new()
- cacti_args << (self.size_kb*1024) << RubySystem.block_size_bytes << self.assoc
+ cacti_args << self.size << RubySystem.block_size_bytes << self.assoc
cacti_args << 1 << 0 << 0 << 0 << 1
cacti_args << RubySystem.tech_nm << RubySystem.block_size_bytes*8
cacti_args << 0 << 0 << 0 << 1 << 0 << 0 << 0 << 0 << 1
ostream& operator<<(ostream& out, const RubyRequestType& obj)
{
- cerr << "in op" << endl;
out << RubyRequestType_to_string(obj);
- cerr << "flushing" << endl;
out << flush;
- cerr << "done" << endl;
return out;
}
virtual const string toString() const = 0; // returns text version of controller type
virtual const string getName() const = 0; // return instance name
virtual const MachineType getMachineType() const = 0;
- virtual void set_atomic(Address addr) = 0;
- virtual void clear_atomic(Address addr) = 0;
- virtual void reset_atomics() = 0;
+ virtual void blockOnQueue(Address, MessageBuffer*) = 0;
+ virtual void unblock(Address) = 0;
virtual void print(ostream & out) const = 0;
virtual void printStats(ostream & out) const = 0;
m_instCache_ptr = NULL;
m_dataCache_ptr = NULL;
m_controller = NULL;
- m_atomic_reads = 0;
- m_atomic_writes = 0;
for (size_t i=0; i<argv.size(); i+=2) {
if ( argv[i] == "controller") {
m_controller = RubySystem::getController(argv[i+1]); // args[i] = "L1Cache"
(request->ruby_request.type == RubyRequestType_RMW_Write) ||
(request->ruby_request.type == RubyRequestType_Locked_Read) ||
(request->ruby_request.type == RubyRequestType_Locked_Write));
- // POLINA: the assumption is that atomics are only on data cache and not instruction cache
+
if (request->ruby_request.type == RubyRequestType_Locked_Read) {
m_dataCache_ptr->setLocked(address, m_version);
}
else if (request->ruby_request.type == RubyRequestType_RMW_Read) {
- m_controller->set_atomic(address);
+ m_controller->blockOnQueue(address, m_mandatory_q_ptr);
}
else if (request->ruby_request.type == RubyRequestType_RMW_Write) {
- m_controller->clear_atomic(address);
+ m_controller->unblock(address);
}
hitCallback(request, data);
CacheRequestType ctype;
switch(request.type) {
case RubyRequestType_IFETCH:
- if (m_atomic_reads > 0 && m_atomic_writes == 0) {
- m_controller->reset_atomics();
- m_atomic_writes = 0;
- m_atomic_reads = 0;
- }
- else if (m_atomic_writes > 0) {
- assert(m_atomic_reads > m_atomic_writes);
- cerr << "WARNING: Expected: " << m_atomic_reads << " RMW_Writes, but only received: " << m_atomic_writes << endl;
- assert(false);
- }
ctype = CacheRequestType_IFETCH;
break;
case RubyRequestType_LD:
- if (m_atomic_reads > 0 && m_atomic_writes == 0) {
- m_controller->reset_atomics();
- m_atomic_writes = 0;
- m_atomic_reads = 0;
- }
- else if (m_atomic_writes > 0) {
- assert(m_atomic_reads > m_atomic_writes);
- cerr << "WARNING: Expected: " << m_atomic_reads << " RMW_Writes, but only received: " << m_atomic_writes << endl;
- assert(false);
- }
ctype = CacheRequestType_LD;
break;
case RubyRequestType_ST:
- if (m_atomic_reads > 0 && m_atomic_writes == 0) {
- m_controller->reset_atomics();
- m_atomic_writes = 0;
- m_atomic_reads = 0;
- }
- else if (m_atomic_writes > 0) {
- assert(m_atomic_reads > m_atomic_writes);
- cerr << "WARNING: Expected: " << m_atomic_reads << " RMW_Writes, but only received: " << m_atomic_writes << endl;
- assert(false);
- }
ctype = CacheRequestType_ST;
break;
case RubyRequestType_Locked_Read:
ctype = CacheRequestType_ATOMIC;
break;
case RubyRequestType_RMW_Read:
- assert(m_atomic_writes == 0);
- m_atomic_reads++;
ctype = CacheRequestType_ATOMIC;
break;
case RubyRequestType_RMW_Write:
- assert(m_atomic_reads > 0);
- assert(m_atomic_writes < m_atomic_reads);
- m_atomic_writes++;
- if (m_atomic_reads == m_atomic_writes) {
- m_atomic_reads = 0;
- m_atomic_writes = 0;
- }
ctype = CacheRequestType_ATOMIC;
break;
default:
// Global outstanding request count, across all request tables
int m_outstanding_count;
bool m_deadlock_check_scheduled;
- int m_atomic_reads;
- int m_atomic_writes;
int m_store_waiting_on_load_cycles;
int m_store_waiting_on_store_cycles;
from slicc.symbols import Var
class PeekStatementAST(StatementAST):
- def __init__(self, slicc, queue_name, type_ast, statements, method):
- super(PeekStatementAST, self).__init__(slicc)
+ def __init__(self, slicc, queue_name, type_ast, pairs, statements, method):
+ super(PeekStatementAST, self).__init__(slicc, pairs)
self.queue_name = queue_name
self.type_ast = type_ast
in_msg_ptr = dynamic_cast<const $mtid *>(($qcode).${{self.method}}());
assert(in_msg_ptr != NULL);
''')
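+ # If the port declared block_on="<field>", emit a guard that delays
+ # messages whose address is currently locked to a different port.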
+ if self.pairs.has_key("block_on"):
+ address_field = self.pairs['block_on']
+ code('''
+ if ( (m_is_blocking == true) &&
+ (m_block_map.count(in_msg_ptr->m_$address_field) == 1) ) {
+ if (m_block_map[in_msg_ptr->m_$address_field] != &$qcode) {
+ $qcode.delayHead();
+ continue;
+ }
+ }
+ ''')
# The other statements
self.statements.generate(code, return_type)
p[0] = ast.EnqueueStatementAST(self, p[3], p[5], p[6], p[8])
def p_statement__peek(self, p):
- "statement : PEEK '(' var ',' type ')' statements"
- p[0] = ast.PeekStatementAST(self, p[3], p[5], p[7], "peek")
+ "statement : PEEK '(' var ',' type pairs ')' statements"
+ p[0] = ast.PeekStatementAST(self, p[3], p[5], p[6], p[8], "peek")
def p_statement__copy_head(self, p):
"statement : COPY_HEAD '(' var ',' var pairs ')' SEMI"
void print(ostream& out) const;
void printConfig(ostream& out) const;
void wakeup();
- void set_atomic(Address addr);
- void clear_atomic(Address addr);
- void reset_atomics();
void printStats(ostream& out) const { s_profiler.dumpStats(out); }
void clearStats() { s_profiler.clearStats(); }
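+ // Lock addr so that only messages arriving on the given port are
+ // serviced for it; unblock(addr) releases the lock.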
+ void blockOnQueue(Address addr, MessageBuffer* port);
+ void unblock(Address addr);
private:
''')
for param in self.config_parameters:
code('int m_${{param.ident}};')
- if self.ident == "L1Cache":
- code('''
-int servicing_atomic;
-Address locked_read_request1;
-Address locked_read_request2;
-Address locked_read_request3;
-Address locked_read_request4;
-int read_counter;
-''')
-
code('''
int m_number_of_TBEs;
NodeID m_version;
Network* m_net_ptr;
MachineID m_machineID;
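+// Per-address lock state: while m_is_blocking is set, messages for a
+// locked address are serviced only from the buffer in m_block_map.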
+bool m_is_blocking;
+map< Address, MessageBuffer* > m_block_map;
${ident}_Profiler s_profiler;
static int m_num_controllers;
// Internal functions
{
''')
code.indent()
- if self.ident == "L1Cache":
- code('''
-servicing_atomic = 0;
-locked_read_request1 = Address(-1);
-locked_read_request2 = Address(-1);
-locked_read_request3 = Address(-1);
-locked_read_request4 = Address(-1);
-read_counter = 0;
-''')
code('m_num_controllers++;')
for var in self.objects:
return MachineType_${ident};
}
+void $c_ident::blockOnQueue(Address addr, MessageBuffer* port) {
+ m_is_blocking = true;
+ m_block_map[addr] = port;
+}
+void $c_ident::unblock(Address addr) {
+ m_block_map.erase(addr);
+ if (m_block_map.size() == 0) {
+ m_is_blocking = false;
+ }
+}
+
void $c_ident::print(ostream& out) const { out << "[$c_ident " << m_version << "]"; }
void $c_ident::printConfig(ostream& out) const {
# InPorts
#
- # Find the position of the mandatory queue in the vector so
- # that we can print it out first
-
- mandatory_q = None
- if self.ident == "L1Cache":
- for i,port in enumerate(self.in_ports):
- assert "c_code_in_port" in port
- if str(port).find("mandatoryQueue_in") >= 0:
- assert mandatory_q is None
- mandatory_q = port
-
- assert mandatory_q is not None
-
- # print out the mandatory queue here
- port = mandatory_q
- code('// ${ident}InPort $port')
- output = port["c_code_in_port"]
-
- code('$output')
-
for port in self.in_ports:
- # don't print out mandatory queue twice
- if port == mandatory_q:
- continue
-
- if ident == "L1Cache":
- if (str(port).find("forwardRequestNetwork_in") >= 0 or str(port).find("requestNetwork_in") >= 0 or str(port).find("requestIntraChipL1Network_in") >= 0):
- code('''
-bool postpone = false;
-if ((((*m_L1Cache_forwardToCache_ptr)).isReady())) {
- const RequestMsg* in_msg_ptr;
- in_msg_ptr = dynamic_cast<const RequestMsg*>(((*m_L1Cache_forwardToCache_ptr)).peek());
- if ((((servicing_atomic > 0) && (locked_read_request1 == ((*in_msg_ptr)).m_Address || locked_read_request2 == ((*in_msg_ptr)).m_Address || locked_read_request3 == ((*in_msg_ptr)).m_Address || locked_read_request1 == ((*in_msg_ptr)).m_Address)))) {
- postpone = true;
- }
-}
-if (!postpone) {
-''')
code.indent()
code('// ${ident}InPort $port')
code('${{port["c_code_in_port"]}}')
code.dedent()
- if ident == "L1Cache":
- if (str(port).find("forwardRequestNetwork_in") >= 0 or str(port).find("requestNetwork_in") >= 0 or str(port).find("requestIntraChipL1Network_in") >= 0):
- code.dedent()
- code('}')
- code.indent()
code('')
code.dedent()
}
''')
- if self.ident == "L1Cache":
- code('''
-void ${ident}_Controller::set_atomic(Address addr)
-{
- servicing_atomic++;
- switch (servicing_atomic) {
- case(1):
- assert(locked_read_request1 == Address(-1));
- locked_read_request1 = addr;
- break;
- case(2):
- assert(locked_read_request2 == Address(-1));
- locked_read_request2 = addr;
- break;
- case(3):
- assert(locked_read_request3 == Address(-1));
- locked_read_request3 = addr;
- break;
- case(4):
- assert(locked_read_request4 == Address(-1));
- locked_read_request4 = addr;
- break;
- default:
- assert(0);
-
- }
-}
-
-void ${ident}_Controller::clear_atomic(Address addr)
-{
-
- assert(servicing_atomic > 0);
- if (addr == locked_read_request1)
- locked_read_request1 = Address(-1);
- else if (addr == locked_read_request2)
- locked_read_request2 = Address(-1);
- else if (addr == locked_read_request3)
- locked_read_request3 = Address(-1);
- else if (addr == locked_read_request4)
- locked_read_request4 = Address(-1);
- else
- assert(0);
- servicing_atomic--;
-
-}
-
-void ${ident}_Controller::reset_atomics()
-{
-
- servicing_atomic = 0;
- locked_read_request1 = Address(-1);
- locked_read_request2 = Address(-1);
- locked_read_request3 = Address(-1);
- locked_read_request4 = Address(-1);
-
-}
-
-''')
- else:
- code('''
-void ${ident}_Controller::reset_atomics()
-{
- assert(0);
-}
-
-void ${ident}_Controller::set_atomic(Address addr)
-{
- assert(0);
-}
-
-void ${ident}_Controller::clear_atomic(Address addr)
-{
- assert(0);
-}
-''')
-
-
code.write(path, "%s_Wakeup.cc" % self.ident)
def printCSwitch(self, path):