2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
34 #include "gpu-compute/lds_state.hh"
40 #include "gpu-compute/compute_unit.hh"
41 #include "gpu-compute/gpu_dyn_inst.hh"
42 #include "gpu-compute/shader.hh"
45 * the default constructor that works with SWIG
47 LdsState::LdsState(const Params
*params
) :
48 ClockedObject(params
),
50 cuPort(name() + ".port", this),
51 maximumSize(params
->size
),
53 bankConflictPenalty(params
->bankConflictPenalty
),
56 fatal_if(params
->banks
<= 0,
57 "Number of LDS banks should be positive number");
58 fatal_if((params
->banks
& (params
->banks
- 1)) != 0,
59 "Number of LDS banks should be a power of 2");
60 fatal_if(params
->size
<= 0,
61 "cannot allocate an LDS with a size less than 1");
62 fatal_if(params
->size
% 2,
63 "the LDS should be an even number");
67 * Needed by the SWIG compiler
70 LdsStateParams::create()
72 return new LdsState(this);
76 * set the parent and name based on the parent
79 LdsState::setParent(ComputeUnit
*x_parent
)
81 // check that this gets assigned to the same thing each time
82 fatal_if(!x_parent
, "x_parent should not be nullptr");
83 fatal_if(x_parent
== parent
,
84 "should not be setting the parent twice");
87 _name
= x_parent
->name() + ".LdsState";
91 * derive the gpu mem packet from the packet and then count the bank conflicts
94 LdsState::countBankConflicts(PacketPtr packet
, unsigned *bankAccesses
)
96 Packet::SenderState
*baseSenderState
= packet
->senderState
;
97 while (baseSenderState
->predecessor
) {
98 baseSenderState
= baseSenderState
->predecessor
;
100 const ComputeUnit::LDSPort::SenderState
*senderState
=
101 dynamic_cast<ComputeUnit::LDSPort::SenderState
*>(baseSenderState
);
103 fatal_if(!senderState
,
104 "did not get the right sort of sender state");
106 GPUDynInstPtr gpuDynInst
= senderState
->getMemInst();
108 return countBankConflicts(gpuDynInst
, bankAccesses
);
111 // Count the total number of bank conflicts for the local memory packet
113 LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst
,
114 unsigned *numBankAccesses
)
116 int bank_conflicts
= 0;
117 std::vector
<int> bank
;
118 // the number of LDS banks being touched by the memory instruction
119 int numBanks
= std::min(parent
->wfSize(), banks
);
120 // if the wavefront size is larger than the number of LDS banks, we
121 // need to iterate over all work items to calculate the total
122 // number of bank conflicts
123 int groups
= (parent
->wfSize() > numBanks
) ?
124 (parent
->wfSize() / numBanks
) : 1;
125 for (int i
= 0; i
< groups
; i
++) {
126 // Address Array holding all the work item addresses of an instruction
127 std::vector
<Addr
> addr_array
;
128 addr_array
.resize(numBanks
, 0);
130 bank
.resize(banks
, 0);
133 // populate the address array for all active work items
134 for (int j
= 0; j
< numBanks
; j
++) {
135 if (gpuDynInst
->exec_mask
[(i
*numBanks
)+j
]) {
136 addr_array
[j
] = gpuDynInst
->addr
[(i
*numBanks
)+j
];
138 addr_array
[j
] = std::numeric_limits
<Addr
>::max();
142 if (gpuDynInst
->isLoad() || gpuDynInst
->isStore()) {
143 // mask identical addresses
144 for (int j
= 0; j
< numBanks
; ++j
) {
145 for (int j0
= 0; j0
< j
; j0
++) {
146 if (addr_array
[j
] != std::numeric_limits
<Addr
>::max()
147 && addr_array
[j
] == addr_array
[j0
]) {
148 addr_array
[j
] = std::numeric_limits
<Addr
>::max();
153 // calculate bank conflicts
154 for (int j
= 0; j
< numBanks
; ++j
) {
155 if (addr_array
[j
] != std::numeric_limits
<Addr
>::max()) {
156 int bankId
= addr_array
[j
] % banks
;
158 max_bank
= std::max(max_bank
, bank
[bankId
]);
159 // Count the number of LDS banks accessed.
160 // Since we have masked identical addresses all remaining
161 // accesses will need to be serialized if they access
162 // the same bank (bank conflict).
163 (*numBankAccesses
)++;
166 bank_conflicts
+= max_bank
;
168 panic_if(bank_conflicts
> parent
->wfSize(),
169 "Max bank conflicts should match num of work items per instr");
170 return bank_conflicts
;
174 * receive the packet from the CU
177 LdsState::CuSidePort::recvTimingReq(PacketPtr packet
)
179 return ownerLds
->processPacket(packet
);
183 LdsState::getDynInstr(PacketPtr packet
)
185 ComputeUnit::LDSPort::SenderState
*ss
=
186 dynamic_cast<ComputeUnit::LDSPort::SenderState
*>(
187 packet
->senderState
);
188 return ss
->getMemInst();
192 * process an incoming packet, add it to the return queue
195 LdsState::processPacket(PacketPtr packet
)
197 unsigned bankAccesses
= 0;
198 // the number of conflicts this packet will have when accessing the LDS
199 unsigned bankConflicts
= countBankConflicts(packet
, &bankAccesses
);
200 // count the total number of physical LDS bank accessed
201 parent
->ldsBankAccesses
+= bankAccesses
;
202 // count the LDS bank conflicts. A number set to 1 indicates one
203 // access per bank maximum so there are no bank conflicts
204 parent
->ldsBankConflictDist
.sample(bankConflicts
-1);
206 GPUDynInstPtr dynInst
= getDynInstr(packet
);
207 // account for the LDS bank conflict overhead
208 int busLength
= (dynInst
->isLoad()) ? parent
->loadBusLength() :
209 (dynInst
->isStore()) ? parent
->storeBusLength() :
210 parent
->loadBusLength();
211 // delay for accessing the LDS
212 Tick processingTime
=
213 parent
->cyclesToTicks(Cycles(bankConflicts
* bankConflictPenalty
)) +
214 parent
->cyclesToTicks(Cycles(busLength
));
215 // choose (delay + last packet in queue) or (now + delay) as the time to
217 Tick doneAt
= earliestReturnTime() + processingTime
;
218 // then store it for processing
219 return returnQueuePush(std::make_pair(doneAt
, packet
));
223 * add this to the queue of packets to be returned
226 LdsState::returnQueuePush(std::pair
<Tick
, PacketPtr
> thePair
)
228 // TODO add time limits (e.g. one packet per cycle) and queue size limits
229 // and implement flow control
230 returnQueue
.push(thePair
);
232 // if there is no set wakeup time, look through the queue
233 if (!tickEvent
.scheduled()) {
241 * receive a packet in functional mode
244 LdsState::CuSidePort::recvFunctional(PacketPtr pkt
)
246 fatal("not implemented");
250 * receive a retry for a response
253 LdsState::CuSidePort::recvRespRetry()
255 // TODO verify that this is the right way to do this
256 assert(ownerLds
->isRetryResp());
257 ownerLds
->setRetryResp(false);
265 LdsState::CuSidePort::recvRetry()
267 fatal("not implemented");
271 * look for packets to return at this time
276 Tick now
= clockEdge();
278 // send back completed packets
279 while (!returnQueue
.empty() && returnQueue
.front().first
<= now
) {
280 PacketPtr packet
= returnQueue
.front().second
;
282 ComputeUnit::LDSPort::SenderState
*ss
=
283 dynamic_cast<ComputeUnit::LDSPort::SenderState
*>(
284 packet
->senderState
);
286 GPUDynInstPtr gpuDynInst
= ss
->getMemInst();
288 gpuDynInst
->initiateAcc(gpuDynInst
);
290 packet
->makeTimingResponse();
294 bool success
= cuPort
.sendTimingResp(packet
);
298 panic("have not handled timing responses being NACK'd when sent"
303 // determine the next wakeup time
304 if (!returnQueue
.empty()) {
306 Tick next
= returnQueue
.front().first
;
308 if (tickEvent
.scheduled()) {
310 if (next
< tickEvent
.when()) {
312 tickEvent
.deschedule();
313 tickEvent
.schedule(next
);
316 tickEvent
.schedule(next
);
324 * wake up at this time and perform specified actions
327 LdsState::TickEvent::process()