/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */
#include "gpu-compute/shader.hh"

#include "arch/x86/linux/linux.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"
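
// The Shader is the SimObject that owns the GPU's compute units (CUs).
// Construction wires each CU back to this shader and checks that the
// CU list is indexed by cu_id.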
Shader::Shader(const Params *p) : SimObject(p),
    clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
    cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
    hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
    separate_acquire_release(p->separate_acquire_release), coissue_return(1),
    trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
    globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
    box_tick_cnt(0), start_tick_cnt(0)
{
    cuList.resize(n_cu);

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p->CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
    }
}
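
// Reserve a length-byte region in the host process' mmap space (growing
// in whichever direction the process' mmap region grows) and back it
// with memory. Returns the start address of the new region.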
Addr
Shader::mmap(int length)
{
    Addr start;

    // round up length to the next page
    length = roundUp(length, TheISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();

    if (proc->mmapGrowsDown()) {
        DPRINTF(HSAIL, "GROWS DOWN");
        start = proc->mmap_end - length;
        proc->mmap_end = start;
    } else {
        DPRINTF(HSAIL, "GROWS UP");
        start = proc->mmap_end;
        proc->mmap_end += length;

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(proc->mmap_end < proc->stack_base - proc->max_stack_size);
    }

    DPRINTF(HSAIL, "Shader::mmap start= %#x, %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}
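
// Called at SimObject init time: cache the thread context of the CPU
// thread that will launch GPU kernels.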
void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}
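
// Re-grab the thread context when work is dispatched from a different
// CPU hardware thread/context.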
void
Shader::updateContext(int cid) {
    // context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(cid);
    assert(gpuTc);
}

void
Shader::hostWakeUp(BaseCPU *cpu) {
    if (cpuPointer == cpu) {
        if (gpuTc->status() == ThreadContext::Suspended)
            cpu->activateContext(gpuTc->threadId());
    } else {
        // Make sure both the dispatcher and the shader are trying to
        // wake up the same host. Hack here to enable kernel launch
        // from multiple CPUs.
        panic("Dispatcher wants to wakeup a different host");
    }
}

Shader*
ShaderParams::create()
{
    return new Shader(this);
}
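
// Advance the shader by one cycle: retire any scheduled-add updates whose
// time has come (see ScheduleAdd below), then clock every compute unit.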
void
Shader::exec()
{
    tick_cnt = curTick();
    box_tick_cnt = curTick() - start_tick_cnt;

    // apply any scheduled adds
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= tick_cnt) {
            *sa_val[i] += sa_x[i];
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }

    // clock all of the cu's
    for (int i = 0; i < n_cu; ++i)
        cuList[i]->exec();
}
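
// Round-robin over the CUs, starting at nextSchedCu, dispatching one
// workgroup to each CU that can accept one, until the grid is exhausted
// or no CU has free slots. Returns true if anything was dispatched.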
bool
Shader::dispatch_workgroups(NDRange *ndr)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;

    while (cuCount < n_cu) {
        // Every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch workgroup iff the following two conditions are met:
        // (a) wg_disp_rem is true - there are unassigned workgroups in the grid
        // (b) there are enough free slots in cuList[curCu] for this wg
        if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);

            // ticks() member function translates cycles to simulation ticks.
            if (!tickEvent.scheduled()) {
                schedule(tickEvent, curTick() + this->ticks(1));
            }

            cuList[curCu]->StartWorkgroup(ndr);
            ndr->wgId[0]++;
            ndr->globalWgId++;

            // advance the multi-dimensional workgroup id, carrying into the
            // next dimension whenever one dimension is exhausted
            if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
                ndr->wgId[0] = 0;
                ndr->wgId[1]++;

                if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
                    ndr->wgId[1] = 0;
                    ndr->wgId[2]++;

                    if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
                        ndr->wg_disp_rem = false;
                        break;
                    }
                }
            }
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    return scheduledSomething;
}

void
Shader::handshake(GpuDispatcher *_dispatcher)
{
    dispatcher = _dispatcher;
}
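
// Perform a functional (timing-free) memory access on behalf of a CU or
// the dispatcher. The request is translated through the TLB and, if it
// straddles a cache-line boundary, split into two packets. Note the
// fixme below: all packets currently go out through CU 0's memPort.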
void
Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseTLB::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseTLB::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseTLB::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // accesses that cross a cache-line boundary are split in two
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        PacketPtr pkt1 = new Packet(req1, cmd);
        PacketPtr pkt2 = new Packet(req2, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt1);
        cuList[0]->memPort[0]->sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}
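
// The shader is busy as long as any of its CUs still has work in flight.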
bool
Shader::busy()
{
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        if (!cuList[i_cu]->isDone()) {
            return true;
        }
    }

    return false;
}
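
// Register a deferred increment: *val += x once 'when' ticks have elapsed
// (relative to the current tick count). exec() applies these updates.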
void
Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
{
    sa_val.push_back(val);
    sa_when.push_back(tick_cnt + when);
    sa_x.push_back(x);
    ++sa_n;
}
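
// TickEvent drives the shader clock: process() re-runs exec() and
// reschedules itself one shader cycle later for as long as any CU is busy.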
Shader::TickEvent::TickEvent(Shader *_shader)
    : Event(CPU_Tick_Pri), shader(_shader)
{
}

void
Shader::TickEvent::process()
{
    if (shader->busy()) {
        shader->exec();
        shader->schedule(this, curTick() + shader->ticks(1));
    }
}

const char*
Shader::TickEvent::description() const
{
    return "Shader tick";
}
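
// Walk the [address, address + size) range in cache-line-sized chunks and
// issue one functional request per chunk; the ReadMem/WriteMem overloads
// below are thin wrappers that pick the MemCmd.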
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {
        Request *req = new Request(0, gen.addr(), gen.size(), 0,
                                   cuList[0]->masterId(), 0, 0, 0);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
        delete req;
    }
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
              suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}

/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id == n_cu, then this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    if (cu_id == n_cu) {
        dispatcher->tlbPort->sendFunctional(pkt);
    } else {
        // Even when the perLaneTLB flag is turned on, it's fine to send
        // all accesses through lane 0, since the lane # is not known here.
        // This isn't important since these are functional accesses.
        cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
    }

    /* safe_cast the senderState */
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}