2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
33 * Author: Steve Reinhardt
36 #include "gpu-compute/shader.hh"
40 #include "arch/x86/linux/linux.hh"
41 #include "base/chunk_generator.hh"
42 #include "debug/GPUDisp.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/HSAIL.hh"
45 #include "gpu-compute/dispatcher.hh"
46 #include "gpu-compute/gpu_static_inst.hh"
47 #include "gpu-compute/qstruct.hh"
48 #include "gpu-compute/wavefront.hh"
49 #include "mem/packet.hh"
50 #include "mem/ruby/system/RubySystem.hh"
51 #include "sim/sim_exit.hh"
53 Shader::Shader(const Params
*p
) : SimObject(p
),
54 clock(p
->clk_domain
->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
55 cpuPointer(p
->cpu_pointer
), tickEvent(this), timingSim(p
->timing
),
56 hsail_mode(SIMT
), impl_kern_boundary_sync(p
->impl_kern_boundary_sync
),
57 separate_acquire_release(p
->separate_acquire_release
), coissue_return(1),
58 trace_vgpr_all(1), n_cu((p
->CUs
).size()), n_wf(p
->n_wf
),
59 globalMemSize(p
->globalmem
), nextSchedCu(0), sa_n(0), tick_cnt(0),
60 box_tick_cnt(0), start_tick_cnt(0)
65 for (int i
= 0; i
< n_cu
; ++i
) {
66 cuList
[i
] = p
->CUs
[i
];
67 assert(i
== cuList
[i
]->cu_id
);
68 cuList
[i
]->shader
= this;
73 Shader::mmap(int length
)
78 // round up length to the next page
79 length
= roundUp(length
, TheISA::PageBytes
);
81 if (X86Linux64::mmapGrowsDown()) {
82 DPRINTF(HSAIL
, "GROWS DOWN");
83 start
= gpuTc
->getProcessPtr()->mmap_end
-length
;
84 gpuTc
->getProcessPtr()->mmap_end
= start
;
86 DPRINTF(HSAIL
, "GROWS UP");
87 start
= gpuTc
->getProcessPtr()->mmap_end
;
88 gpuTc
->getProcessPtr()->mmap_end
+= length
;
90 // assertion to make sure we don't overwrite the stack (it grows down)
91 assert(gpuTc
->getProcessPtr()->mmap_end
<
92 gpuTc
->getProcessPtr()->stack_base
-
93 gpuTc
->getProcessPtr()->max_stack_size
);
97 DPRINTF(HSAIL
,"Shader::mmap start= %#x, %#x\n", start
, length
);
99 gpuTc
->getProcessPtr()->allocateMem(start
,length
);
107 // grab the threadContext of the thread running on the CPU
109 gpuTc
= cpuPointer
->getContext(0);
115 for (int j
= 0; j
< n_cu
; ++j
)
120 Shader::updateThreadContext(int tid
) {
121 // thread context of the thread which dispatched work
123 gpuTc
= cpuPointer
->getContext(tid
);
128 Shader::hostWakeUp(BaseCPU
*cpu
) {
129 if (cpuPointer
== cpu
) {
130 if (gpuTc
->status() == ThreadContext::Suspended
)
131 cpu
->activateContext(gpuTc
->threadId());
133 //Make sure both dispatcher and shader are trying to
134 //wakeup same host. Hack here to enable kernel launch
136 panic("Dispatcher wants to wakeup a different host");
141 ShaderParams::create()
143 return new Shader(this);
149 tick_cnt
= curTick();
150 box_tick_cnt
= curTick() - start_tick_cnt
;
152 // apply any scheduled adds
153 for (int i
= 0; i
< sa_n
; ++i
) {
154 if (sa_when
[i
] <= tick_cnt
) {
155 *sa_val
[i
] += sa_x
[i
];
156 sa_val
.erase(sa_val
.begin() + i
);
157 sa_x
.erase(sa_x
.begin() + i
);
158 sa_when
.erase(sa_when
.begin() + i
);
164 // clock all of the cu's
165 for (int i
= 0; i
< n_cu
; ++i
)
170 Shader::dispatch_workgroups(NDRange
*ndr
)
172 bool scheduledSomething
= false;
174 int curCu
= nextSchedCu
;
176 while (cuCount
< n_cu
) {
177 //Every time we try a CU, update nextSchedCu
178 nextSchedCu
= (nextSchedCu
+ 1) % n_cu
;
180 // dispatch workgroup iff the following two conditions are met:
181 // (a) wg_rem is true - there are unassigned workgroups in the grid
182 // (b) there are enough free slots in cu cuList[i] for this wg
183 if (ndr
->wg_disp_rem
&& cuList
[curCu
]->ReadyWorkgroup(ndr
)) {
184 scheduledSomething
= true;
185 DPRINTF(GPUDisp
, "Dispatching a workgroup to CU %d\n", curCu
);
187 // ticks() member function translates cycles to simulation ticks.
188 if (!tickEvent
.scheduled()) {
189 schedule(tickEvent
, curTick() + this->ticks(1));
192 cuList
[curCu
]->StartWorkgroup(ndr
);
195 if (ndr
->wgId
[0] * ndr
->q
.wgSize
[0] >= ndr
->q
.gdSize
[0]) {
199 if (ndr
->wgId
[1] * ndr
->q
.wgSize
[1] >= ndr
->q
.gdSize
[1]) {
203 if (ndr
->wgId
[2] * ndr
->q
.wgSize
[2] >= ndr
->q
.gdSize
[2]) {
204 ndr
->wg_disp_rem
= false;
215 return scheduledSomething
;
219 Shader::handshake(GpuDispatcher
*_dispatcher
)
221 dispatcher
= _dispatcher
;
225 Shader::doFunctionalAccess(RequestPtr req
, MemCmd cmd
, void *data
,
226 bool suppress_func_errors
, int cu_id
)
228 unsigned block_size
= RubySystem::getBlockSizeBytes();
229 unsigned size
= req
->getSize();
232 BaseTLB::Mode trans_mode
;
234 if (cmd
== MemCmd::ReadReq
) {
235 trans_mode
= BaseTLB::Read
;
236 } else if (cmd
== MemCmd::WriteReq
) {
237 trans_mode
= BaseTLB::Write
;
239 fatal("unexcepted MemCmd\n");
242 tmp_addr
= req
->getVaddr();
243 Addr split_addr
= roundDown(tmp_addr
+ size
- 1, block_size
);
245 assert(split_addr
<= tmp_addr
|| split_addr
- tmp_addr
< block_size
);
248 if (split_addr
> tmp_addr
) {
249 RequestPtr req1
, req2
;
250 req
->splitOnVaddr(split_addr
, req1
, req2
);
253 PacketPtr pkt1
= new Packet(req2
, cmd
);
254 PacketPtr pkt2
= new Packet(req1
, cmd
);
256 functionalTLBAccess(pkt1
, cu_id
, trans_mode
);
257 functionalTLBAccess(pkt2
, cu_id
, trans_mode
);
259 PacketPtr new_pkt1
= new Packet(pkt1
->req
, cmd
);
260 PacketPtr new_pkt2
= new Packet(pkt2
->req
, cmd
);
262 new_pkt1
->dataStatic(data
);
263 new_pkt2
->dataStatic((uint8_t*)data
+ req1
->getSize());
265 if (suppress_func_errors
) {
266 new_pkt1
->setSuppressFuncError();
267 new_pkt2
->setSuppressFuncError();
270 // fixme: this should be cuList[cu_id] if cu_id != n_cu
271 // The latter requires a memPort in the dispatcher
272 cuList
[0]->memPort
[0]->sendFunctional(new_pkt1
);
273 cuList
[0]->memPort
[0]->sendFunctional(new_pkt2
);
280 PacketPtr pkt
= new Packet(req
, cmd
);
281 functionalTLBAccess(pkt
, cu_id
, trans_mode
);
282 PacketPtr new_pkt
= new Packet(pkt
->req
, cmd
);
283 new_pkt
->dataStatic(data
);
285 if (suppress_func_errors
) {
286 new_pkt
->setSuppressFuncError();
289 // fixme: this should be cuList[cu_id] if cu_id != n_cu
290 // The latter requires a memPort in the dispatcher
291 cuList
[0]->memPort
[0]->sendFunctional(new_pkt
);
301 for (int i_cu
= 0; i_cu
< n_cu
; ++i_cu
) {
302 if (!cuList
[i_cu
]->isDone()) {
311 Shader::ScheduleAdd(uint32_t *val
,Tick when
,int x
)
313 sa_val
.push_back(val
);
314 sa_when
.push_back(tick_cnt
+ when
);
319 Shader::TickEvent::TickEvent(Shader
*_shader
)
320 : Event(CPU_Tick_Pri
), shader(_shader
)
326 Shader::TickEvent::process()
328 if (shader
->busy()) {
330 shader
->schedule(this, curTick() + shader
->ticks(1));
335 Shader::TickEvent::description() const
337 return "Shader tick";
341 Shader::AccessMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
,
342 MemCmd cmd
, bool suppress_func_errors
)
344 uint8_t *data_buf
= (uint8_t*)ptr
;
346 for (ChunkGenerator
gen(address
, size
, RubySystem::getBlockSizeBytes());
347 !gen
.done(); gen
.next()) {
348 Request
*req
= new Request(0, gen
.addr(), gen
.size(), 0,
349 cuList
[0]->masterId(), 0, 0, 0);
351 doFunctionalAccess(req
, cmd
, data_buf
, suppress_func_errors
, cu_id
);
352 data_buf
+= gen
.size();
358 Shader::ReadMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
)
360 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::ReadReq
, false);
364 Shader::ReadMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
,
365 bool suppress_func_errors
)
367 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::ReadReq
, suppress_func_errors
);
371 Shader::WriteMem(uint64_t address
, void *ptr
,uint32_t size
, int cu_id
)
373 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::WriteReq
, false);
377 Shader::WriteMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
,
378 bool suppress_func_errors
)
380 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::WriteReq
,
381 suppress_func_errors
);
385 * Send a packet through the appropriate TLB functional port.
386 * If cu_id=n_cu, then this is the dispatcher's TLB.
387 * Otherwise it's the TLB of the cu_id compute unit.
390 Shader::functionalTLBAccess(PacketPtr pkt
, int cu_id
, BaseTLB::Mode mode
)
392 // update senderState. Need to know the gpuTc and the TLB mode
394 new TheISA::GpuTLB::TranslationState(mode
, gpuTc
, false);
397 dispatcher
->tlbPort
->sendFunctional(pkt
);
399 // even when the perLaneTLB flag is turned on
400 // it's ok tp send all accesses through lane 0
401 // since the lane # is not known here,
402 // This isn't important since these are functional accesses.
403 cuList
[cu_id
]->tlbPort
[0]->sendFunctional(pkt
);
406 /* safe_cast the senderState */
407 TheISA::GpuTLB::TranslationState
*sender_state
=
408 safe_cast
<TheISA::GpuTLB::TranslationState
*>(pkt
->senderState
);
410 delete sender_state
->tlbEntry
;
411 delete pkt
->senderState
;