8e7ba9ad589bc71d2359de4f430fc4c09ab7ba0f
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
33 * Authors: Steve Reinhardt
36 #include "gpu-compute/shader.hh"
40 #include "arch/x86/linux/linux.hh"
41 #include "base/chunk_generator.hh"
42 #include "debug/GPUDisp.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/HSAIL.hh"
45 #include "gpu-compute/dispatcher.hh"
46 #include "gpu-compute/gpu_static_inst.hh"
47 #include "gpu-compute/qstruct.hh"
48 #include "gpu-compute/wavefront.hh"
49 #include "mem/packet.hh"
50 #include "mem/ruby/system/RubySystem.hh"
51 #include "sim/sim_exit.hh"
// Constructor: wires the Shader up to its params — CPU pointer, CU list,
// wavefront count, and scheduling state. NOTE(review): this chunk is
// missing interior lines (e.g. the body opening and any container
// resizing between the init list and the loop) — do not edit blind.
53 Shader::Shader(const Params
*p
)
54 : ClockedObject(p
// clock is cached from the clock domain's period at construction time.
), clock(p
->clk_domain
->clockPeriod()),
55 cpuThread(nullptr), gpuTc(nullptr), cpuPointer(p
->cpu_pointer
),
// tickEvent runs processTick() at CPU-tick priority each scheduled cycle.
56 tickEvent([this]{ processTick(); }, "Shader tick",
57 false, Event::CPU_Tick_Pri
),
58 timingSim(p
->timing
), hsail_mode(SIMT
),
59 impl_kern_boundary_sync(p
->impl_kern_boundary_sync
),
60 separate_acquire_release(p
->separate_acquire_release
), coissue_return(1),
// n_cu derived from the size of the CUs vector param.
61 trace_vgpr_all(1), n_cu((p
->CUs
).size()), n_wf(p
->n_wf
),
62 globalMemSize(p
->globalmem
), nextSchedCu(0), sa_n(0), tick_cnt(0),
63 box_tick_cnt(0), start_tick_cnt(0)
// Register each CU with this shader; the assert checks param ordering
// matches each CU's cu_id.
68 for (int i
= 0; i
< n_cu
; ++i
) {
69 cuList
[i
] = p
->CUs
[i
];
70 assert(i
== cuList
[i
]->cu_id
);
71 cuList
[i
]->shader
= this;
// Allocate a page-aligned region in the host process's mmap area and
// return its start address (the declaration of `start` and the return
// statement fall outside this view). Grows the mmap region up or down
// depending on the process's convention.
76 Shader::mmap(int length
)
81 // round up length to the next page
82 length
= roundUp(length
, TheISA::PageBytes
);
// Memory state belongs to the process running on the attached CPU thread.
84 Process
*proc
= gpuTc
->getProcessPtr();
85 auto mem_state
= proc
->memState
;
87 if (proc
->mmapGrowsDown()) {
88 DPRINTF(HSAIL
, "GROWS DOWN");
// New region sits just below the current mmap end.
89 start
= mem_state
->getMmapEnd() - length
;
90 mem_state
->setMmapEnd(start
);
// (else branch — the `else` keyword itself is missing from this view)
92 DPRINTF(HSAIL
, "GROWS UP");
93 start
= mem_state
->getMmapEnd();
94 mem_state
->setMmapEnd(start
+ length
);
96 // assertion to make sure we don't overwrite the stack (it grows down)
97 assert(mem_state
->getStackBase() - mem_state
->getMaxStackSize() >
98 mem_state
->getMmapEnd());
101 DPRINTF(HSAIL
,"Shader::mmap start= %#x, %#x\n", start
, length
);
// Back the new virtual range with physical memory.
103 proc
->allocateMem(start
, length
);
// Fragment of an init-style routine (signature not visible in this view):
// binds gpuTc to hardware context 0 of the attached CPU, then iterates
// over the CUs (loop body not visible here).
111 // grab the threadContext of the thread running on the CPU
113 gpuTc
= cpuPointer
->getContext(0);
119 for (int j
= 0; j
< n_cu
; ++j
)
// Rebind gpuTc to the CPU hardware context `cid` — the context of the
// thread that dispatched the current work.
124 Shader::updateContext(int cid
) {
125 // context of the thread which dispatched work
127 gpuTc
= cpuPointer
->getContext(cid
);
// Wake the host CPU this shader is bound to: if `cpu` is our CPU and its
// GPU thread context is suspended, reactivate it. Any other CPU is a
// programming error (panic).
132 Shader::hostWakeUp(BaseCPU
*cpu
) {
133 if (cpuPointer
== cpu
) {
134 if (gpuTc
->status() == ThreadContext::Suspended
)
135 cpu
->activateContext(gpuTc
->threadId());
137 //Make sure both dispatcher and shader are trying to
138 //wakeup same host. Hack here to enable kernel launch
140 panic("Dispatcher wants to wakeup a different host");
// Standard gem5 params factory: constructs the Shader SimObject from its
// generated params struct. NOTE(review): raw `new` is the gem5 SimObject
// ownership convention, not a leak.
145 ShaderParams::create()
147 return new Shader(this);
// Fragment of the shader's per-cycle exec routine (signature not visible):
// snapshots tick counters, applies any ScheduleAdd deferred additions that
// have come due, then clocks each CU (loop body not visible here).
153 tick_cnt
= curTick();
154 box_tick_cnt
= curTick() - start_tick_cnt
;
156 // apply any scheduled adds
// NOTE(review): erasing from all three vectors while iterating with `i`
// skips the element that shifts into slot i — confirm the loop also
// adjusts `i`/`sa_n` in the lines missing from this view.
157 for (int i
= 0; i
< sa_n
; ++i
) {
158 if (sa_when
[i
] <= tick_cnt
) {
159 *sa_val
[i
] += sa_x
[i
];
160 sa_val
.erase(sa_val
.begin() + i
);
161 sa_x
.erase(sa_x
.begin() + i
);
162 sa_when
.erase(sa_when
.begin() + i
);
168 // clock all of the cu's
169 for (int i
= 0; i
< n_cu
; ++i
)
// Round-robin dispatch of the NDRange's remaining workgroups across CUs,
// starting at nextSchedCu. Returns true if at least one workgroup was
// scheduled this call. Workgroup-ID advance logic between the dispatch
// and the exhaustion checks is partially missing from this view.
174 Shader::dispatch_workgroups(NDRange
*ndr
)
176 bool scheduledSomething
= false;
178 int curCu
= nextSchedCu
;
180 while (cuCount
< n_cu
) {
181 //Every time we try a CU, update nextSchedCu
182 nextSchedCu
= (nextSchedCu
+ 1) % n_cu
;
184 // dispatch workgroup iff the following two conditions are met:
185 // (a) wg_rem is true - there are unassigned workgroups in the grid
186 // (b) there are enough free slots in cu cuList[i] for this wg
187 if (ndr
->wg_disp_rem
&& cuList
[curCu
]->ReadyWorkgroup(ndr
)) {
188 scheduledSomething
= true;
189 DPRINTF(GPUDisp
, "Dispatching a workgroup to CU %d\n", curCu
);
191 // ticks() member function translates cycles to simulation ticks.
// Kick the shader's tick event if it isn't already pending, so the
// newly dispatched work starts executing.
192 if (!tickEvent
.scheduled()) {
193 schedule(tickEvent
, curTick() + this->ticks(1));
196 cuList
[curCu
]->StartWorkgroup(ndr
);
// Nested exhaustion checks: when the workgroup index in each dimension
// has covered the grid size, mark the NDRange fully dispatched.
199 if (ndr
->wgId
[0] * ndr
->q
.wgSize
[0] >= ndr
->q
.gdSize
[0]) {
203 if (ndr
->wgId
[1] * ndr
->q
.wgSize
[1] >= ndr
->q
.gdSize
[1]) {
207 if (ndr
->wgId
[2] * ndr
->q
.wgSize
[2] >= ndr
->q
.gdSize
[2]) {
208 ndr
->wg_disp_rem
= false;
219 return scheduledSomething
;
// Store the back-pointer to the GPU dispatcher so the shader can route
// TLB/wakeup traffic through it later.
223 Shader::handshake(GpuDispatcher
*_dispatcher
)
225 dispatcher
= _dispatcher
;
// Perform a functional (non-timing) memory access on behalf of CU
// `cu_id`, translating via the functional TLB path. Accesses that cross
// a cache-line boundary are split into two requests/packets.
229 Shader::doFunctionalAccess(RequestPtr req
, MemCmd cmd
, void *data
,
230 bool suppress_func_errors
, int cu_id
)
232 int block_size
= cuList
.at(cu_id
)->cacheLineSize();
233 unsigned size
= req
->getSize();
// Map the memory command onto a TLB access mode; only plain reads and
// writes are supported here.
236 BaseTLB::Mode trans_mode
;
238 if (cmd
== MemCmd::ReadReq
) {
239 trans_mode
= BaseTLB::Read
;
240 } else if (cmd
== MemCmd::WriteReq
) {
241 trans_mode
= BaseTLB::Write
;
// NOTE(review): "unexcepted" is a typo for "unexpected" in this runtime
// fatal() message — left as-is here (string is program output).
243 fatal("unexcepted MemCmd\n");
246 tmp_addr
= req
->getVaddr();
// split_addr is the line-aligned address of the access's LAST byte; if it
// is above the start address the access straddles a line boundary.
247 Addr split_addr
= roundDown(tmp_addr
+ size
- 1, block_size
);
249 assert(split_addr
<= tmp_addr
|| split_addr
- tmp_addr
< block_size
);
252 if (split_addr
> tmp_addr
) {
// req1 = lower portion, req2 = upper portion per splitOnVaddr.
253 RequestPtr req1
, req2
;
254 req
->splitOnVaddr(split_addr
, req1
, req2
);
// NOTE(review): pkt1 wraps req2 and pkt2 wraps req1, yet new_pkt1 (built
// from pkt1->req) gets the start of `data` and new_pkt2 gets
// data + req1->getSize(). That pairs the upper-half request with the
// lower-half buffer — looks inverted; verify against splitOnVaddr
// semantics before changing.
257 PacketPtr pkt1
= new Packet(req2
, cmd
);
258 PacketPtr pkt2
= new Packet(req1
, cmd
);
260 functionalTLBAccess(pkt1
, cu_id
, trans_mode
);
261 functionalTLBAccess(pkt2
, cu_id
, trans_mode
);
// Rebuild packets from the (now translated) requests for the actual
// functional memory access.
263 PacketPtr new_pkt1
= new Packet(pkt1
->req
, cmd
);
264 PacketPtr new_pkt2
= new Packet(pkt2
->req
, cmd
);
266 new_pkt1
->dataStatic(data
);
267 new_pkt2
->dataStatic((uint8_t*)data
+ req1
->getSize());
269 if (suppress_func_errors
) {
270 new_pkt1
->setSuppressFuncError();
271 new_pkt2
->setSuppressFuncError();
274 // fixme: this should be cuList[cu_id] if cu_id != n_cu
275 // The latter requires a memPort in the dispatcher
276 cuList
[0]->memPort
[0]->sendFunctional(new_pkt1
);
277 cuList
[0]->memPort
[0]->sendFunctional(new_pkt2
);
// Non-split path: single translate + single functional access.
284 PacketPtr pkt
= new Packet(req
, cmd
);
285 functionalTLBAccess(pkt
, cu_id
, trans_mode
);
286 PacketPtr new_pkt
= new Packet(pkt
->req
, cmd
);
287 new_pkt
->dataStatic(data
);
289 if (suppress_func_errors
) {
290 new_pkt
->setSuppressFuncError();
293 // fixme: this should be cuList[cu_id] if cu_id != n_cu
294 // The latter requires a memPort in the dispatcher
295 cuList
[0]->memPort
[0]->sendFunctional(new_pkt
);
// Fragment (enclosing function signature not visible): scan all CUs and
// branch when any CU is still busy — presumably an all-CUs-done check;
// verify against the full file.
305 for (int i_cu
= 0; i_cu
< n_cu
; ++i_cu
) {
306 if (!cuList
[i_cu
]->isDone()) {
// Schedule a deferred "add x to *val" to be applied once simulated time
// reaches tick_cnt + when (applied by the tick loop). The matching
// sa_x.push_back / sa_n update lines are missing from this view.
315 Shader::ScheduleAdd(uint32_t *val
,Tick when
,int x
)
317 sa_val
.push_back(val
);
318 sa_when
.push_back(tick_cnt
+ when
);
// Tick handler: re-arms tickEvent one shader cycle in the future. The
// surrounding condition/exec call (orig. lines 326-328) is not visible
// in this view.
325 Shader::processTick()
329 schedule(tickEvent
, curTick() + ticks(1));
// Functionally read or write `size` bytes at virtual `address`, chunked
// on cache-line boundaries of CU `cu_id`, advancing the host buffer
// between chunks.
334 Shader::AccessMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
,
335 MemCmd cmd
, bool suppress_func_errors
)
337 uint8_t *data_buf
= (uint8_t*)ptr
;
// ChunkGenerator yields line-aligned (addr, size) pieces of the range.
339 for (ChunkGenerator
gen(address
, size
, cuList
.at(cu_id
)->cacheLineSize());
340 !gen
.done(); gen
.next()) {
// NOTE(review): the Request is raw-new'd and never deleted in the visible
// code — possible leak; confirm ownership in doFunctionalAccess.
341 RequestPtr req
= new Request(0, gen
.addr(), gen
.size(), 0,
342 cuList
[0]->masterId(), 0, 0, 0);
344 doFunctionalAccess(req
, cmd
, data_buf
, suppress_func_errors
, cu_id
);
345 data_buf
+= gen
.size();
// Convenience wrapper: functional read with functional-error suppression
// disabled.
351 Shader::ReadMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
)
353 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::ReadReq
, false);
// Convenience wrapper: functional read with caller-controlled suppression
// of functional-access errors.
357 Shader::ReadMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
,
358 bool suppress_func_errors
)
360 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::ReadReq
, suppress_func_errors
);
// Convenience wrapper: functional write with functional-error suppression
// disabled.
364 Shader::WriteMem(uint64_t address
, void *ptr
,uint32_t size
, int cu_id
)
366 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::WriteReq
, false);
// Convenience wrapper: functional write with caller-controlled suppression
// of functional-access errors.
370 Shader::WriteMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
,
371 bool suppress_func_errors
)
373 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::WriteReq
,
374 suppress_func_errors
);
// (Original header comment, opening /** not visible in this view.)
378 * Send a packet through the appropriate TLB functional port.
379 * If cu_id=n_cu, then this is the dispatcher's TLB.
380 * Otherwise it's the TLB of the cu_id compute unit.
383 Shader::functionalTLBAccess(PacketPtr pkt
, int cu_id
, BaseTLB::Mode mode
)
385 // update senderState. Need to know the gpuTc and the TLB mode
// Attach translation state (mode + thread context); the assignment's
// left-hand side (pkt->senderState =) is among the lines missing here.
387 new TheISA::GpuTLB::TranslationState(mode
, gpuTc
, false);
// Dispatcher path (the guarding cu_id == n_cu test is not visible here).
390 dispatcher
->tlbPort
->sendFunctional(pkt
);
392 // even when the perLaneTLB flag is turned on
393 // it's ok tp send all accesses through lane 0
394 // since the lane # is not known here,
395 // This isn't important since these are functional accesses.
396 cuList
[cu_id
]->tlbPort
[0]->sendFunctional(pkt
);
399 /* safe_cast the senderState */
400 TheISA::GpuTLB::TranslationState
*sender_state
=
401 safe_cast
<TheISA::GpuTLB::TranslationState
*>(pkt
->senderState
);
// Clean up the translation state attached above so the packet can be
// reused/freed by the caller.
403 delete sender_state
->tlbEntry
;
404 delete pkt
->senderState
;