/*
 * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Brad Beckmann,
 */

#include "gpu-compute/dispatcher.hh"

#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"

GpuDispatcher *GpuDispatcher::instance = nullptr;

GpuDispatcher::GpuDispatcher(const Params *p)
    : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")),
      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
      shader(p->shader_pointer), driver(p->cl_driver),
      tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
                false, Event::CPU_Tick_Pri)
{
    shader->handshake(this);
    driver->handshake(this);

    ndRange.wg_disp_rem = false;
    ndRange.globalWgId = 0;

    schedule(&tickEvent, 0);

    // translation port for the dispatcher
    tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);

    num_kernelLaunched
        .name(name() + ".num_kernel_launched")
        .desc("number of kernels launched")
        ;
}
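
// Factory method invoked when the corresponding Python SimObject is
// instantiated; the dispatcher is tracked as a process-wide singleton
// via setInstance()/getInstance().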
GpuDispatcher *GpuDispatcherParams::create()
{
    GpuDispatcher *dispatcher = new GpuDispatcher(this);
    GpuDispatcher::setInstance(dispatcher);

    return GpuDispatcher::getInstance();
}

void
GpuDispatcher::serialize(CheckpointOut &cp) const
{
    Tick event_tick = 0;

    if (ndRange.wg_disp_rem)
        fatal("Checkpointing not supported during active workgroup execution");

    if (tickEvent.scheduled())
        event_tick = tickEvent.when();

    SERIALIZE_SCALAR(event_tick);
}

void
GpuDispatcher::unserialize(CheckpointIn &cp)
{
    Tick event_tick;

    if (tickEvent.scheduled())
        deschedule(&tickEvent);

    UNSERIALIZE_SCALAR(event_tick);

    if (event_tick)
        schedule(&tickEvent, event_tick);
}

AddrRangeList
GpuDispatcher::getAddrRanges() const
{
    AddrRangeList ranges;

    DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
            pioAddr, pioSize);

    ranges.push_back(RangeSize(pioAddr, pioSize));

    return ranges;
}
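
// The PIO region exposed to the simulated runtime is laid out as a 64-bit
// launch/status register at offset 0, followed by the fields of the
// HsaQueueEntry (curTask) describing the kernel to be dispatched.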
Tick
GpuDispatcher::read(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;

    DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());

    if (offset < 8) {
        assert(!offset);
        assert(pkt->getSize() == 8);

        // a read of the launch register returns whether a dispatch is active
        uint64_t retval = dispatchActive;
        pkt->setLE(retval);
    } else {
        // the remaining registers mirror the current task descriptor
        offset -= 8;
        assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;

        memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}
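
// A write to the launch register (offset 0) kicks off a new kernel dispatch
// using whatever task description was previously written into curTask;
// writes to higher offsets simply fill in that task descriptor.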
Tick
GpuDispatcher::write(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;

#if TRACING_ON
    uint64_t data_val = 0;

    switch (pkt->getSize()) {
      case 1:
        data_val = pkt->getLE<uint8_t>();
        break;
      case 2:
        data_val = pkt->getLE<uint16_t>();
        break;
      case 4:
        data_val = pkt->getLE<uint32_t>();
        break;
      case 8:
        data_val = pkt->getLE<uint64_t>();
        break;
      default:
        DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
    }

    DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
            pkt->getSize());
#endif

    if (!offset) {
        static int nextId = 0;

        // The depends field of the qstruct, which was previously unused, is
        // used to communicate with the simulated application.
        if (curTask.depends) {
            HostState hs;
            shader->ReadMem((uint64_t)(curTask.depends), &hs,
                            sizeof(HostState), 0);

            // update event start time (in nano-seconds)
            uint64_t start = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
                             &start, sizeof(uint64_t), 0);
        }

        ++num_kernelLaunched;

        NDRange *ndr = &(ndRangeMap[nextId]);
        // copy dispatch info
        ndr->q = curTask;

        // update the numDispTask polled by the runtime
        accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);

        ndr->numWgTotal = 1;

        for (int i = 0; i < 3; ++i) {
            ndr->wgId[i] = 0;
            ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
            ndr->numWgTotal *= ndr->numWg[i];
        }

        ndr->numWgCompleted = 0;
        ndr->globalWgId = 0;
        ndr->wg_disp_rem = true;
        ndr->execDone = false;
        ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
        ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
        ndr->dispatchId = nextId;
        ndr->curCid = pkt->req->contextId();

        DPRINTF(GPUDisp, "launching kernel %d\n", nextId);
        execIds.push(nextId);
        ++nextId;

        dispatchActive = true;

        if (!tickEvent.scheduled()) {
            schedule(&tickEvent, curTick() + shader->ticks(1));
        }
    } else {
        // populate current task struct
        // first 64 bits are launch reg
        offset -= 8;
        assert(offset < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;
        memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}

Port &
GpuDispatcher::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "translation_port") {
        return *tlbPort;
    }

    return DmaDevice::getPort(if_name, idx);
}

void
GpuDispatcher::exec()
{
    int fail_count = 0;

    // There are potentially multiple outstanding kernel launches.
    // It is possible that the workgroups in a different kernel
    // can fit on the GPU even if another kernel's workgroups cannot.
    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());

    while (execIds.size() > fail_count) {
        int execId = execIds.front();

        while (ndRangeMap[execId].wg_disp_rem) {
            // update the thread context
            shader->updateContext(ndRangeMap[execId].curCid);

            // attempt to dispatch workgroups
            if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
                // if we failed, try the next kernel;
                // it may have smaller workgroups.
                // put it back on the queue to retry later
                DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
                execIds.push(execId);
                ++fail_count;
                break;
            }
        }
        // let's try the next kernel_id
        execIds.pop();
    }

    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());

    // wake up the CPU if any kernels completed this cycle
    if (doneIds.size() && cpu) {
        shader->hostWakeUp(cpu);
    }

    while (doneIds.size()) {
        DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
        doneIds.pop();
    }
}
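
// Workgroup-completion bookkeeping: once every workgroup of a kernel has
// completed, the host-visible notification flags and the CL event end
// timestamp are updated so the runtime can observe the finished dispatch.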
void
GpuDispatcher::notifyWgCompl(Wavefront *w)
{
    int kern_id = w->kernId;
    DPRINTF(GPUDisp, "notify WgCompl %d\n", kern_id);
    assert(ndRangeMap[kern_id].dispatchId == kern_id);
    ndRangeMap[kern_id].numWgCompleted++;

    if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
        ndRangeMap[kern_id].execDone = true;
        doneIds.push(kern_id);

        if (ndRangeMap[kern_id].addrToNotify) {
            accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
                          0);
        }

        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);

        // update event end time (in nano-seconds)
        if (ndRangeMap[kern_id].q.depends) {
            HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
            uint64_t event;

            shader->ReadMem((uint64_t)(&host_state->event), &event,
                            sizeof(uint64_t), 0);

            uint64_t end = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
                             sizeof(uint64_t), 0);
        }
    }

    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->ticks(1));
    }
}

void
GpuDispatcher::scheduleDispatch()
{
    if (!tickEvent.scheduled())
        schedule(&tickEvent, curTick() + shader->ticks(1));
}
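
// Read-modify-write a runtime-visible variable in simulated host memory:
// when 'off' is non-zero the current value is read, adjusted by 'off', and
// written back; otherwise 'val' is written directly.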
void
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
{
    if (cpu) {
        if (off) {
            shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
                              true);
            val += off;
        }

        shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
    } else {
        panic("Cannot find host");
    }
}

// helper functions for the driver to retrieve GPU attributes
int
GpuDispatcher::getNumCUs()
{
    return shader->cuList.size();
}

int
GpuDispatcher::wfSize() const
{
    return shader->cuList[0]->wfSize();
}

void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
    shader->funcargs_size = funcargs_size;
}

uint32_t
GpuDispatcher::getStaticContextSize() const
{
    return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
}