}
void
-HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
+HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx, Tick delay)
{
RQLEntry *queue = regdQList[rl_idx];
- if (!queue->aqlProcessEvent.scheduled() && !queue->getBarrierBit()) {
- Tick processingTick = curTick() + pktProcessDelay;
+ if (!queue->aqlProcessEvent.scheduled()) {
+ Tick processingTick = curTick() + delay;
schedule(queue->aqlProcessEvent, processingTick);
DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n",
processingTick);
}
}
-bool
+void
+HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
+{
+ schedAQLProcessing(rl_idx, pktProcessDelay);
+}
+
+Q_STATE
HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
{
- bool is_submitted = false;
+ Q_STATE is_submitted = BLOCKED_BPKT;
SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);
// Dependency signals are not read yet. And this can only be a retry.
// The retry logic will schedule the packet processor wakeup
if (dep_sgnl_rd_st->pendingReads != 0) {
- return false;
+ return BLOCKED_BPKT;
}
// `pkt` can be typecasted to any type of AQL packet since they all
// have header information at offset zero
auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);
+ if (IS_BARRIER(disp_pkt) &&
+ regdQList[rl_idx]->compltnPending() > 0) {
+ // If this packet is using the "barrier bit" to enforce ordering with
+ // previous packets, and if there are outstanding packets, set the
+ // barrier bit for this queue and block the queue.
+ DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
+ " list ID = %d\n", __FUNCTION__, rl_idx);
+ regdQList[rl_idx]->setBarrierBit(true);
+ return BLOCKED_BBIT;
+ }
if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
" active list ID = %d\n", __FUNCTION__, rl_idx);
// Submit packet to HSA device (dispatcher)
hsa_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
- is_submitted = true;
+ is_submitted = UNBLOCKED;
} else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
" active list ID = %d\n", __FUNCTION__, rl_idx);
// Submit packet to HSA device (dispatcher)
hsa_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
- is_submitted = true;
- /*
- If this packet is using the "barrier bit" to enforce ordering with
- subsequent kernels, set the bit for this queue now, after
- dispatching.
- */
- if (IS_BARRIER(disp_pkt)) {
- DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
- " list ID = %d\n", __FUNCTION__, rl_idx);
- regdQList[rl_idx]->setBarrierBit(true);
- }
+ is_submitted = UNBLOCKED;
} else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
" active list ID = %d\n", __FUNCTION__, rl_idx);
// TODO: Completion signal of barrier packet to be
// atomically decremented here
finishPkt((void*)bar_and_pkt, rl_idx);
- is_submitted = true;
+ is_submitted = UNBLOCKED;
// Reset signal values
dep_sgnl_rd_st->resetSigVals();
// The completion signal is connected
" dispIdx %d, active list ID = %d\n",
__FUNCTION__, aqlRingBuffer->rdIdx(),
aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rqIdx);
+ // If barrier bit is set, then this wakeup is a dummy wakeup
+ // just to model the processing time. Do nothing.
+ if (hsaPP->regdQList[rqIdx]->getBarrierBit()) {
+ DPRINTF(HSAPacketProcessor,
+ "Dummy wakeup with barrier bit for rdIdx %d\n", rqIdx);
+ return;
+ }
// In the future, we may support batch processing of packets.
// Then, we can just remove the break statements and the code
// will support batch processing. That is why we are using a
DPRINTF(HSAPacketProcessor, "%s: Attempting dispatch @ dispIdx[%d]\n",
__FUNCTION__, aqlRingBuffer->dispIdx());
Addr host_addr = aqlRingBuffer->hostDispAddr();
- if (hsaPP->processPkt(pkt, rqIdx, host_addr)) {
+ Q_STATE q_state = hsaPP->processPkt(pkt, rqIdx, host_addr);
+ if (q_state == UNBLOCKED) {
aqlRingBuffer->incDispIdx(1);
DPRINTF(HSAPacketProcessor, "%s: Increment dispIdx[%d]\n",
__FUNCTION__, aqlRingBuffer->dispIdx());
hsaPP->schedAQLProcessing(rqIdx);
}
break;
- } else {
- // This queue is blocked, scheduled a processing event
+ } else if (q_state == BLOCKED_BPKT) {
+ // This queue is blocked by barrier packet,
+ // schedule a processing event
+ hsaPP->schedAQLProcessing(rqIdx);
+ break;
+ } else if (q_state == BLOCKED_BBIT) {
+ // This queue is blocked by barrier bit, and processing event
+ // should be scheduled from finishPkt(). However, to elapse
+ // "pktProcessDelay" processing time, let us schedule a dummy
+ // wakeup once which will just wakeup and will do nothing.
hsaPP->schedAQLProcessing(rqIdx);
break;
+ } else {
+ panic("Unknown queue state\n");
}
}
}
{
HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
- // if barrier bit was set, unset it here -- we assume that finishPkt is
- // only called after the completion of a kernel
- if (regdQList[rl_idx]->getBarrierBit()) {
+ // if barrier bit was set and this is the last
+ // outstanding packet from that queue,
+ // unset it here
+ if (regdQList[rl_idx]->getBarrierBit() &&
+ regdQList[rl_idx]->isLastOutstandingPkt()) {
DPRINTF(HSAPacketProcessor,
"Unset barrier bit for active list ID %d\n", rl_idx);
regdQList[rl_idx]->setBarrierBit(false);
- // if pending kernels in the queue after this kernel, reschedule
- if (regdQList[rl_idx]->dispPending()) {
- DPRINTF(HSAPacketProcessor,
- "Rescheduling active list ID %d after unsetting barrier "
- "bit\n", rl_idx);
- schedAQLProcessing(rl_idx);
- }
+ panic_if(!regdQList[rl_idx]->dispPending(),
+ "There should be pending kernels in this queue\n");
+ DPRINTF(HSAPacketProcessor,
+ "Rescheduling active list ID %d after unsetting barrier "
+ "bit\n", rl_idx);
+ // Try to schedule wakeup in the next cycle. There is a minimum
+ // pktProcessDelay for queue wake up. If that processing delay is
+ // elapsed, schedAQLProcessing will wakeup next tick.
+ schedAQLProcessing(rl_idx, 1);
}
// If set, then blocked schedule, so need to reschedule
// HSA runtime supports only 5 signals per barrier packet
#define NumSignalsPerBarrier 5
+// Ideally, each queue should store this status and
+// the processPkt() should make decisions based on that
+// status variable.
+typedef enum {
+ UNBLOCKED = 0, // Unblocked queue, can submit packets.
+ BLOCKED_BBIT, // Queue blocked by barrier bit.
+ // Can submit packet packets after
+ // previous packet completes.
+ BLOCKED_BPKT, // Queue blocked by barrier packet.
+ // Can submit packet packets after
+ // barrier packet completes.
+} Q_STATE;
+
class HSADevice;
class HWScheduler;
return (_dispIdx < _wrIdx) && packet_type != HSA_PACKET_TYPE_INVALID;
}
+ /**
+ * Packets aren't guaranteed to be completed in-order, and we need
+ * to know when the last packet is finished in order to un-set
+ * the barrier bit. In order to confirm if the packet at _rdIdx
+ * is the last packet, we check if the packets ahead of _rdIdx
+ * are finished. If they are, _rdIdx is the last packet. If not,
+ * there are other outstanding packets.
+ */
+ bool
+ isLastOutstandingPkt() const
+ {
+ for (int i = _rdIdx + 1; i < _dispIdx; i++) {
+ if (!_aqlComplete[i % _aqlBuf.size()]) {
+ return false;
+ }
+ }
+ return !_aqlComplete[_rdIdx % _aqlBuf.size()] && _rdIdx != _dispIdx;
+ }
+
uint32_t nFree() const { return _aqlBuf.size() - (_wrIdx - _rdIdx); }
void *ptr(uint32_t ix) { return _aqlBuf.data() + (ix % _aqlBuf.size()); }
uint32_t numObjs() const { return _aqlBuf.size(); };
void incRdIdx(uint64_t value) { _rdIdx += value; }
void incWrIdx(uint64_t value) { _wrIdx += value; }
void incDispIdx(uint64_t value) { _dispIdx += value; }
-
+ uint64_t compltnPending() { return (_dispIdx - _rdIdx); }
};
typedef struct QueueContext {
: aqlProcessEvent(hsaPP, rqIdx) {}
QCntxt qCntxt;
bool dispPending() { return qCntxt.aqlBuf->dispPending() > 0; }
+ uint64_t compltnPending() { return qCntxt.aqlBuf->compltnPending(); }
SignalState depSignalRdState;
QueueProcessEvent aqlProcessEvent;
void setBarrierBit(bool set_val) { qCntxt.barrierBit = set_val; }
bool getBarrierBit() const { return qCntxt.barrierBit; }
+ bool isLastOutstandingPkt() const
+ {
+ return qCntxt.aqlBuf->isLastOutstandingPkt();
+ }
};
// Keeps track of queueDescriptors of registered queues
std::vector<class RQLEntry *> regdQList;
void dmaWriteVirt(Addr host_addr, unsigned size, Event *event,
void *data, Tick delay = 0);
- bool processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
+ Q_STATE processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
void displayQueueDescriptor(int pid, uint32_t rl_idx);
public:
void finishPkt(void *pkt, uint32_t rl_idx);
void finishPkt(void *pkt) { finishPkt(pkt, 0); }
void schedAQLProcessing(uint32_t rl_idx);
+ void schedAQLProcessing(uint32_t rl_idx, Tick delay);
class DepSignalsReadDmaEvent : public Event
{