dev: add support for HSA's barrier bit kernel synchronization
author Matt Sinclair <Matthew.Sinclair@amd.com>
Thu, 24 May 2018 18:02:13 +0000 (14:02 -0400)
committer Anthony Gutierrez <anthony.gutierrez@amd.com>
Mon, 22 Jun 2020 16:14:35 +0000 (16:14 +0000)
This commit adds support for HSA's barrier-bit form of kernel
synchronization.  This method of synchronization is used by all
HIP benchmarks, and is thus necessary to ensure that multiple
kernels from the same queue synchronize properly.

Change-Id: I64f2d311a3970b71194e0555e2b932800df65e98
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29925
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
src/dev/hsa/hsa_packet_processor.cc
src/dev/hsa/hsa_packet_processor.hh

index f9880e40ecef2ce27ca9ae639c432007ad3cdf77..41430191ea976a85b66933c48b8cda6dae7346c1 100644 (file)
 #define PKT_TYPE(PKT) ((hsa_packet_type_t)(((PKT->header) >> \
             HSA_PACKET_HEADER_TYPE) & (HSA_PACKET_HEADER_WIDTH_TYPE - 1)))
 
+// checks if the barrier bit is set in the header -- shift the barrier bit
+// to LSB, then bitwise "and" to mask off all other bits
+#define IS_BARRIER(PKT) ((hsa_packet_header_t)(((PKT->header) >> \
+            HSA_PACKET_HEADER_BARRIER) & HSA_PACKET_HEADER_WIDTH_BARRIER))
+
 HSAPP_EVENT_DESCRIPTION_GENERATOR(UpdateReadDispIdDmaEvent)
 HSAPP_EVENT_DESCRIPTION_GENERATOR(CmdQueueCmdDmaEvent)
 HSAPP_EVENT_DESCRIPTION_GENERATOR(QueueProcessEvent)
@@ -280,7 +285,7 @@ void
 HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
 {
     RQLEntry *queue = regdQList[rl_idx];
-    if (!queue->aqlProcessEvent.scheduled()) {
+    if (!queue->aqlProcessEvent.scheduled() && !queue->getBarrierBit()) {
         Tick processingTick = curTick() + pktProcessDelay;
         schedule(queue->aqlProcessEvent, processingTick);
         DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n",
@@ -316,6 +321,16 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
         // Submit packet to HSA device (dispatcher)
         hsa_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
         is_submitted = true;
+        /*
+          If this packet is using the "barrier bit" to enforce ordering with
+          subsequent kernels, set the bit for this queue now, after
+          dispatching.
+        */
+        if (IS_BARRIER(disp_pkt)) {
+            DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
+                    " list ID = %d\n", __FUNCTION__, rl_idx);
+            regdQList[rl_idx]->setBarrierBit(true);
+        }
     } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
         DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
                 " active list ID = %d\n", __FUNCTION__, rl_idx);
@@ -631,6 +646,23 @@ void
 HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
 {
     HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
+
+    // if barrier bit was set, unset it here -- we assume that finishPkt is
+    // only called after the completion of a kernel
+    if (regdQList[rl_idx]->getBarrierBit()) {
+        DPRINTF(HSAPacketProcessor,
+                "Unset barrier bit for active list ID %d\n", rl_idx);
+        regdQList[rl_idx]->setBarrierBit(false);
+        // if pending kernels in the queue after this kernel, reschedule
+        if (regdQList[rl_idx]->dispPending()) {
+            DPRINTF(HSAPacketProcessor,
+                    "Rescheduling active list ID %d after unsetting barrier "
+                    "bit\n", rl_idx);
+            schedAQLProcessing(rl_idx);
+        }
+    }
+
+    // If set, then blocked schedule, so need to reschedule
     if (regdQList[rl_idx]->qCntxt.aqlBuf->freeEntry(pvPkt))
         updateReadIndex(0, rl_idx);
     DPRINTF(HSAPacketProcessor,
index 206d9ab8485ddd5d89e509304780d55b77bfbd0d..3ff7ad27f3a5a42a9ece74f9aa60a1098e5b3476 100644 (file)
@@ -168,11 +168,13 @@ class AQLRingBuffer
 typedef struct QueueContext {
     HSAQueueDescriptor* qDesc;
     AQLRingBuffer* aqlBuf;
+    // used for HSA packets that enforce synchronization with barrier bit
+    bool barrierBit;
     QueueContext(HSAQueueDescriptor* q_desc,
                  AQLRingBuffer* aql_buf)
-                 : qDesc(q_desc), aqlBuf(aql_buf)
+                 : qDesc(q_desc), aqlBuf(aql_buf), barrierBit(false)
     {}
-    QueueContext() : qDesc(NULL), aqlBuf(NULL) {}
+    QueueContext() : qDesc(NULL), aqlBuf(NULL), barrierBit(false) {}
 } QCntxt;
 
 class HSAPacketProcessor: public DmaDevice
@@ -233,6 +235,8 @@ class HSAPacketProcessor: public DmaDevice
         bool dispPending() { return qCntxt.aqlBuf->dispPending() > 0; }
         SignalState depSignalRdState;
         QueueProcessEvent aqlProcessEvent;
+        void setBarrierBit(bool set_val) { qCntxt.barrierBit = set_val; }
+        bool getBarrierBit() const { return qCntxt.barrierBit; }
     };
     // Keeps track of queueDescriptors of registered queues
     std::vector<class RQLEntry *> regdQList;