dev-hsa,gpu-compute: Agent Packet handler implemented.

author Daniel Gerzhoy <daniel.gerzhoy@gmail.com>

Wed, 4 Nov 2020 16:51:46 +0000 (11:51 -0500)

committer Daniel Gerzhoy <daniel.gerzhoy@gmail.com>

Mon, 16 Nov 2020 16:12:48 +0000 (16:12 +0000)
author Daniel Gerzhoy <daniel.gerzhoy@gmail.com>
Wed, 4 Nov 2020 16:51:46 +0000 (11:51 -0500)
committer Daniel Gerzhoy <daniel.gerzhoy@gmail.com>
Mon, 16 Nov 2020 16:12:48 +0000 (16:12 +0000)
diff --git a/src/dev/hsa/hsa_device.hh b/src/dev/hsa/hsa_device.hh

index 7e8f1b7bda03fd9f09a69f2e3ee2ab44bd71e773..68cbd8255a6b3785e8c144d63752e5ffbdcfd99c 100644 (file)
--- a/src/dev/hsa/hsa_device.hh
+++ b/src/dev/hsa/hsa_device.hh
@@ -56,6 +56,18 @@ class HSADevice : public DmaDevice
  
      HSAPacketProcessor& hsaPacketProc();
  
+    /**
+     * submitAgentDispatchPkt() accepts AQL agent dispatch packets from the
+     * HSA packet processor. Not all devices will accept agent dispatch
+     * packets, so the default implementation will fatal.
+     * Added so that a device can let the host steal kernel signals.
+     */
+    virtual void
+    submitAgentDispatchPkt(void *raw_pkt, uint32_t qID, Addr host_pkt_addr)
+    {
+        fatal("%s does not accept dispatch packets\n", name());
+    }
+
      /**
       * submitDispatchPkt() accepts AQL dispatch packets from the HSA packet
       * processor. Not all devices will accept AQL dispatch packets, so the
diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc

index 91d24e5f1c5d6fdddc9b2285ebd258314de6fcbf..bba3c5940cb815c8369c1cfc3c18d48834374e7e 100644 (file)
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -432,6 +432,14 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
          fatal("Unsupported packet type HSA_PACKET_TYPE_BARRIER_OR");
      } else if (pkt_type == HSA_PACKET_TYPE_INVALID) {
          fatal("Unsupported packet type HSA_PACKET_TYPE_INVALID");
+    } else if (pkt_type == HSA_PACKET_TYPE_AGENT_DISPATCH) {
+        DPRINTF(HSAPacketProcessor, "%s: submitting agent dispatch pkt" \
+                " active list ID = %d\n", __FUNCTION__, rl_idx);
+        // Submit packet to HSA device (dispatcher)
+        hsa_device->submitAgentDispatchPkt(
+                (void *)disp_pkt, rl_idx, host_pkt_addr);
+        is_submitted = UNBLOCKED;
+        sendAgentDispatchCompletionSignal((void *)disp_pkt,0);
      } else {
          fatal("Unsupported packet type %d\n", pkt_type);
      }
@@ -700,3 +708,56 @@ HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
                                          // multi-process support
      }
  }
+
+void
+HSAPacketProcessor::sendAgentDispatchCompletionSignal(
+    void *pkt, hsa_signal_value_t signal)
+{
+    auto agent_pkt = (_hsa_agent_dispatch_packet_t *)pkt;
+    uint64_t signal_addr =
+            (uint64_t) (((uint64_t *)agent_pkt->completion_signal) + 1);
+    DPRINTF(HSAPacketProcessor, "Triggering Agent Dispatch packet" \
+            " completion signal: %x!\n", signal_addr);
+    /**
+     * HACK: The semantics of the HSA signal is to
+     * decrement the current signal value.
+     * I'm going to cheat here and read out
+     * the value from main memory using functional
+     * access, and then just DMA the decremented value.
+     * The reason for this is that the DMASequencer does
+     * not support atomic operations.
+     */
+    VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
+
+    DPRINTF(HSAPacketProcessor,"HSADriver: Sending signal to %lu\n",
+            (uint64_t)sys->threads[0]->cpuId());
+
+
+    hsa_signal_value_t *new_signal = new hsa_signal_value_t;
+    *new_signal = (hsa_signal_value_t) *prev_signal - 1;
+
+    dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
+}
+
+void
+HSAPacketProcessor::sendCompletionSignal(hsa_signal_value_t signal)
+{
+    uint64_t signal_addr = (uint64_t) (((uint64_t *)signal) + 1);
+    DPRINTF(HSAPacketProcessor, "Triggering completion signal: %x!\n",
+            signal_addr);
+    /**
+     * HACK: The semantics of the HSA signal is to
+     * decrement the current signal value.
+     * I'm going to cheat here and read out
+     * the value from main memory using functional
+     * access, and then just DMA the decremented value.
+     * The reason for this is that the DMASequencer does
+     * not support atomic operations.
+     */
+    VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
+
+    hsa_signal_value_t *new_signal = new hsa_signal_value_t;
+    *new_signal = (hsa_signal_value_t) *prev_signal - 1;
+
+    dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
+}
diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh

index 27df90a85124b5eac89042f271c51ab66f21512f..0f82e925320709a6b4c101dbd03119f4646886bc 100644 (file)
--- a/src/dev/hsa/hsa_packet_processor.hh
+++ b/src/dev/hsa/hsa_packet_processor.hh
@@ -329,6 +329,10 @@ class HSAPacketProcessor: public DmaDevice
      void schedAQLProcessing(uint32_t rl_idx);
      void schedAQLProcessing(uint32_t rl_idx, Tick delay);
  
+    void sendAgentDispatchCompletionSignal(void *pkt,
+                                           hsa_signal_value_t signal);
+    void sendCompletionSignal(hsa_signal_value_t signal);
+
      class DepSignalsReadDmaEvent : public Event
      {
        protected:
diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript

index 0f1afbcca35aa8f042996642c4d9bbb03207abde..416b9e9242fb38476cec9f36673f00ff01a888ec 100644 (file)
--- a/src/gpu-compute/SConscript
+++ b/src/gpu-compute/SConscript
@@ -71,6 +71,7 @@ Source('tlb_coalescer.cc')
  Source('vector_register_file.cc')
  Source('wavefront.cc')
  
+DebugFlag('GPUAgentDisp')
  DebugFlag('GPUCoalescer')
  DebugFlag('GPUCommandProc')
  DebugFlag('GPUDriver')
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc

index 17c74f0ecf2fcd32c4f438e1e3570fd9444d0230..a4fe92385c490cb7fc5fc8792a225a7a9cab0435 100644 (file)
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -34,6 +34,7 @@
  
  #include "gpu-compute/dispatcher.hh"
  
+#include "debug/GPUAgentDisp.hh"
  #include "debug/GPUDisp.hh"
  #include "debug/GPUKernelInfo.hh"
  #include "debug/GPUWgLatency.hh"
@@ -130,6 +131,8 @@ GPUDispatcher::dispatch(HSAQueueEntry *task)
  
      DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
              task->kernelName(), task->dispatchId());
+    DPRINTF(GPUAgentDisp, "launching kernel: %s, dispatch ID: %d\n",
+            task->kernelName(), task->dispatchId());
  
      execIds.push(task->dispatchId());
      dispatchActive = true;
@@ -144,6 +147,7 @@ void
  GPUDispatcher::exec()
  {
      int fail_count(0);
+    int disp_count(0);
  
      /**
       * There are potentially multiple outstanding kernel launches.
@@ -151,6 +155,7 @@ GPUDispatcher::exec()
       * can fit on the GPU even if another kernel's workgroups cannot
       */
      DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
+    DPRINTF(GPUAgentDisp, "Launching %d Kernels\n", execIds.size());
  
      if (execIds.size() > 0) {
          ++cyclesWaitingForDispatch;
@@ -204,7 +209,7 @@ GPUDispatcher::exec()
                  /**
                   * if we failed try the next kernel,
                   * it may have smaller workgroups.
-                 * put it on the queue to rety latter
+                 * put it on the queue to retry later
                   */
                  DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id);
                  execIds.push(exec_id);
@@ -212,6 +217,7 @@ GPUDispatcher::exec()
                  break;
              } else if (!launched) {
                  launched = true;
+                disp_count++;
                  DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id);
              }
          }
@@ -221,6 +227,8 @@ GPUDispatcher::exec()
      }
  
      DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
+    DPRINTF(GPUWgLatency, "Kernel Wgs dispatched: %d | %d failures\n",
+            disp_count, fail_count);
  
      while (doneIds.size()) {
          DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front());
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc

index 0b7f2fa66646e66ffdbadca201ec073104301f99..a8c790ab511b8e859e7665c60fada1bcba987d1b 100644 (file)
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -93,6 +93,10 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
      DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
          "kernel object\n", akc.kernel_code_entry_byte_offset);
  
+    DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n",
+        (uint64_t)tc->cpuId());
+
+
      Addr machine_code_addr = (Addr)disp_pkt->kernel_object
          + akc.kernel_code_entry_byte_offset;
  
@@ -166,6 +170,54 @@ GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
      hsaPP->finishPkt(raw_pkt, queue_id);
  }
  
+/**
+ * submitAgentDispatchPkt() is for accepting agent dispatch packets.
+ * These packets will control the dispatch of workgroups (Wg) on the device,
+ * and inform the host when a specified number of Wg have been executed.
+ *
+ * For now it only handles Nop and Steal commands, then finishes the pkt.
+ */
+void
+GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
+    Addr host_pkt_addr)
+{
+    //Parse the Packet, see what it wants us to do
+    _hsa_agent_dispatch_packet_t * agent_pkt =
+        (_hsa_agent_dispatch_packet_t *)raw_pkt;
+
+    if (agent_pkt->type == AgentCmd::Nop) {
+        DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
+    } else if (agent_pkt->type == AgentCmd::Steal) {
+        //This is where we steal the HSA Task's completion signal
+        int kid = agent_pkt->arg[0];
+        DPRINTF(GPUCommandProc,
+            "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
+            kid);
+
+        HSAQueueEntry *task = dispatcher.hsaTask(kid);
+        uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t);
+
+        uint64_t return_address = agent_pkt->return_address;
+        DPRINTF(GPUCommandProc, "Return Addr: %p\n",return_address);
+        //*return_address = signal_addr;
+        Addr *new_signal_addr = new Addr;
+        *new_signal_addr  = (Addr)signal_addr;
+        dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);
+
+        DPRINTF(GPUCommandProc,
+            "Agent Dispatch Packet Stealing signal handle from kid %d :" \
+            "(%x:%x) writing into %x\n",
+            kid,signal_addr,new_signal_addr,return_address);
+
+    } else
+    {
+        panic("The agent dispatch packet provided an unknown argument in" \
+        "arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");
+    }
+
+    hsaPP->finishPkt(raw_pkt, queue_id);
+}
+
  /**
   * Once the CP has finished extracting all relevant information about
   * a task and has initialized the ABI state, we send a description of
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh

index d38ee1f0b507b475bc3f07c9502c727b9e7d333e..071bd89c50d373f10027f9b7da1336023073b2fa 100644 (file)
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -65,6 +65,13 @@ class GPUCommandProcessor : public HSADevice
      void setShader(Shader *shader);
      Shader* shader();
  
+    enum AgentCmd {
+      Nop = 0,
+      Steal = 1
+    };
+
+    void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
+                           Addr host_pkt_addr) override;
      void submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                             Addr host_pkt_addr) override;
      void submitVendorPkt(void *raw_pkt, uint32_t queue_id,
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc

index 365edd44bf2a20609bd8e40f100702cfcece88fb..012b9870cd3dcc5b9dc34cabe5c50f383ef68552 100644 (file)
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -38,6 +38,7 @@
  #include "arch/x86/isa_traits.hh"
  #include "arch/x86/linux/linux.hh"
  #include "base/chunk_generator.hh"
+#include "debug/GPUAgentDisp.hh"
  #include "debug/GPUDisp.hh"
  #include "debug/GPUMem.hh"
  #include "debug/GPUShader.hh"
@@ -231,6 +232,7 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task)
      bool scheduledSomething = false;
      int cuCount = 0;
      int curCu = nextSchedCu;
+    int disp_count(0);
  
      while (cuCount < n_cu) {
          //Every time we try a CU, update nextSchedCu
@@ -245,6 +247,8 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task)
              scheduledSomething = true;
              DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                              curCu, task->globalWgId());
+            DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
+                            curCu, task->globalWgId());
              DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
                      curTick(), task->globalWgId(), curCu);
  
@@ -259,12 +263,15 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task)
              cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
  
              task->markWgDispatch();
+            ++disp_count;
          }
  
          ++cuCount;
          curCu = nextSchedCu;
      }
  
+     DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);
+
      return scheduledSomething;
  }
author	Daniel Gerzhoy <daniel.gerzhoy@gmail.com>
	Wed, 4 Nov 2020 16:51:46 +0000 (11:51 -0500)
committer	Daniel Gerzhoy <daniel.gerzhoy@gmail.com>
	Mon, 16 Nov 2020 16:12:48 +0000 (16:12 +0000)
src/dev/hsa/hsa_device.hh		patch \| blob \| history
src/dev/hsa/hsa_packet_processor.cc		patch \| blob \| history
src/dev/hsa/hsa_packet_processor.hh		patch \| blob \| history
src/gpu-compute/SConscript		patch \| blob \| history
src/gpu-compute/dispatcher.cc		patch \| blob \| history
src/gpu-compute/gpu_command_processor.cc		patch \| blob \| history
src/gpu-compute/gpu_command_processor.hh		patch \| blob \| history
src/gpu-compute/shader.cc		patch \| blob \| history