From 965ad12b9a4ae4035b0f63e7ab083ac87258a071 Mon Sep 17 00:00:00 2001
From: Sooraj Puthoor
Date: Sun, 11 Feb 2018 03:05:00 -0500
Subject: [PATCH] dev-hsa: enable interruptible hsa signal support

Interruptible HSA signals require event creation and management support from the emulated driver, and that support was not previously available. This changeset adds event creation and management support to the emulated driver. With this patch, each interruptible signal created by the HSA runtime is associated with a signal event. The HSA runtime can then put a thread waiting on a signal condition to sleep, asking the driver to monitor the event associated with that signal. If the signal is modified by the GPU, the dispatcher notifies the driver of the signal value change. If the modifier is a CPU thread, the thread modifies the signal through HSA API calls, and those calls notify the driver of the signal value change. Once the driver is notified of a change in the signal value, it checks whether any thread is sleeping on that signal and wakes up the sleeping thread associated with that event. The driver also implements a timeout wakeup that can wake the thread after a specified time period has expired. The same mechanism applies to barrier packets.

Each signal has an event address in a kernel-allocated and kernel-managed event page that can be used as a mailbox pointer to notify an event. However, this feature, which non-CPU agents use to communicate with the driver, is not implemented by this changeset because the non-CPU HSA agents in our model can communicate with the driver directly. That said, adding the feature should be trivial: the event addresses and the event page are already set up correctly by this changeset, so mapping the event page's virtual address to our PIO doorbell interface in the page tables and registering that PIO address with the driver should be sufficient. The mailbox pointer for an event is derived from its event ID; by using the event ID as an index into the event page, this changeset already provides a unique mailbox pointer for each event.
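For orientation, the runtime-side flow these events enable looks roughly like the sketch below. This is a hedged illustration only, not code from this changeset: it assumes the KFD-style ioctl interface and structs declared in dev/hsa/kfd_ioctl.h (the argument-struct names kfd_ioctl_create_event_args and kfd_ioctl_wait_events_args come from the upstream KFD ABI rather than from this diff), and the "/dev/kfd" device path is illustrative.

    // Hedged sketch (not part of the patch): how an HSA runtime would use the
    // event ioctls this changeset implements. Assumes dev/hsa/kfd_ioctl.h.
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <cstdint>
    #include <cstdio>

    #include "dev/hsa/kfd_ioctl.h"

    int main()
    {
        // Device node name is illustrative; error checking is omitted.
        int kfd = open("/dev/kfd", O_RDWR);

        // 1. Create a signal event. The driver returns an event ID/slot and an
        //    offset that identifies the event page for a later mmap().
        kfd_ioctl_create_event_args create = {};
        create.event_type = KFD_IOC_EVENT_SIGNAL;
        ioctl(kfd, AMDKFD_IOC_CREATE_EVENT, &create);

        // 2. Map the event page; each event owns one 64-bit mailbox slot at
        //    event_page + 8 * event_slot_index.
        uint64_t *event_page = static_cast<uint64_t *>(
            mmap(nullptr, 8 * KFD_SIGNAL_EVENT_LIMIT, PROT_READ | PROT_WRITE,
                 MAP_SHARED, kfd, create.event_page_offset));

        // 3. Wait on the event. The driver suspends this thread until the
        //    event is signalled (by the GPU via the dispatcher, or by a CPU
        //    thread via AMDKFD_IOC_SET_EVENT) or the millisecond timeout fires.
        kfd_event_data event_data = {};
        event_data.event_id = create.event_id;
        kfd_ioctl_wait_events_args wait = {};
        wait.events_ptr = reinterpret_cast<uint64_t>(&event_data);
        wait.num_events = 1;
        wait.wait_for_all = 0;
        wait.timeout = 1000;  // ms
        ioctl(kfd, AMDKFD_IOC_WAIT_EVENTS, &wait);

        printf("woke up on event %u\n", create.event_id);
        munmap(event_page, 8 * KFD_SIGNAL_EVENT_LIMIT);
        close(kfd);
        return 0;
    }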
Change-Id: Ic62794076ddd47526b1f952fdb4c1bad632bdd2e Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/38335 Reviewed-by: Jason Lowe-Power Reviewed-by: Matt Sinclair Maintainer: Matt Sinclair Tested-by: kokoro --- configs/example/apu_se.py | 2 +- src/dev/hsa/hsa_device.hh | 19 +++- src/dev/hsa/hsa_driver.cc | 109 ++++++++++++++++--- src/dev/hsa/hsa_driver.hh | 83 ++++++++++++++- src/dev/hsa/hsa_packet_processor.cc | 35 +++--- src/dev/hsa/hsa_packet_processor.hh | 7 ++ src/dev/hsa/hsa_signal.hh | 65 ++++++++++++ src/dev/hsa/hw_scheduler.cc | 3 +- src/dev/hsa/kfd_event_defines.h | 45 ++++++++ src/gpu-compute/dispatcher.cc | 36 +++---- src/gpu-compute/gpu_command_processor.cc | 60 +++++++++++ src/gpu-compute/gpu_command_processor.hh | 24 +++++ src/gpu-compute/gpu_compute_driver.cc | 130 +++++++++++++++++++++-- src/gpu-compute/gpu_compute_driver.hh | 1 + src/sim/emul_driver.hh | 2 +- 15 files changed, 548 insertions(+), 73 deletions(-) create mode 100644 src/dev/hsa/hsa_signal.hh create mode 100644 src/dev/hsa/kfd_event_defines.h diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 7edc733fb..feed8a724 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -470,7 +470,7 @@ else: "/usr/lib/x86_64-linux-gnu" ]), 'HOME=%s' % os.getenv('HOME','/'), - "HSA_ENABLE_INTERRUPT=0"] + "HSA_ENABLE_INTERRUPT=1"] process = Process(executable = executable, cmd = [options.cmd] + options.options.split(), drivers = [gpu_driver], env = env) diff --git a/src/dev/hsa/hsa_device.hh b/src/dev/hsa/hsa_device.hh index 68cbd8255..6f981d669 100644 --- a/src/dev/hsa/hsa_device.hh +++ b/src/dev/hsa/hsa_device.hh @@ -43,10 +43,13 @@ #include "dev/hsa/hsa_packet_processor.hh" #include "params/HSADevice.hh" +class HSADriver; + class HSADevice : public DmaDevice { public: typedef HSADeviceParams Params; + typedef std::function HsaSignalCallbackFunction; HSADevice(const Params &p) : DmaDevice(p), hsaPP(p.hsapp) { @@ -92,7 +95,21 @@ class HSADevice : public DmaDevice { fatal("%s does not accept vendor specific packets\n", name()); } - + virtual void + attachDriver(HSADriver *driver) + { + fatal("%s does not need HSA driver\n", name()); + } + virtual void + updateHsaSignal(Addr signal_handle, uint64_t signal_value) + { + fatal("%s does not have HSA signal update functionality.\n", name()); + } + virtual uint64_t + functionalReadHsaSignal(Addr signal_handle) + { + fatal("%s does not have HSA signal read functionality.\n", name()); + } void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay = 0); void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *cb, diff --git a/src/dev/hsa/hsa_driver.cc b/src/dev/hsa/hsa_driver.cc index 190213a8b..4cd5d1144 100644 --- a/src/dev/hsa/hsa_driver.cc +++ b/src/dev/hsa/hsa_driver.cc @@ -39,6 +39,8 @@ #include "cpu/thread_context.hh" #include "debug/HSADriver.hh" #include "dev/hsa/hsa_device.hh" +#include "dev/hsa/hsa_packet_processor.hh" +#include "dev/hsa/kfd_event_defines.h" #include "dev/hsa/kfd_ioctl.h" #include "params/HSADriver.hh" #include "sim/process.hh" @@ -65,32 +67,56 @@ HSADriver::open(ThreadContext *tc, int mode, int flags) /** * Currently, mmap() will simply setup a mapping for the associated - * device's packet processor's doorbells. + * device's packet processor's doorbells and creates the event page. 
*/ Addr HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot, - int tgt_flags, int tgt_fd, int offset) + int tgt_flags, int tgt_fd, off_t offset) { - DPRINTF(HSADriver, "amdkfd doorbell mmap (start: %p, length: 0x%x," - "offset: 0x%x)\n", start, length, offset); - - auto process = tc->getProcessPtr(); - auto mem_state = process->memState; + // Is this a signal event mmap? + bool is_event_mmap = false; + // If addr == 0, then we may need to do mmap. + bool should_mmap = (start == 0); + auto process = tc->getProcessPtr(); + auto mem_state = process->memState; + // Check if mmap is for signal events first + if (((offset >> PAGE_SHIFT) & KFD_MMAP_TYPE_MASK) == + KFD_MMAP_TYPE_EVENTS) { + is_event_mmap = true; + DPRINTF(HSADriver, "amdkfd mmap for events (start: %p, length: 0x%x," + "offset: 0x%x)\n", start, length, offset); + panic_if(start != 0, + "Start address should be provided by KFD\n"); + panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT, + "Requested length %d, expected length %d; length mismatch\n", + length, 8 * KFD_SIGNAL_EVENT_LIMIT); + // For signal events, do the mmap only if eventPage is uninitialized + should_mmap = (!eventPage); + } else { + DPRINTF(HSADriver, "amdkfd doorbell mmap (start: %p, length: 0x%x," + "offset: 0x%x)\n", start, length, offset); + } // Extend global mmap region if necessary. - if (start == 0) { - // Assume mmap grows down, as in x86 Linux. + if (should_mmap) { + // Assume mmap grows down, as in x86 Linux start = mem_state->getMmapEnd() - length; mem_state->setMmapEnd(start); } - /** - * Now map this virtual address to our PIO doorbell interface - * in the page tables (non-cacheable). - */ - process->pTable->map(start, device->hsaPacketProc().pioAddr, - length, false); - DPRINTF(HSADriver, "amdkfd doorbell mapped to %xp\n", start); + if (is_event_mmap) { + if (should_mmap) { + eventPage = start; + } + } else { + // Now map this virtual address to our PIO doorbell interface + // in the page tables (non-cacheable) + process->pTable->map(start, device->hsaPacketProc().pioAddr, + length, false); + + DPRINTF(HSADriver, "amdkfd doorbell mapped to %xp\n", start); + } + return start; } @@ -116,3 +142,54 @@ HSADriver::allocateQueue(ThreadContext *tc, Addr ioc_buf) args->ring_base_address, args->queue_id, args->ring_size); } + +const char* +HSADriver::DriverWakeupEvent::description() const +{ + return "DriverWakeupEvent"; +} + +void +HSADriver::DriverWakeupEvent::scheduleWakeup(Tick wakeup_delay) +{ + assert(driver); + driver->schedule(this, curTick() + wakeup_delay); +} + +void +HSADriver::signalWakeupEvent(uint32_t event_id) +{ + panic_if(event_id >= eventSlotIndex, + "Trying to wake up on an event that has not been created yet\n"); + if (ETable[event_id].threadWaiting) { + panic_if(!ETable[event_id].tc, + "No thread context to wake up\n"); + ThreadContext *tc = ETable[event_id].tc; + DPRINTF(HSADriver, + "Signal event: Waking up CPU %d\n", tc->cpuId()); + // Wake up this thread + tc->activate(); + // Remove events that can wake up this thread + TCEvents[tc].clearEvents(); + } else { + // This may be a race between an ioctl call asking to wait on + // this event and this signalWakeupEvent. We take care of that race + // here by setting the event. The ioctl call should take + // the necessary action when waiting on an already set event. However, + // this may be a genuine instance in which the runtime has decided not + // to wait on this event.
But since we cannot + // distinguish this case from the race condition, we set the event anyway. + ETable[event_id].setEvent = true; + } +} + +void +HSADriver::DriverWakeupEvent::process() +{ + DPRINTF(HSADriver, + "Timer event: Waking up CPU %d\n", tc->cpuId()); + // Wake up this thread + tc->activate(); + // Remove events that can wake up this thread + driver->TCEvents[tc].clearEvents(); +} diff --git a/src/dev/hsa/hsa_driver.hh b/src/dev/hsa/hsa_driver.hh index 429deddca..fc8131e0b 100644 --- a/src/dev/hsa/hsa_driver.hh +++ b/src/dev/hsa/hsa_driver.hh @@ -51,6 +51,8 @@ #ifndef __DEV_HSA_HSA_DRIVER_HH__ #define __DEV_HSA_HSA_DRIVER_HH__ +#include + #include "base/types.hh" #include "sim/emul_driver.hh" @@ -66,8 +68,87 @@ class HSADriver : public EmulatedDriver int open(ThreadContext *tc, int mode, int flags); Addr mmap(ThreadContext *tc, Addr start, uint64_t length, - int prot, int tgtFlags, int tgtFd, int offset); + int prot, int tgt_flags, int tgt_fd, off_t offset); + virtual void signalWakeupEvent(uint32_t event_id); + class DriverWakeupEvent : public Event + { + public: + DriverWakeupEvent(HSADriver *hsa_driver, ThreadContext *thrd_cntxt) + : driver(hsa_driver), tc(thrd_cntxt) {} + void process() override; + const char *description() const override; + void scheduleWakeup(Tick wakeup_delay); + private: + HSADriver *driver; + ThreadContext *tc; + }; + class EventTableEntry { + public: + EventTableEntry() : + mailBoxPtr(0), tc(nullptr), threadWaiting(false), setEvent(false) + {} + // Mailbox pointer for this event. The current implementation does not + // use this mailBoxPtr to notify events but directly calls + // signalWakeupEvent from the dispatcher (GPU) to notify the event. So, + // currently this mailBoxPtr is not used. A future implementation + // may communicate with the driver through mailBoxPtr. + Addr mailBoxPtr; + // Thread context waiting on this event. We do not currently support + // multiple threads waiting on an event. + ThreadContext *tc; + // threadWaiting is true if some thread context is waiting on this + // event. A thread context waiting on this event is put to sleep. + bool threadWaiting; + // setEvent is true if this event was triggered while no thread context + // was waiting on it. If some thread context later tries to wait on + // this event, we will not allow that thread context to go to sleep + // because the event has already happened. This scenario can arise when + // the waiting thread and the wakeup thread race on this event and the + // wakeup thread beats the waiting thread to the driver. + bool setEvent; + }; + typedef class EventTableEntry ETEntry; + protected: + Addr eventPage; + uint32_t eventSlotIndex; + // Event table that keeps track of events. It is indexed with the event ID. + std::unordered_map ETable; + + // The TCEvents map keeps track of the events that can wake up a thread. + // When multiple events can wake up the same thread, this data structure + // helps to reset all of them once one of those events wakes up the thread. + // The signal events that can wake up the thread are stored in signalEvents, + // whereas the timer wakeup event is stored in timerEvent.
+ class EventList { + public: + EventList() : driver(nullptr), timerEvent(nullptr, nullptr) {} + EventList(HSADriver *hsa_driver, ThreadContext *thrd_cntxt) + : driver(hsa_driver), timerEvent(hsa_driver, thrd_cntxt) + { } + void clearEvents() { + assert(driver); + for (auto event : signalEvents) { + assert(event < driver->eventSlotIndex); + panic_if(driver->ETable[event].tc->status() == \ + ThreadContext::Suspended, + "Thread should not be suspended\n"); + driver->ETable[event].tc = nullptr; + driver->ETable[event].threadWaiting = false; + } + signalEvents.clear(); + if (timerEvent.scheduled()) { + driver->deschedule(timerEvent); + } + } + HSADriver *driver; + DriverWakeupEvent timerEvent; + // The set of events that can wake up the same thread. + std::set signalEvents; + }; + std::unordered_map TCEvents; + /** * HSA agent (device) that is controled by this driver. */ diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc index 756757a8f..4110f9616 100644 --- a/src/dev/hsa/hsa_packet_processor.cc +++ b/src/dev/hsa/hsa_packet_processor.cc @@ -401,29 +401,20 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr) dep_sgnl_rd_st->resetSigVals(); // The completion signal is connected if (bar_and_pkt->completion_signal != 0) { - // The signal value is aligned 8 bytes - // from the actual handle in the runtime - uint64_t signal_addr = - (uint64_t) (((uint64_t *) - bar_and_pkt->completion_signal) + 1); + // HACK: The semantics of the HSA signal is to + // decrement the current signal value + // I'm going to cheat here and read out + // the value from main memory using functional + // access, and then just DMA the decremented value. + uint64_t signal_value = hsa_device->functionalReadHsaSignal(\ + bar_and_pkt->completion_signal); + DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \ - " completion signal: %x!\n", signal_addr); - /** - * HACK: The semantics of the HSA signal is to - * decrement the current signal value. - * I'm going to cheat here and read out - * the value from main memory using functional - * access, and then just DMA the decremented value. - * The reason for this is that the DMASequencer does - * not support atomic operations. - */ - VPtr prev_signal(signal_addr, sys->threads[0]); - - hsa_signal_value_t *new_signal = new hsa_signal_value_t; - *new_signal = (hsa_signal_value_t)*prev_signal - 1; - - dmaWriteVirt(signal_addr, - sizeof(hsa_signal_value_t), NULL, new_signal, 0); + " completion signal! Addr: %x\n", + bar_and_pkt->completion_signal); + + hsa_device->updateHsaSignal(bar_and_pkt->completion_signal, + signal_value - 1); } } if (dep_sgnl_rd_st->pendingReads > 0) { diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh index ed0fdff08..43c1c0c0e 100644 --- a/src/dev/hsa/hsa_packet_processor.hh +++ b/src/dev/hsa/hsa_packet_processor.hh @@ -304,6 +304,13 @@ class HSAPacketProcessor: public DmaDevice return regdQList.at(queId); } + uint64_t + inFlightPkts(uint32_t queId) + { + auto aqlBuf = regdQList.at(queId)->qCntxt.aqlBuf; + return aqlBuf->dispIdx() - aqlBuf->rdIdx(); + } + int numHWQueues; Addr pioAddr; Addr pioSize; diff --git a/src/dev/hsa/hsa_signal.hh b/src/dev/hsa/hsa_signal.hh new file mode 100644 index 000000000..a1c5e8374 --- /dev/null +++ b/src/dev/hsa/hsa_signal.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016-2019 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef DEV_HSA_HSA_SIGNAL_H +#define DEV_HSA_HSA_SIGNAL_H + +// AMD Signal Kind Enumeration Values. +typedef int64_t amd_signal_kind64_t; +enum amd_signal_kind_t { + AMD_SIGNAL_KIND_INVALID = 0, + AMD_SIGNAL_KIND_USER = 1, + AMD_SIGNAL_KIND_DOORBELL = -1, + AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2 +}; + +// AMD Signal. +typedef struct amd_signal_s { + amd_signal_kind64_t kind; + union { + volatile int64_t value; + volatile uint32_t* legacy_hardware_doorbell_ptr; + volatile uint64_t* hardware_doorbell_ptr; + }; + uint64_t event_mailbox_ptr; + uint32_t event_id; + uint32_t reserved1; + uint64_t start_ts; + uint64_t end_ts; + union { + uint64_t queue_ptr; + uint64_t reserved2; + }; + uint32_t reserved3[2]; +} amd_signal_t; + +#endif // DEV_HSA_HSA_SIGNAL_H diff --git a/src/dev/hsa/hw_scheduler.cc b/src/dev/hsa/hw_scheduler.cc index 7d8fb9099..5e2de6957 100644 --- a/src/dev/hsa/hw_scheduler.cc +++ b/src/dev/hsa/hw_scheduler.cc @@ -300,7 +300,6 @@ HWScheduler::isRLQIdle(uint32_t rl_idx) DPRINTF(HSAPacketProcessor, "@ %s, analyzing hw queue %d\n", __FUNCTION__, rl_idx); HSAQueueDescriptor* qDesc = hsaPP->getRegdListEntry(rl_idx)->qCntxt.qDesc; - AQLRingBuffer* aql_buf = hsaPP->getRegdListEntry(rl_idx)->qCntxt.aqlBuf; // If there a pending DMA to this registered queue // then the queue is not idle @@ -311,7 +310,7 @@ HWScheduler::isRLQIdle(uint32_t rl_idx) // Since packet completion stage happens only after kernel completion // we need to keep the queue mapped till all the outstanding kernels // from that queue are finished - if (aql_buf->rdIdx() != aql_buf->dispIdx()) { + if (hsaPP->inFlightPkts(rl_idx)) { return false; } diff --git a/src/dev/hsa/kfd_event_defines.h b/src/dev/hsa/kfd_event_defines.h new file mode 100644 index 000000000..0202b3b2e --- /dev/null +++ b/src/dev/hsa/kfd_event_defines.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016-2019 Advanced Micro Devices, Inc. + * All rights reserved. 
* + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef KFD_EVENT_DEFINES_H_INCLUDED +#define KFD_EVENT_DEFINES_H_INCLUDED + +#include "dev/hsa/kfd_ioctl.h" + +#define PAGE_SHIFT 12 +#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) +#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) +#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT + +#endif diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc index dae7b8c12..26e903511 100644 --- a/src/gpu-compute/dispatcher.cc +++ b/src/gpu-compute/dispatcher.cc @@ -307,30 +307,20 @@ GPUDispatcher::notifyWgCompl(Wavefront *wf) gpuCmdProc->hsaPacketProc() .finishPkt(task->dispPktPtr(), task->queueId()); if (task->completionSignal()) { - // The signal value is aligned 8 bytes from - // the actual handle in the runtime - Addr signal_addr = task->completionSignal() + sizeof(Addr); - DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering " - "completion signal: %x!\n", signal_addr); - /** - * HACK: The semantics of the HSA signal is to decrement - * the current signal value. We cheat here and read out - * he value from main memory using functional access and - * then just DMA the decremented value. This is because - * the DMA controller does not currently support GPU - * atomics. - */ - auto *tc = gpuCmdProc->system()->threads[0]; - auto &virt_proxy = tc->getVirtProxy(); - TypedBufferArg prev_signal(signal_addr); - prev_signal.copyIn(virt_proxy); - - Addr *new_signal = new Addr; - *new_signal = (Addr)*prev_signal - 1; - - gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr, - new_signal, 0); + /** + * HACK: The semantics of the HSA signal is to decrement + * the current signal value. We cheat here and read out + * the value from main memory using functional access and + * then just DMA the decremented value.
+ */ + uint64_t signal_value = + gpuCmdProc->functionalReadHsaSignal(task->completionSignal()); + + DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion " + "signal! Addr: %d\n", task->completionSignal()); + + gpuCmdProc->updateHsaSignal(task->completionSignal(), + signal_value - 1); } else { DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion " "signal\n"); diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index a8c790ab5..245cf09f3 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -39,6 +39,9 @@ #include "debug/GPUKernelInfo.hh" #include "gpu-compute/dispatcher.hh" #include "params/GPUCommandProcessor.hh" +#include "sim/process.hh" +#include "sim/proxy_ptr.hh" +#include "sim/syscall_emul_buf.hh" GPUCommandProcessor::GPUCommandProcessor(const Params &p) : HSADevice(p), dispatcher(*p.dispatcher) @@ -146,6 +149,57 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, ++dynamic_task_id; } +uint64_t +GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle) +{ + Addr value_addr = getHsaSignalValueAddr(signal_handle); + auto tc = system()->threads[0]; + ConstVPtr prev_value(value_addr, tc); + return *prev_value; +} + +void +GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value) +{ + // The signal value is aligned 8 bytes from + // the actual handle in the runtime + Addr value_addr = getHsaSignalValueAddr(signal_handle); + Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle); + Addr event_addr = getHsaSignalEventAddr(signal_handle); + DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr); + + Addr *new_signal = new Addr; + *new_signal = signal_value; + + dmaWriteVirt(value_addr, sizeof(Addr), nullptr, new_signal, 0); + + auto tc = system()->threads[0]; + ConstVPtr mailbox_ptr(mailbox_addr, tc); + + // Notifying an event with its mailbox pointer is + // not supported in the current implementation. Just use + // mailbox pointer to distinguish between interruptible + // and default signal. Interruptible signal will have + // a valid mailbox pointer. + if (*mailbox_ptr != 0) { + // This is an interruptible signal. Now, read the + // event ID and directly communicate with the driver + // about that event notification. + ConstVPtr event_val(event_addr, tc); + + DPRINTF(GPUCommandProc, "Calling signal wakeup event on " + "signal event value %d\n", *event_val); + signalWakeupEvent(*event_val); + } +} + +void +GPUCommandProcessor::attachDriver(HSADriver *hsa_driver) +{ + fatal_if(driver, "Should not overwrite driver."); + driver = hsa_driver; +} + /** * submitVendorPkt() is for accepting vendor-specific packets from * the HSAPP. Vendor-specific packets may be used by the runtime to @@ -230,6 +284,12 @@ GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task) dispatcher.dispatch(task); } +void +GPUCommandProcessor::signalWakeupEvent(uint32_t event_id) +{ + driver->signalWakeupEvent(event_id); +} + /** * The CP is responsible for traversing all HSA-ABI-related data * structures from memory and initializing the ABI state. 
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh index 071bd89c5..f067999b1 100644 --- a/src/gpu-compute/gpu_command_processor.hh +++ b/src/gpu-compute/gpu_command_processor.hh @@ -48,6 +48,8 @@ #define __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__ #include "dev/hsa/hsa_device.hh" +#include "dev/hsa/hsa_signal.hh" +#include "gpu-compute/gpu_compute_driver.hh" #include "gpu-compute/hsa_queue_entry.hh" struct GPUCommandProcessorParams; @@ -76,16 +78,38 @@ class GPUCommandProcessor : public HSADevice Addr host_pkt_addr) override; void submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr) override; + void attachDriver(HSADriver *driver) override; void dispatchPkt(HSAQueueEntry *task); + void signalWakeupEvent(uint32_t event_id); Tick write(PacketPtr pkt) override { return 0; } Tick read(PacketPtr pkt) override { return 0; } AddrRangeList getAddrRanges() const override; System *system(); + void updateHsaSignal(Addr signal_handle, uint64_t signal_value) override; + + uint64_t functionalReadHsaSignal(Addr signal_handle); + + Addr getHsaSignalValueAddr(Addr signal_handle) + { + return signal_handle + offsetof(amd_signal_t, value); + } + + Addr getHsaSignalMailboxAddr(Addr signal_handle) + { + return signal_handle + offsetof(amd_signal_t, event_mailbox_ptr); + } + + Addr getHsaSignalEventAddr(Addr signal_handle) + { + return signal_handle + offsetof(amd_signal_t, event_id); + } + private: Shader *_shader; GPUDispatcher &dispatcher; + HSADriver *driver; void initABI(HSAQueueEntry *task); diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc index a840bb28a..fbc0ffa4f 100644 --- a/src/gpu-compute/gpu_compute_driver.cc +++ b/src/gpu-compute/gpu_compute_driver.cc @@ -40,6 +40,7 @@ #include "debug/GPUDriver.hh" #include "dev/hsa/hsa_device.hh" #include "dev/hsa/hsa_packet_processor.hh" +#include "dev/hsa/kfd_event_defines.h" #include "dev/hsa/kfd_ioctl.h" #include "params/GPUComputeDriver.hh" #include "sim/syscall_emul_buf.hh" @@ -47,6 +48,7 @@ GPUComputeDriver::GPUComputeDriver(const Params &p) : HSADriver(p) { + device->attachDriver(this); DPRINTF(GPUDriver, "Constructing KFD: device\n"); } @@ -61,8 +63,8 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n"); TypedBufferArg args(ioc_buf); - args->major_version = 1; - args->minor_version = 0; + args->major_version = KFD_IOCTL_MAJOR_VERSION; + args->minor_version = KFD_IOCTL_MINOR_VERSION; args.copyOut(virt_proxy); } @@ -205,17 +207,59 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) break; case AMDKFD_IOC_CREATE_EVENT: { - warn("unimplemented ioctl: AMDKFD_IOC_CREATE_EVENT\n"); + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_EVENT\n"); + + TypedBufferArg args(ioc_buf); + args.copyIn(virt_proxy); + if (args->event_type != KFD_IOC_EVENT_SIGNAL) { + fatal("Signal events are only supported currently\n"); + } else if (eventSlotIndex == SLOTS_PER_PAGE) { + fatal("Signal event wasn't created; signal limit reached\n"); + } + // Currently, we allocate only one signal_page for events. 
+ // Note that this signal page is of size 8 * KFD_SIGNAL_EVENT_LIMIT + uint64_t page_index = 0; + args->event_page_offset = (page_index | KFD_MMAP_TYPE_EVENTS); + args->event_page_offset <<= PAGE_SHIFT; + // TODO: Currently we support only signal events, hence using + // the same ID for both signal slot and event slot + args->event_slot_index = eventSlotIndex; + args->event_id = eventSlotIndex++; + args->event_trigger_data = args->event_id; + DPRINTF(GPUDriver, "amdkfd create event " + "(event_id: 0x%x, offset: 0x%x)\n", + args->event_id, args->event_page_offset); + // Since eventSlotIndex is increased every time a new event is + // created, the ETable entry at eventSlotIndex (event_id) is guaranteed + // to be empty. In a future implementation that reuses deleted event_ids, + // we should check whether the event table entry at this + // eventSlotIndex (event_id) is empty before inserting a new event + // table entry + ETable.emplace(std::pair(args->event_id, {})); + args.copyOut(virt_proxy); } break; case AMDKFD_IOC_DESTROY_EVENT: { - warn("unimplemented ioctl: AMDKFD_IOC_DESTROY_EVENT\n"); + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_EVENT\n"); + TypedBufferArg args(ioc_buf); + args.copyIn(virt_proxy); + DPRINTF(GPUDriver, "amdkfd destroying event %d\n", args->event_id); + fatal_if(ETable.count(args->event_id) == 0, + "Event ID invalid, cannot destroy this event\n"); + ETable.erase(args->event_id); } break; case AMDKFD_IOC_SET_EVENT: { - warn("unimplemented ioctl: AMDKFD_IOC_SET_EVENT\n"); + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_SET_EVENT\n"); + TypedBufferArg args(ioc_buf); + args.copyIn(virt_proxy); + DPRINTF(GPUDriver, "amdkfd set event %d\n", args->event_id); + fatal_if(ETable.count(args->event_id) == 0, + "Event ID invalid, cannot set this event\n"); + ETable[args->event_id].setEvent = true; + signalWakeupEvent(args->event_id); } break; case AMDKFD_IOC_RESET_EVENT: @@ -225,7 +269,69 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) break; case AMDKFD_IOC_WAIT_EVENTS: { - warn("unimplemented ioctl: AMDKFD_IOC_WAIT_EVENTS\n"); + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_WAIT_EVENTS\n"); + TypedBufferArg args(ioc_buf); + args.copyIn(virt_proxy); + kfd_event_data *events = + (kfd_event_data *)args->events_ptr; + DPRINTF(GPUDriver, "amdkfd wait for events " + "(wait on all: %d, timeout: %d, num_events: %d)\n", + args->wait_for_all, args->timeout, args->num_events); + panic_if(args->wait_for_all != 0 && args->num_events > 1, + "Wait for all events not supported\n"); + bool should_sleep = true; + if (TCEvents.count(tc) == 0) { + // This thread context is trying to wait on an event for the first + // time; initialize it.
+ TCEvents.emplace(std::piecewise_construct, std::make_tuple(tc), + std::make_tuple(this, tc)); + DPRINTF(GPUDriver, "\tamdkfd creating event list" + " for thread %d\n", tc->cpuId()); + } + panic_if(TCEvents[tc].signalEvents.size() != 0, + "There are %d events that put this thread to sleep;" + " this thread should not be running\n", + TCEvents[tc].signalEvents.size()); + for (int i = 0; i < args->num_events; i++) { + panic_if(!events, + "Event pointer invalid\n"); + Addr eventDataAddr = (Addr)(events + i); + TypedBufferArg EventData( + eventDataAddr, sizeof(kfd_event_data)); + EventData.copyIn(virt_proxy); + DPRINTF(GPUDriver, + "\tamdkfd wait for event %d\n", EventData->event_id); + panic_if(ETable.count(EventData->event_id) == 0, + "Event ID invalid, cannot wait on this event\n"); + panic_if(ETable[EventData->event_id].threadWaiting, + "Multiple threads waiting on the same event\n"); + if (ETable[EventData->event_id].setEvent) { + // If the event is already set, it has already happened. + // Just unset the event and don't put this thread to sleep. + ETable[EventData->event_id].setEvent = false; + should_sleep = false; + } + if (should_sleep) { + // Put this thread to sleep + ETable[EventData->event_id].threadWaiting = true; + ETable[EventData->event_id].tc = tc; + TCEvents[tc].signalEvents.insert(EventData->event_id); + } + } + + // TODO: Return the correct wait_result back. Currently, returning + // success for both KFD_WAIT_TIMEOUT and KFD_WAIT_COMPLETE. + // Ideally, this needs to be done after the event is triggered and + // after the thread is woken up. + args->wait_result = 0; + args.copyOut(virt_proxy); + if (should_sleep) { + // Put this thread to sleep + sleepCPU(tc, args->timeout); + } else { + // Remove events that tried to put this thread to sleep + TCEvents[tc].clearEvents(); + } } break; case AMDKFD_IOC_DBG_REGISTER: @@ -375,6 +481,18 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) return 0; } +void +GPUComputeDriver::sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout) +{ + // Convert milliseconds to ticks (assumes gem5's default 1 ps tick + // period, i.e., 1e9 ticks per millisecond) + Tick wakeup_delay((uint64_t)milliSecTimeout * 1000000000); + assert(TCEvents.count(tc) == 1); + TCEvents[tc].timerEvent.scheduleWakeup(wakeup_delay); + tc->suspend(); + DPRINTF(GPUDriver, + "CPU %d is put to sleep\n", tc->cpuId()); +} + Addr GPUComputeDriver::gpuVmApeBase(int gpuNum) const { diff --git a/src/gpu-compute/gpu_compute_driver.hh b/src/gpu-compute/gpu_compute_driver.hh index 53dfb748f..505391704 100644 --- a/src/gpu-compute/gpu_compute_driver.hh +++ b/src/gpu-compute/gpu_compute_driver.hh @@ -55,6 +55,7 @@ class GPUComputeDriver final : public HSADriver typedef GPUComputeDriverParams Params; GPUComputeDriver(const Params &p); int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override; + void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout); private: /** diff --git a/src/sim/emul_driver.hh b/src/sim/emul_driver.hh index 69d25ef61..1924b46aa 100644 --- a/src/sim/emul_driver.hh +++ b/src/sim/emul_driver.hh @@ -93,7 +93,7 @@ class EmulatedDriver : public SimObject * (see the SyscallReturn class). */ virtual Addr mmap(ThreadContext *tc, Addr start, uint64_t length, - int prot, int tgtFlags, int tgtFd, int offset) + int prot, int tgtFlags, int tgtFd, off_t offset) { return -EBADF; } }; -- 2.30.2
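As a further illustration of how the command processor navigates a signal handle (the getHsaSignalValueAddr, getHsaSignalMailboxAddr, and getHsaSignalEventAddr helpers added in gpu_command_processor.hh), the sketch below computes the same field addresses from the amd_signal_t layout in dev/hsa/hsa_signal.hh. It is a stand-alone host-side illustration, not code from this patch; the Addr alias and the example handle value are hypothetical.

    // Illustrative only: deriving field addresses from an HSA signal handle
    // using the amd_signal_t layout introduced in dev/hsa/hsa_signal.hh.
    #include <cinttypes>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    #include "dev/hsa/hsa_signal.hh"

    using Addr = uint64_t;  // stand-in for gem5's Addr type in this sketch

    int main()
    {
        Addr signal_handle = 0x1000;  // hypothetical runtime-allocated handle

        // Mirrors the getHsaSignal*Addr helpers in gpu_command_processor.hh.
        Addr value_addr   = signal_handle + offsetof(amd_signal_t, value);
        Addr mailbox_addr = signal_handle + offsetof(amd_signal_t, event_mailbox_ptr);
        Addr event_addr   = signal_handle + offsetof(amd_signal_t, event_id);

        // An interruptible signal carries a non-zero event_mailbox_ptr; that
        // is what updateHsaSignal() checks before reading event_id and calling
        // signalWakeupEvent() on the driver.
        printf("value @ 0x%" PRIx64 ", mailbox ptr @ 0x%" PRIx64
               ", event id @ 0x%" PRIx64 "\n",
               value_addr, mailbox_addr, event_addr);
        return 0;
    }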