From 9a01d3e92765f14055756457403bbe4ecfce2d5b Mon Sep 17 00:00:00 2001 From: Daniel Gerzhoy Date: Wed, 4 Nov 2020 11:51:46 -0500 Subject: [PATCH] dev-hsa,gpu-compute: Agent Packet handler implemented. HSA packet processor will now accept and process agent packets. Type field in packet is command type. For now: AgentCmd::Nop = 0 AgentCmd::Steal = 1 Steal command steals the completion signal for a running kernel. This enables a benchmark to use hsa primitives to send an agent packet to steal the signal, then wait on that signal. Minimal working example to be added in gem5-resources. Change-Id: I37f8a4b7ea1780b471559aecbf4af1050353b0b1 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/37015 Reviewed-by: Matt Sinclair Reviewed-by: Matthew Poremba Maintainer: Matt Sinclair Tested-by: kokoro --- src/dev/hsa/hsa_device.hh | 12 +++++ src/dev/hsa/hsa_packet_processor.cc | 61 ++++++++++++++++++++++++ src/dev/hsa/hsa_packet_processor.hh | 4 ++ src/gpu-compute/SConscript | 1 + src/gpu-compute/dispatcher.cc | 10 +++- src/gpu-compute/gpu_command_processor.cc | 52 ++++++++++++++++++++ src/gpu-compute/gpu_command_processor.hh | 7 +++ src/gpu-compute/shader.cc | 7 +++ 8 files changed, 153 insertions(+), 1 deletion(-) diff --git a/src/dev/hsa/hsa_device.hh b/src/dev/hsa/hsa_device.hh index 7e8f1b7bd..68cbd8255 100644 --- a/src/dev/hsa/hsa_device.hh +++ b/src/dev/hsa/hsa_device.hh @@ -56,6 +56,18 @@ class HSADevice : public DmaDevice HSAPacketProcessor& hsaPacketProc(); + /** + * submitAgentDispatchPkt() accepts AQL agent dispatch packets from the + * HSA packet processor. Not all devices accept agent dispatch packets, + * so the default implementation will fatal. + * Added so a device can override it to steal kernel signals. + */ + virtual void + submitAgentDispatchPkt(void *raw_pkt, uint32_t qID, Addr host_pkt_addr) + { + fatal("%s does not accept agent dispatch packets\n", name()); + } + /** * submitDispatchPkt() accepts AQL dispatch packets from the HSA packet * processor.
Not all devices will accept AQL dispatch packets, so the diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc index 91d24e5f1..bba3c5940 100644 --- a/src/dev/hsa/hsa_packet_processor.cc +++ b/src/dev/hsa/hsa_packet_processor.cc @@ -432,6 +432,14 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr) fatal("Unsupported packet type HSA_PACKET_TYPE_BARRIER_OR"); } else if (pkt_type == HSA_PACKET_TYPE_INVALID) { fatal("Unsupported packet type HSA_PACKET_TYPE_INVALID"); + } else if (pkt_type == HSA_PACKET_TYPE_AGENT_DISPATCH) { + DPRINTF(HSAPacketProcessor, "%s: submitting agent dispatch pkt" \ + " active list ID = %d\n", __FUNCTION__, rl_idx); + // Submit packet to HSA device (dispatcher) + hsa_device->submitAgentDispatchPkt( + (void *)disp_pkt, rl_idx, host_pkt_addr); + is_submitted = UNBLOCKED; + sendAgentDispatchCompletionSignal((void *)disp_pkt,0); } else { fatal("Unsupported packet type %d\n", pkt_type); } @@ -700,3 +708,56 @@ HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx) // multi-process support } } + +void +HSAPacketProcessor::sendAgentDispatchCompletionSignal( + void *pkt, hsa_signal_value_t signal) +{ + auto agent_pkt = (_hsa_agent_dispatch_packet_t *)pkt; + uint64_t signal_addr = + (uint64_t) (((uint64_t *)agent_pkt->completion_signal) + 1); + DPRINTF(HSAPacketProcessor, "Triggering Agent Dispatch packet" \ + " completion signal: %x!\n", signal_addr); + /** + * HACK: HSA signal semantics are to decrement the current value. + * The DMASequencer does not support atomic operations, so the + * value is read from main memory with a functional access and + * the decremented value is then written back with a DMA write. + * NOTE(review): the `signal` argument is not used here, and + * new_signal is allocated with new but not freed in this + * function; confirm the DMA path releases the buffer.
+ */ + VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]); + + DPRINTF(HSAPacketProcessor,"HSADriver: Sending signal to %lu\n", + (uint64_t)sys->threads[0]->cpuId()); + + + hsa_signal_value_t *new_signal = new hsa_signal_value_t; + *new_signal = (hsa_signal_value_t) *prev_signal - 1; + + dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0); +} + +void +HSAPacketProcessor::sendCompletionSignal(hsa_signal_value_t signal) +{ + uint64_t signal_addr = (uint64_t) (((uint64_t *)signal) + 1); + DPRINTF(HSAPacketProcessor, "Triggering completion signal: %x!\n", + signal_addr); + /** + * HACK: HSA signal semantics are to decrement the current value. + * The DMASequencer does not support atomic operations, so the + * value is read from main memory with a functional access and + * the decremented value is then written back with a DMA write. + * The target is the 64-bit word after the address in `signal`. + * NOTE(review): new_signal is allocated with new but not freed + * in this function; confirm the DMA path releases the buffer. + */ + VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]); + + hsa_signal_value_t *new_signal = new hsa_signal_value_t; + *new_signal = (hsa_signal_value_t) *prev_signal - 1; + + dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0); +} diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh index 27df90a85..0f82e9253 100644 --- a/src/dev/hsa/hsa_packet_processor.hh +++ b/src/dev/hsa/hsa_packet_processor.hh @@ -329,6 +329,10 @@ class HSAPacketProcessor: public DmaDevice void schedAQLProcessing(uint32_t rl_idx); void schedAQLProcessing(uint32_t rl_idx, Tick delay); + void sendAgentDispatchCompletionSignal(void *pkt, + hsa_signal_value_t signal); + void sendCompletionSignal(hsa_signal_value_t signal); + class DepSignalsReadDmaEvent : public Event { protected: diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index 0f1afbcca..416b9e924 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -71,6 +71,7 @@ Source('tlb_coalescer.cc') Source('vector_register_file.cc')
Source('wavefront.cc') +DebugFlag('GPUAgentDisp') DebugFlag('GPUCoalescer') DebugFlag('GPUCommandProc') DebugFlag('GPUDriver') diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc index 17c74f0ec..a4fe92385 100644 --- a/src/gpu-compute/dispatcher.cc +++ b/src/gpu-compute/dispatcher.cc @@ -34,6 +34,7 @@ #include "gpu-compute/dispatcher.hh" +#include "debug/GPUAgentDisp.hh" #include "debug/GPUDisp.hh" #include "debug/GPUKernelInfo.hh" #include "debug/GPUWgLatency.hh" @@ -130,6 +131,8 @@ GPUDispatcher::dispatch(HSAQueueEntry *task) DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n", task->kernelName(), task->dispatchId()); + DPRINTF(GPUAgentDisp, "launching kernel: %s, dispatch ID: %d\n", + task->kernelName(), task->dispatchId()); execIds.push(task->dispatchId()); dispatchActive = true; @@ -144,6 +147,7 @@ void GPUDispatcher::exec() { int fail_count(0); + int disp_count(0); /** * There are potentially multiple outstanding kernel launches. @@ -151,6 +155,7 @@ GPUDispatcher::exec() * can fit on the GPU even if another kernel's workgroups cannot */ DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); + DPRINTF(GPUAgentDisp, "Launching %d Kernels\n", execIds.size()); if (execIds.size() > 0) { ++cyclesWaitingForDispatch; @@ -204,7 +209,7 @@ GPUDispatcher::exec() /** * if we failed try the next kernel, * it may have smaller workgroups. 
- * put it on the queue to rety latter + * put it on the queue to retry later */ DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id); execIds.push(exec_id); @@ -212,6 +217,7 @@ GPUDispatcher::exec() break; } else if (!launched) { launched = true; + disp_count++; DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id); } } @@ -221,6 +227,8 @@ GPUDispatcher::exec() } DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size()); + DPRINTF(GPUWgLatency, "Kernel Wgs dispatched: %d | %d failures\n", + disp_count, fail_count); while (doneIds.size()) { DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front()); diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index 0b7f2fa66..a8c790ab5 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -93,6 +93,10 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the " "kernel object\n", akc.kernel_code_entry_byte_offset); + DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n", + (uint64_t)tc->cpuId()); + + Addr machine_code_addr = (Addr)disp_pkt->kernel_object + akc.kernel_code_entry_byte_offset; @@ -166,6 +170,54 @@ GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id, hsaPP->finishPkt(raw_pkt, queue_id); } +/** + * submitAgentDispatchPkt() accepts agent dispatch packets. The packet's + * type field selects the command: Nop does nothing, while Steal DMA-writes + * the completion signal handle of the kernel given in arg[0] to return_address. + * + * Any other type panics. Handled packets are then finished via finishPkt().
+ */ +void +GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) +{ + //Parse the Packet, see what it wants us to do + _hsa_agent_dispatch_packet_t * agent_pkt = + (_hsa_agent_dispatch_packet_t *)raw_pkt; + + if (agent_pkt->type == AgentCmd::Nop) { + DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n"); + } else if (agent_pkt->type == AgentCmd::Steal) { + //This is where we steal the HSA Task's completion signal + int kid = agent_pkt->arg[0]; + DPRINTF(GPUCommandProc, + "Agent Dispatch Packet Stealing signal handle for kernel %d\n", + kid); + + HSAQueueEntry *task = dispatcher.hsaTask(kid); + uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t); + + uint64_t return_address = agent_pkt->return_address; + DPRINTF(GPUCommandProc, "Return Addr: %p\n",return_address); + //*return_address = signal_addr; + Addr *new_signal_addr = new Addr; + *new_signal_addr = (Addr)signal_addr; + dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0); + + DPRINTF(GPUCommandProc, + "Agent Dispatch Packet Stealing signal handle from kid %d :" \ + "(%x:%x) writing into %x\n", + kid,signal_addr,new_signal_addr,return_address); + + } else + { + panic("The agent dispatch packet provided an unknown command in the " \ + "type field; only 0 (nop) and 1 (return kernel signal) are accepted"); + } + + hsaPP->finishPkt(raw_pkt, queue_id); +} + /** * Once the CP has finished extracting all relevant information about * a task and has initialized the ABI state, we send a description of diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh index d38ee1f0b..071bd89c5 100644 --- a/src/gpu-compute/gpu_command_processor.hh +++ b/src/gpu-compute/gpu_command_processor.hh @@ -65,6 +65,13 @@ class GPUCommandProcessor : public HSADevice void setShader(Shader *shader); Shader* shader(); + enum AgentCmd { + Nop = 0, + Steal = 1 + }; + + void submitAgentDispatchPkt(void *raw_pkt, uint32_t
queue_id, + Addr host_pkt_addr) override; void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr) override; void submitVendorPkt(void *raw_pkt, uint32_t queue_id, diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 365edd44b..012b9870c 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -38,6 +38,7 @@ #include "arch/x86/isa_traits.hh" #include "arch/x86/linux/linux.hh" #include "base/chunk_generator.hh" +#include "debug/GPUAgentDisp.hh" #include "debug/GPUDisp.hh" #include "debug/GPUMem.hh" #include "debug/GPUShader.hh" @@ -231,6 +232,7 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task) bool scheduledSomething = false; int cuCount = 0; int curCu = nextSchedCu; + int disp_count(0); while (cuCount < n_cu) { //Every time we try a CU, update nextSchedCu @@ -245,6 +247,8 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task) scheduledSomething = true; DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n", curCu, task->globalWgId()); + DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n", + curCu, task->globalWgId()); DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n", curTick(), task->globalWgId(), curCu); @@ -259,12 +263,15 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task) cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg); task->markWgDispatch(); + ++disp_count; } ++cuCount; curCu = nextSchedCu; } + DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count); + return scheduledSomething; } -- 2.30.2