2 * Copyright (c) 2018 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
33 * Authors: Matt Sinclair
36 #ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__
37 #define __ARCH_GCN3_GPU_MEM_HELPERS_HH__
39 #include "arch/gcn3/insts/gpu_static_inst.hh"
40 #include "arch/gcn3/insts/op_encodings.hh"
41 #include "debug/GPUMem.hh"
42 #include "gpu-compute/gpu_dyn_inst.hh"
45 * Helper function for instructions declared in op_encodings. This function
46 * takes in all of the arguments for a given memory request we are trying to
47 * initialize, then submits the request or requests depending on if the
48 * original request is aligned or unaligned.
50 template<typename T, int N>
52 initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,
56 int req_size = N * sizeof(T);
57 int block_size = gpuDynInst->computeUnit()->cacheLineSize();
58 Addr vaddr = 0, split_addr = 0;
59 bool misaligned_acc = false;
60 RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;
61 PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;
63 gpuDynInst->resetEntireStatusVector();
64 for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) {
65 if (gpuDynInst->exec_mask[lane]) {
66 vaddr = gpuDynInst->addr[lane];
69 * the base address of the cache line where the the last
70 * byte of the request will be stored.
72 split_addr = roundDown(vaddr + req_size - 1, block_size);
74 assert(split_addr <= vaddr || split_addr - vaddr < block_size);
76 * if the base cache line address of the last byte is
77 * greater than the address of the first byte then we have
78 * a misaligned access.
80 misaligned_acc = split_addr > vaddr;
83 // make sure request is word aligned
84 assert((vaddr & 0x3) == 0);
86 // a given lane's atomic can't cross cache lines
87 assert(!misaligned_acc);
89 req = std::make_shared<Request>(vaddr, sizeof(T), 0,
90 gpuDynInst->computeUnit()->masterId(), 0,
92 gpuDynInst->makeAtomicOpFunctor<T>(
93 &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
94 &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));
96 req = std::make_shared<Request>(vaddr, req_size, 0,
97 gpuDynInst->computeUnit()->masterId(), 0,
101 if (misaligned_acc) {
102 gpuDynInst->setStatusVector(lane, 2);
103 req->splitOnVaddr(split_addr, req1, req2);
104 gpuDynInst->setRequestFlags(req1);
105 gpuDynInst->setRequestFlags(req2);
106 pkt1 = new Packet(req1, mem_req_type);
107 pkt2 = new Packet(req2, mem_req_type);
108 pkt1->dataStatic(&(reinterpret_cast<T*>(
109 gpuDynInst->d_data))[lane * N]);
110 pkt2->dataStatic(&(reinterpret_cast<T*>(
111 gpuDynInst->d_data))[lane * N + req1->getSize()]);
112 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory "
113 "request for %#x\n", gpuDynInst->cu_id,
114 gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,
116 gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1);
117 gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2);
119 gpuDynInst->setStatusVector(lane, 1);
120 gpuDynInst->setRequestFlags(req);
121 pkt = new Packet(req, mem_req_type);
122 pkt->dataStatic(&(reinterpret_cast<T*>(
123 gpuDynInst->d_data))[lane * N]);
124 gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);
126 } else { // if lane is not active, then no pending requests
127 gpuDynInst->setStatusVector(lane, 0);
133 * Helper function for scalar instructions declared in op_encodings. This
134 * function takes in all of the arguments for a given memory request we are
135 * trying to initialize, then submits the request or requests depending on if
136 * the original request is aligned or unaligned.
138 template<typename T, int N>
140 initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
142 int req_size = N * sizeof(T);
143 int block_size = gpuDynInst->computeUnit()->cacheLineSize();
144 Addr vaddr = gpuDynInst->scalarAddr;
147 * the base address of the cache line where the the last byte of
148 * the request will be stored.
150 Addr split_addr = roundDown(vaddr + req_size - 1, block_size);
152 assert(split_addr <= vaddr || split_addr - vaddr < block_size);
154 * if the base cache line address of the last byte is greater
155 * than the address of the first byte then we have a misaligned
158 bool misaligned_acc = split_addr > vaddr;
160 RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
161 gpuDynInst->computeUnit()->masterId(), 0,
162 gpuDynInst->wfDynId);
164 if (misaligned_acc) {
165 RequestPtr req1, req2;
166 req->splitOnVaddr(split_addr, req1, req2);
167 gpuDynInst->numScalarReqs = 2;
168 gpuDynInst->setRequestFlags(req1);
169 gpuDynInst->setRequestFlags(req2);
170 PacketPtr pkt1 = new Packet(req1, mem_req_type);
171 PacketPtr pkt2 = new Packet(req2, mem_req_type);
172 pkt1->dataStatic(gpuDynInst->scalar_data);
173 pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
174 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for"
175 " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,
176 gpuDynInst->wfSlotId, split_addr);
177 gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
178 gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
180 gpuDynInst->numScalarReqs = 1;
181 gpuDynInst->setRequestFlags(req);
182 PacketPtr pkt = new Packet(req, mem_req_type);
183 pkt->dataStatic(gpuDynInst->scalar_data);
184 gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
188 #endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__