gallium/swr: add OpenSWR rasterizer
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / streamout_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file streamout_jit.cpp
24 *
25 * @brief Implementation of the streamout jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "streamout_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 #include "common/containers.hpp"
35 #include "llvm/IR/DataLayout.h"
36
37 #include <sstream>
38 #include <unordered_set>
39
40 //////////////////////////////////////////////////////////////////////////
41 /// Interface to Jitting a fetch shader
42 //////////////////////////////////////////////////////////////////////////
43 struct StreamOutJit : public Builder
44 {
45 StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){};
46
47 // returns pointer to SWR_STREAMOUT_BUFFER
48 Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
49 {
50 return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
51 }
52
53
54 //////////////////////////////////////////////////////////////////////////
55 // @brief checks if streamout buffer is oob
56 // @return <i1> true/false
57 Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
58 {
59 Value* returnMask = C(false);
60
61 Value* pBuf = getSOBuffer(pSoCtx, buffer);
62
63 // load enable
64 // @todo bool data types should generate <i1> llvm type
65 Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty());
66
67 // load buffer size
68 Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
69
70 // load current streamOffset
71 Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
72
73 // load buffer pitch
74 Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
75
76 // buffer is considered oob if in use in a decl but not enabled
77 returnMask = OR(returnMask, NOT(enabled));
78
79 // buffer is oob if cannot fit a prims worth of verts
80 Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
81 returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
82
83 return returnMask;
84 }
85
86
87 //////////////////////////////////////////////////////////////////////////
88 // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
89 // packing the active mask bits
90 // ex. bitmask 0011 -> (0, 1, 0, 0)
91 // bitmask 1000 -> (3, 0, 0, 0)
92 // bitmask 1100 -> (2, 3, 0, 0)
93 Value* PackMask(uint32_t bitmask)
94 {
95 std::vector<Constant*> indices(4, C(0));
96 DWORD index;
97 uint32_t elem = 0;
98 while (_BitScanForward(&index, bitmask))
99 {
100 indices[elem++] = C((int)index);
101 bitmask &= ~(1 << index);
102 }
103
104 return ConstantVector::get(indices);
105 }
106
107 //////////////////////////////////////////////////////////////////////////
108 // @brief convert scalar bitmask to <4xfloat> bitmask
109 Value* ToMask(uint32_t bitmask)
110 {
111 std::vector<Constant*> indices;
112 for (uint32_t i = 0; i < 4; ++i)
113 {
114 if (bitmask & (1 << i))
115 {
116 indices.push_back(C(-1.0f));
117 }
118 else
119 {
120 indices.push_back(C(0.0f));
121 }
122 }
123 return ConstantVector::get(indices);
124 }
125
126 //////////////////////////////////////////////////////////////////////////
127 // @brief processes a single decl from the streamout stream. Reads 4 components from the input
128 // stream and writes N components to the output buffer given the componentMask or if
129 // a hole, just increments the buffer pointer
130 // @param pStream - pointer to current attribute
131 // @param pOutBuffers - pointers to the current location of each output buffer
132 // @param decl - input decl
133 void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
134 {
135 // @todo add this to x86 macros
136 Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps);
137
138 uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
139 uint32_t packedMask = (1 << numComponents) - 1;
140 if (!decl.hole)
141 {
142 // increment stream pointer to correct slot
143 Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
144
145 // load 4 components from stream
146 Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4);
147 Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
148 pAttrib = BITCAST(pAttrib, simd4PtrTy);
149 Value *vattrib = LOAD(pAttrib);
150
151 // shuffle/pack enabled components
152 Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
153
154 // store to output buffer
155 // cast SO buffer to i8*, needed by maskstore
156 Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0));
157
158 // cast input to <4xfloat>
159 Value* src = BITCAST(vpackedAttrib, simd4Ty);
160 CALL(maskStore, {pOut, ToMask(packedMask), src});
161 }
162
163 // increment SO buffer
164 pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
165 }
166
167 //////////////////////////////////////////////////////////////////////////
168 // @brief builds a single vertex worth of data for the given stream
169 // @param streamState - state for this stream
170 // @param pCurVertex - pointer to src stream vertex data
171 // @param pOutBuffer - pointers to up to 4 SO buffers
172 void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
173 {
174 for (uint32_t d = 0; d < streamState.numDecls; ++d)
175 {
176 const STREAMOUT_DECL& decl = streamState.decl[d];
177 buildDecl(pCurVertex, pOutBuffer, decl);
178 }
179 }
180
181 void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc)
182 {
183 // get list of active SO buffers
184 std::unordered_set<uint32_t> activeSOBuffers;
185 for (uint32_t d = 0; d < streamState.numDecls; ++d)
186 {
187 const STREAMOUT_DECL& decl = streamState.decl[d];
188 activeSOBuffers.insert(decl.bufferIndex);
189 }
190
191 // always increment numPrimStorageNeeded
192 Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
193 numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1));
194 STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
195
196 // check OOB on active SO buffers. If any buffer is out of bound, don't write
197 // the primitive to any buffer
198 Value* oobMask = C(false);
199 for (uint32_t buffer : activeSOBuffers)
200 {
201 oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
202 }
203
204 BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
205
206 // early out if OOB
207 COND_BR(oobMask, returnBB, validBB);
208
209 IRB()->SetInsertPoint(validBB);
210
211 Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
212 numPrimsWritten = ADD(numPrimsWritten, C(1));
213 STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
214
215 // compute start pointer for each output buffer
216 Value* pOutBuffer[4];
217 Value* pOutBufferStartVertex[4];
218 Value* outBufferPitch[4];
219 for (uint32_t b: activeSOBuffers)
220 {
221 Value* pBuf = getSOBuffer(pSoCtx, b);
222 Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer });
223 Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
224 pOutBuffer[b] = GEP(pData, streamOffset);
225 pOutBufferStartVertex[b] = pOutBuffer[b];
226
227 outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
228 }
229
230 // loop over the vertices of the prim
231 Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData });
232 for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
233 {
234 buildVertex(streamState, pStreamData, pOutBuffer);
235
236 // increment stream and output buffer pointers
237 // stream verts are always 32*4 dwords apart
238 pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4));
239
240 // output buffers offset using pitch in buffer state
241 for (uint32_t b : activeSOBuffers)
242 {
243 pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
244 pOutBuffer[b] = pOutBufferStartVertex[b];
245 }
246 }
247
248 // update each active buffer's streamOffset
249 for (uint32_t b : activeSOBuffers)
250 {
251 Value* pBuf = getSOBuffer(pSoCtx, b);
252 Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
253 streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
254 STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
255 }
256 }
257
258 Function* Create(const STREAMOUT_COMPILE_STATE& state)
259 {
260 static std::size_t soNum = 0;
261
262 std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
263 fnName << soNum++;
264
265 // SO function signature
266 // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*)
267
268 std::vector<Type*> args{
269 PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
270 };
271
272 FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
273 Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
274
275 // create return basic block
276 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);
277 BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
278
279 IRB()->SetInsertPoint(entry);
280
281 // arguments
282 auto argitr = soFunc->getArgumentList().begin();
283 Value* pSoCtx = &*argitr++;
284 pSoCtx->setName("pSoCtx");
285
286 const STREAMOUT_STREAM& streamState = state.stream;
287 buildStream(state, streamState, pSoCtx, returnBB, soFunc);
288
289 BR(returnBB);
290
291 IRB()->SetInsertPoint(returnBB);
292 RET_VOID();
293
294 JitManager::DumpToFile(soFunc, "SoFunc");
295
296 FunctionPassManager passes(JM()->mpCurrentModule);
297 passes.add(createBreakCriticalEdgesPass());
298 passes.add(createCFGSimplificationPass());
299 passes.add(createEarlyCSEPass());
300 passes.add(createPromoteMemoryToRegisterPass());
301 passes.add(createCFGSimplificationPass());
302 passes.add(createEarlyCSEPass());
303 passes.add(createInstructionCombiningPass());
304 passes.add(createInstructionSimplifierPass());
305 passes.add(createConstantPropagationPass());
306 passes.add(createSCCPPass());
307 passes.add(createAggressiveDCEPass());
308
309 passes.run(*soFunc);
310
311 JitManager::DumpToFile(soFunc, "SoFunc_optimized");
312
313 return soFunc;
314 }
315 };
316
317 //////////////////////////////////////////////////////////////////////////
318 /// @brief JITs from streamout shader IR
319 /// @param hJitMgr - JitManager handle
320 /// @param func - LLVM function IR
321 /// @return PFN_SO_FUNC - pointer to SOS function
322 PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
323 {
324 const llvm::Function *func = (const llvm::Function*)hFunc;
325 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
326 PFN_SO_FUNC pfnStreamOut;
327 pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
328 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
329 pJitMgr->mIsModuleFinalized = true;
330
331 return pfnStreamOut;
332 }
333
334 //////////////////////////////////////////////////////////////////////////
335 /// @brief JIT compiles streamout shader
336 /// @param hJitMgr - JitManager handle
337 /// @param state - SO state to build function from
338 extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state)
339 {
340 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
341
342 STREAMOUT_COMPILE_STATE soState = state;
343 if (soState.offsetAttribs)
344 {
345 for (uint32_t i = 0; i < soState.stream.numDecls; ++i)
346 {
347 soState.stream.decl[i].attribSlot -= soState.offsetAttribs;
348 }
349 }
350
351 pJitMgr->SetupNewModule();
352
353 StreamOutJit theJit(pJitMgr);
354 HANDLE hFunc = theJit.Create(soState);
355
356 return JitStreamoutFunc(hJitMgr, hFunc);
357 }