swr: [rasterizer core/jitter] fix alpha test bug
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / blend_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file blend_jit.cpp
24 *
25 * @brief Implementation of the blend jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "blend_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34
35 #include <sstream>
36
37 // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
38 #define QUANTIZE_THRESHOLD 2
39
40 using namespace llvm;
41 using namespace SwrJit;
42
43 //////////////////////////////////////////////////////////////////////////
44 /// Interface to Jitting a blend shader
45 //////////////////////////////////////////////////////////////////////////
46 struct BlendJit : public Builder
47 {
48 BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
49
50 template<bool Color, bool Alpha>
51 void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
52 {
53 Value* out[4];
54
55 switch (factor)
56 {
57 case BLENDFACTOR_ONE:
58 out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
59 break;
60 case BLENDFACTOR_SRC_COLOR:
61 out[0] = src[0];
62 out[1] = src[1];
63 out[2] = src[2];
64 out[3] = src[3];
65 break;
66 case BLENDFACTOR_SRC_ALPHA:
67 out[0] = out[1] = out[2] = out[3] = src[3];
68 break;
69 case BLENDFACTOR_DST_ALPHA:
70 out[0] = out[1] = out[2] = out[3] = dst[3];
71 break;
72 case BLENDFACTOR_DST_COLOR:
73 out[0] = dst[0];
74 out[1] = dst[1];
75 out[2] = dst[2];
76 out[3] = dst[3];
77 break;
78 case BLENDFACTOR_SRC_ALPHA_SATURATE:
79 out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
80 out[3] = VIMMED1(1.0f);
81 break;
82 case BLENDFACTOR_CONST_COLOR:
83 out[0] = constColor[0];
84 out[1] = constColor[1];
85 out[2] = constColor[2];
86 out[3] = constColor[3];
87 break;
88 case BLENDFACTOR_CONST_ALPHA:
89 out[0] = out[1] = out[2] = out[3] = constColor[3];
90 break;
91 case BLENDFACTOR_SRC1_COLOR:
92 out[0] = src1[0];
93 out[1] = src1[1];
94 out[2] = src1[2];
95 out[3] = src1[3];
96 break;
97 case BLENDFACTOR_SRC1_ALPHA:
98 out[0] = out[1] = out[2] = out[3] = src1[3];
99 break;
100 case BLENDFACTOR_ZERO:
101 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
102 break;
103 case BLENDFACTOR_INV_SRC_COLOR:
104 out[0] = FSUB(VIMMED1(1.0f), src[0]);
105 out[1] = FSUB(VIMMED1(1.0f), src[1]);
106 out[2] = FSUB(VIMMED1(1.0f), src[2]);
107 out[3] = FSUB(VIMMED1(1.0f), src[3]);
108 break;
109 case BLENDFACTOR_INV_SRC_ALPHA:
110 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
111 break;
112 case BLENDFACTOR_INV_DST_ALPHA:
113 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
114 break;
115 case BLENDFACTOR_INV_DST_COLOR:
116 out[0] = FSUB(VIMMED1(1.0f), dst[0]);
117 out[1] = FSUB(VIMMED1(1.0f), dst[1]);
118 out[2] = FSUB(VIMMED1(1.0f), dst[2]);
119 out[3] = FSUB(VIMMED1(1.0f), dst[3]);
120 break;
121 case BLENDFACTOR_INV_CONST_COLOR:
122 out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
123 out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
124 out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
125 out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
126 break;
127 case BLENDFACTOR_INV_CONST_ALPHA:
128 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
129 break;
130 case BLENDFACTOR_INV_SRC1_COLOR:
131 out[0] = FSUB(VIMMED1(1.0f), src1[0]);
132 out[1] = FSUB(VIMMED1(1.0f), src1[1]);
133 out[2] = FSUB(VIMMED1(1.0f), src1[2]);
134 out[3] = FSUB(VIMMED1(1.0f), src1[3]);
135 break;
136 case BLENDFACTOR_INV_SRC1_ALPHA:
137 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
138 break;
139 default:
140 SWR_ASSERT(false, "Unsupported blend factor: %d", factor);
141 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
142 break;
143 }
144
145 if (Color)
146 {
147 result[0] = out[0];
148 result[1] = out[1];
149 result[2] = out[2];
150 }
151
152 if (Alpha)
153 {
154 result[3] = out[3];
155 }
156 }
157
158 void Clamp(SWR_FORMAT format, Value* src[4])
159 {
160 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
161 SWR_TYPE type = info.type[0];
162
163 switch (type)
164 {
165 case SWR_TYPE_FLOAT:
166 break;
167
168 case SWR_TYPE_UNORM:
169 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
170 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
171 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
172 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
173 break;
174
175 case SWR_TYPE_SNORM:
176 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
177 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
178 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
179 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
180 break;
181
182 default: SWR_ASSERT(false, "Unsupport format type: %d", type);
183 }
184 }
185
186 void ApplyDefaults(SWR_FORMAT format, Value* src[4])
187 {
188 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
189
190 bool valid[] = { false, false, false, false };
191 for (uint32_t c = 0; c < info.numComps; ++c)
192 {
193 valid[info.swizzle[c]] = true;
194 }
195
196 for (uint32_t c = 0; c < 4; ++c)
197 {
198 if (!valid[c])
199 {
200 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
201 }
202 }
203 }
204
205 void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
206 {
207 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
208
209 for (uint32_t c = 0; c < info.numComps; ++c)
210 {
211 if (info.type[c] == SWR_TYPE_UNUSED)
212 {
213 src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
214 }
215 }
216 }
217
218 void Quantize(SWR_FORMAT format, Value* src[4])
219 {
220 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
221 for (uint32_t c = 0; c < info.numComps; ++c)
222 {
223 if (info.bpc[c] <= QUANTIZE_THRESHOLD)
224 {
225 uint32_t swizComp = info.swizzle[c];
226 float factor = (float)((1 << info.bpc[c]) - 1);
227 switch (info.type[c])
228 {
229 case SWR_TYPE_UNORM:
230 src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
231 src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
232 src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor));
233 break;
234 default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]);
235 }
236 }
237 }
238 }
239
240 template<bool Color, bool Alpha>
241 void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
242 {
243 Value* out[4];
244 Value* srcBlend[4];
245 Value* dstBlend[4];
246 for (uint32_t i = 0; i < 4; ++i)
247 {
248 srcBlend[i] = FMUL(src[i], srcFactor[i]);
249 dstBlend[i] = FMUL(dst[i], dstFactor[i]);
250 }
251
252 switch (blendOp)
253 {
254 case BLENDOP_ADD:
255 out[0] = FADD(srcBlend[0], dstBlend[0]);
256 out[1] = FADD(srcBlend[1], dstBlend[1]);
257 out[2] = FADD(srcBlend[2], dstBlend[2]);
258 out[3] = FADD(srcBlend[3], dstBlend[3]);
259 break;
260
261 case BLENDOP_SUBTRACT:
262 out[0] = FSUB(srcBlend[0], dstBlend[0]);
263 out[1] = FSUB(srcBlend[1], dstBlend[1]);
264 out[2] = FSUB(srcBlend[2], dstBlend[2]);
265 out[3] = FSUB(srcBlend[3], dstBlend[3]);
266 break;
267
268 case BLENDOP_REVSUBTRACT:
269 out[0] = FSUB(dstBlend[0], srcBlend[0]);
270 out[1] = FSUB(dstBlend[1], srcBlend[1]);
271 out[2] = FSUB(dstBlend[2], srcBlend[2]);
272 out[3] = FSUB(dstBlend[3], srcBlend[3]);
273 break;
274
275 case BLENDOP_MIN:
276 out[0] = VMINPS(src[0], dst[0]);
277 out[1] = VMINPS(src[1], dst[1]);
278 out[2] = VMINPS(src[2], dst[2]);
279 out[3] = VMINPS(src[3], dst[3]);
280 break;
281
282 case BLENDOP_MAX:
283 out[0] = VMAXPS(src[0], dst[0]);
284 out[1] = VMAXPS(src[1], dst[1]);
285 out[2] = VMAXPS(src[2], dst[2]);
286 out[3] = VMAXPS(src[3], dst[3]);
287 break;
288
289 default:
290 SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp);
291 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
292 break;
293 }
294
295 if (Color)
296 {
297 result[0] = out[0];
298 result[1] = out[1];
299 result[2] = out[2];
300 }
301
302 if (Alpha)
303 {
304 result[3] = out[3];
305 }
306 }
307
308 void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
309 {
310 // Op: (s == PS output, d = RT contents)
311 switch(logicOp)
312 {
313 case LOGICOP_CLEAR:
314 result[0] = VIMMED1(0);
315 result[1] = VIMMED1(0);
316 result[2] = VIMMED1(0);
317 result[3] = VIMMED1(0);
318 break;
319
320 case LOGICOP_NOR:
321 // ~(s | d)
322 result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
323 result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
324 result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
325 result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
326 break;
327
328 case LOGICOP_AND_INVERTED:
329 // ~s & d
330 // todo: use avx andnot instr when I can find the intrinsic to call
331 result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
332 result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
333 result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
334 result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
335 break;
336
337 case LOGICOP_COPY_INVERTED:
338 // ~s
339 result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
340 result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
341 result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
342 result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
343 break;
344
345 case LOGICOP_AND_REVERSE:
346 // s & ~d
347 // todo: use avx andnot instr when I can find the intrinsic to call
348 result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
349 result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
350 result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
351 result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
352 break;
353
354 case LOGICOP_INVERT:
355 // ~d
356 result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
357 result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
358 result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
359 result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
360 break;
361
362 case LOGICOP_XOR:
363 // s ^ d
364 result[0] = XOR(src[0], dst[0]);
365 result[1] = XOR(src[1], dst[1]);
366 result[2] = XOR(src[2], dst[2]);
367 result[3] = XOR(src[3], dst[3]);
368 break;
369
370 case LOGICOP_NAND:
371 // ~(s & d)
372 result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
373 result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
374 result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
375 result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
376 break;
377
378 case LOGICOP_AND:
379 // s & d
380 result[0] = AND(src[0], dst[0]);
381 result[1] = AND(src[1], dst[1]);
382 result[2] = AND(src[2], dst[2]);
383 result[3] = AND(src[3], dst[3]);
384 break;
385
386 case LOGICOP_EQUIV:
387 // ~(s ^ d)
388 result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
389 result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
390 result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
391 result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
392 break;
393
394 case LOGICOP_NOOP:
395 result[0] = dst[0];
396 result[1] = dst[1];
397 result[2] = dst[2];
398 result[3] = dst[3];
399 break;
400
401 case LOGICOP_OR_INVERTED:
402 // ~s | d
403 result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
404 result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
405 result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
406 result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
407 break;
408
409 case LOGICOP_COPY:
410 result[0] = src[0];
411 result[1] = src[1];
412 result[2] = src[2];
413 result[3] = src[3];
414 break;
415
416 case LOGICOP_OR_REVERSE:
417 // s | ~d
418 result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
419 result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
420 result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
421 result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
422 break;
423
424 case LOGICOP_OR:
425 // s | d
426 result[0] = OR(src[0], dst[0]);
427 result[1] = OR(src[1], dst[1]);
428 result[2] = OR(src[2], dst[2]);
429 result[3] = OR(src[3], dst[3]);
430 break;
431
432 case LOGICOP_SET:
433 result[0] = VIMMED1(0xFFFFFFFF);
434 result[1] = VIMMED1(0xFFFFFFFF);
435 result[2] = VIMMED1(0xFFFFFFFF);
436 result[3] = VIMMED1(0xFFFFFFFF);
437 break;
438
439 default:
440 SWR_ASSERT(false, "Unsupported logic operation: %d", logicOp);
441 result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
442 break;
443 }
444 }
445
446 void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
447 {
448 // load uint32_t reference
449 Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));
450
451 // load alpha
452 Value* pAlpha = LOAD(ppAlpha);
453
454 Value* pTest = nullptr;
455 if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
456 {
457 // convert float alpha to unorm8
458 Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
459 pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
460
461 // compare
462 switch (state.alphaTestFunction)
463 {
464 case ZFUNC_ALWAYS: pTest = VIMMED1(true); break;
465 case ZFUNC_NEVER: pTest = VIMMED1(false); break;
466 case ZFUNC_LT: pTest = ICMP_ULT(pAlphaU8, pRef); break;
467 case ZFUNC_EQ: pTest = ICMP_EQ(pAlphaU8, pRef); break;
468 case ZFUNC_LE: pTest = ICMP_ULE(pAlphaU8, pRef); break;
469 case ZFUNC_GT: pTest = ICMP_UGT(pAlphaU8, pRef); break;
470 case ZFUNC_NE: pTest = ICMP_NE(pAlphaU8, pRef); break;
471 case ZFUNC_GE: pTest = ICMP_UGE(pAlphaU8, pRef); break;
472 default:
473 SWR_ASSERT(false, "Invalid alpha test function");
474 break;
475 }
476 }
477 else
478 {
479 // cast ref to float
480 pRef = BITCAST(pRef, mSimdFP32Ty);
481
482 // compare
483 switch (state.alphaTestFunction)
484 {
485 case ZFUNC_ALWAYS: pTest = VIMMED1(true); break;
486 case ZFUNC_NEVER: pTest = VIMMED1(false); break;
487 case ZFUNC_LT: pTest = FCMP_OLT(pAlpha, pRef); break;
488 case ZFUNC_EQ: pTest = FCMP_OEQ(pAlpha, pRef); break;
489 case ZFUNC_LE: pTest = FCMP_OLE(pAlpha, pRef); break;
490 case ZFUNC_GT: pTest = FCMP_OGT(pAlpha, pRef); break;
491 case ZFUNC_NE: pTest = FCMP_ONE(pAlpha, pRef); break;
492 case ZFUNC_GE: pTest = FCMP_OGE(pAlpha, pRef); break;
493 default:
494 SWR_ASSERT(false, "Invalid alpha test function");
495 break;
496 }
497 }
498
499 // load current mask
500 Value* pMask = LOAD(ppMask);
501
502 // convert to int1 mask
503 pMask = MASK(pMask);
504
505 // and with alpha test result
506 pMask = AND(pMask, pTest);
507
508 // convert back to vector mask
509 pMask = VMASK(pMask);
510
511 // store new mask
512 STORE(pMask, ppMask);
513 }
514
515 Function* Create(const BLEND_COMPILE_STATE& state)
516 {
517 static std::size_t jitNum = 0;
518
519 std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
520 fnName << jitNum++;
521
522 // blend function signature
523 //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
524
525 std::vector<Type*> args{
526 PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
527 PointerType::get(mSimdFP32Ty, 0), // simdvector& src
528 PointerType::get(mSimdFP32Ty, 0), // simdvector& src1
529 PointerType::get(mSimdFP32Ty, 0), // src0alpha
530 Type::getInt32Ty(JM()->mContext), // sampleNum
531 PointerType::get(mSimdFP32Ty, 0), // uint8_t* pDst
532 PointerType::get(mSimdFP32Ty, 0), // simdvector& result
533 PointerType::get(mSimdInt32Ty, 0), // simdscalari* oMask
534 PointerType::get(mSimdInt32Ty, 0), // simdscalari* pMask
535 };
536
537 FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
538 Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
539
540 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
541
542 IRB()->SetInsertPoint(entry);
543
544 // arguments
545 auto argitr = blendFunc->getArgumentList().begin();
546 Value* pBlendState = &*argitr++;
547 pBlendState->setName("pBlendState");
548 Value* pSrc = &*argitr++;
549 pSrc->setName("src");
550 Value* pSrc1 = &*argitr++;
551 pSrc1->setName("src1");
552 Value* pSrc0Alpha = &*argitr++;
553 pSrc0Alpha->setName("src0alpha");
554 Value* sampleNum = &*argitr++;
555 sampleNum->setName("sampleNum");
556 Value* pDst = &*argitr++;
557 pDst->setName("pDst");
558 Value* pResult = &*argitr++;
559 pResult->setName("result");
560 Value* ppoMask = &*argitr++;
561 ppoMask->setName("ppoMask");
562 Value* ppMask = &*argitr++;
563 ppMask->setName("pMask");
564
565 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
566 Value* dst[4];
567 Value* constantColor[4];
568 Value* src[4];
569 Value* src1[4];
570 Value* result[4];
571 for (uint32_t i = 0; i < 4; ++i)
572 {
573 // load hot tile
574 dst[i] = LOAD(pDst, { i });
575
576 // load constant color
577 constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
578
579 // load src
580 src[i] = LOAD(pSrc, { i });
581
582 // load src1
583 src1[i] = LOAD(pSrc1, { i });
584 }
585 Value* currentMask = VIMMED1(-1);
586 if (state.desc.alphaToCoverageEnable)
587 {
588 Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
589 uint32_t bits = (1 << state.desc.numSamples) - 1;
590 currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
591 currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty);
592 }
593
594 // alpha test
595 if (state.desc.alphaTestEnable)
596 {
597 AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
598 }
599
600 // color blend
601 if (state.blendState.blendEnable)
602 {
603 // clamp sources
604 Clamp(state.format, src);
605 Clamp(state.format, src1);
606 Clamp(state.format, dst);
607 Clamp(state.format, constantColor);
608
609 // apply defaults to hottile contents to take into account missing components
610 ApplyDefaults(state.format, dst);
611
612 // Force defaults for unused 'X' components
613 ApplyUnusedDefaults(state.format, dst);
614
615 // Quantize low precision components
616 Quantize(state.format, dst);
617
618 // special case clamping for R11G11B10_float which has no sign bit
619 if (state.format == R11G11B10_FLOAT)
620 {
621 dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
622 dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
623 dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
624 dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
625 }
626
627 Value* srcFactor[4];
628 Value* dstFactor[4];
629 if (state.desc.independentAlphaBlendEnable)
630 {
631 GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
632 GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
633
634 GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
635 GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
636
637 BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
638 BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
639 }
640 else
641 {
642 GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
643 GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
644
645 BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
646 }
647
648 // store results out
649 for (uint32_t i = 0; i < 4; ++i)
650 {
651 STORE(result[i], pResult, { i });
652 }
653 }
654
655 if(state.blendState.logicOpEnable)
656 {
657 const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
658 Value* vMask[4];
659 float scale[4];
660
661 if (!state.blendState.blendEnable)
662 {
663 Clamp(state.format, src);
664 Clamp(state.format, dst);
665 }
666
667 for(uint32_t i = 0; i < 4; i++)
668 {
669 if (info.type[i] == SWR_TYPE_UNUSED)
670 {
671 continue;
672 }
673
674 if (info.bpc[i] >= 32) {
675 vMask[i] = VIMMED1(0xFFFFFFFF);
676 scale[i] = 0xFFFFFFFF;
677 } else {
678 vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
679 if (info.type[i] == SWR_TYPE_SNORM)
680 scale[i] = (1 << (info.bpc[i] - 1)) - 1;
681 else
682 scale[i] = (1 << info.bpc[i]) - 1;
683 }
684
685 switch (info.type[i]) {
686 default:
687 SWR_ASSERT(0, "Unsupported type for logic op\n");
688 /* fallthrough */
689 case SWR_TYPE_UINT:
690 case SWR_TYPE_SINT:
691 src[i] = BITCAST(src[i], mSimdInt32Ty);
692 dst[i] = BITCAST(dst[i], mSimdInt32Ty);
693 break;
694 case SWR_TYPE_SNORM:
695 src[i] = FADD(src[i], VIMMED1(0.5f));
696 dst[i] = FADD(dst[i], VIMMED1(0.5f));
697 /* fallthrough */
698 case SWR_TYPE_UNORM:
699 src[i] = FP_TO_UI(
700 FMUL(src[i], VIMMED1(scale[i])),
701 mSimdInt32Ty);
702 dst[i] = FP_TO_UI(
703 FMUL(dst[i], VIMMED1(scale[i])),
704 mSimdInt32Ty);
705 break;
706 }
707 }
708
709 LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
710
711 // store results out
712 for(uint32_t i = 0; i < 4; ++i)
713 {
714 if (info.type[i] == SWR_TYPE_UNUSED)
715 {
716 continue;
717 }
718
719 // clear upper bits from PS output not in RT format after doing logic op
720 result[i] = AND(result[i], vMask[i]);
721
722 switch (info.type[i]) {
723 default:
724 SWR_ASSERT(0, "Unsupported type for logic op\n");
725 /* fallthrough */
726 case SWR_TYPE_UINT:
727 case SWR_TYPE_SINT:
728 result[i] = BITCAST(result[i], mSimdFP32Ty);
729 break;
730 case SWR_TYPE_SNORM:
731 case SWR_TYPE_UNORM:
732 result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty),
733 VIMMED1(1.0f / scale[i]));
734 if (info.type[i] == SWR_TYPE_SNORM)
735 result[i] = FADD(result[i], VIMMED1(-0.5f));
736 break;
737 }
738
739 STORE(result[i], pResult, {i});
740 }
741 }
742
743 if(state.desc.oMaskEnable)
744 {
745 assert(!(state.desc.alphaToCoverageEnable));
746 // load current mask
747 Value* oMask = LOAD(ppoMask);
748 Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
749 oMask = AND(oMask, sampleMasked);
750 currentMask = AND(oMask, currentMask);
751 }
752
753 if(state.desc.sampleMaskEnable)
754 {
755 Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
756 Value* sampleMasked = SHL(C(1), sampleNum);
757 sampleMask = AND(sampleMask, sampleMasked);
758 sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
759 sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
760 currentMask = AND(sampleMask, currentMask);
761 }
762
763 if (state.desc.alphaToCoverageEnable)
764 {
765 Value* sampleMasked = SHL(C(1), sampleNum);
766 currentMask = AND(currentMask, VBROADCAST(sampleMasked));
767 }
768
769 if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
770 state.desc.oMaskEnable)
771 {
772 // load current mask
773 Value* pMask = LOAD(ppMask);
774 currentMask = S_EXT(ICMP_SGT(currentMask, VBROADCAST(C(0))), mSimdInt32Ty);
775 Value* outputMask = AND(pMask, currentMask);
776 // store new mask
777 STORE(outputMask, GEP(ppMask, C(0)));
778 }
779
780 RET_VOID();
781
782 JitManager::DumpToFile(blendFunc, "");
783
784 ::FunctionPassManager passes(JM()->mpCurrentModule);
785
786 passes.add(createBreakCriticalEdgesPass());
787 passes.add(createCFGSimplificationPass());
788 passes.add(createEarlyCSEPass());
789 passes.add(createPromoteMemoryToRegisterPass());
790 passes.add(createCFGSimplificationPass());
791 passes.add(createEarlyCSEPass());
792 passes.add(createInstructionCombiningPass());
793 passes.add(createInstructionSimplifierPass());
794 passes.add(createConstantPropagationPass());
795 passes.add(createSCCPPass());
796 passes.add(createAggressiveDCEPass());
797
798 passes.run(*blendFunc);
799
800 JitManager::DumpToFile(blendFunc, "optimized");
801
802 return blendFunc;
803 }
804 };
805
806 //////////////////////////////////////////////////////////////////////////
807 /// @brief JITs from fetch shader IR
808 /// @param hJitMgr - JitManager handle
809 /// @param func - LLVM function IR
810 /// @return PFN_FETCH_FUNC - pointer to fetch code
811 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
812 {
813 const llvm::Function *func = (const llvm::Function*)hFunc;
814 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
815 PFN_BLEND_JIT_FUNC pfnBlend;
816 pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
817 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
818 pJitMgr->mIsModuleFinalized = true;
819
820 return pfnBlend;
821 }
822
823 //////////////////////////////////////////////////////////////////////////
824 /// @brief JIT compiles blend shader
825 /// @param hJitMgr - JitManager handle
826 /// @param state - blend state to build function from
827 extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
828 {
829 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
830
831 pJitMgr->SetupNewModule();
832
833 BlendJit theJit(pJitMgr);
834 HANDLE hFunc = theJit.Create(state);
835
836 return JitBlendFunc(hJitMgr, hFunc);
837 }