vk: Add four unit tests for our lock-free data-structures
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_builder.h
1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef BRW_FS_BUILDER_H
26 #define BRW_FS_BUILDER_H
27
28 #include "brw_ir_fs.h"
29 #include "brw_shader.h"
30 #include "brw_context.h"
31
32 namespace brw {
33 /**
34 * Toolbox to assemble an FS IR program out of individual instructions.
35 *
36 * This object is meant to have an interface consistent with
37 * brw::vec4_builder. They cannot be fully interchangeable because
38 * brw::fs_builder generates scalar code while brw::vec4_builder generates
39 * vector code.
40 */
41 class fs_builder {
42 public:
43 /** Type used in this IR to represent a source of an instruction. */
44 typedef fs_reg src_reg;
45
46 /** Type used in this IR to represent the destination of an instruction. */
47 typedef fs_reg dst_reg;
48
49 /** Type used in this IR to represent an instruction. */
50 typedef fs_inst instruction;
51
52 /**
53 * Construct an fs_builder that inserts instructions into \p shader.
54 * \p dispatch_width gives the native execution width of the program.
55 */
56 fs_builder(backend_shader *shader,
57 unsigned dispatch_width) :
58 shader(shader), block(NULL), cursor(NULL),
59 _dispatch_width(dispatch_width),
60 _group(0),
61 force_writemask_all(false),
62 annotation()
63 {
64 }
65
66 /**
67 * Construct an fs_builder that inserts instructions before \p cursor in
68 * basic block \p block, inheriting other code generation parameters
69 * from this.
70 */
71 fs_builder
72 at(bblock_t *block, exec_node *cursor) const
73 {
74 fs_builder bld = *this;
75 bld.block = block;
76 bld.cursor = cursor;
77 return bld;
78 }
79
80 /**
81 * Construct an fs_builder appending instructions at the end of the
82 * instruction list of the shader, inheriting other code generation
83 * parameters from this.
84 */
85 fs_builder
86 at_end() const
87 {
88 return at(NULL, (exec_node *)&shader->instructions.tail);
89 }
90
91 /**
92 * Construct a builder specifying the default SIMD width and group of
93 * channel enable signals, inheriting other code generation parameters
94 * from this.
95 *
96 * \p n gives the default SIMD width, \p i gives the slot group used for
97 * predication and control flow masking in multiples of \p n channels.
98 */
99 fs_builder
100 group(unsigned n, unsigned i) const
101 {
102 assert(n <= dispatch_width() &&
103 i < dispatch_width() / n);
104 fs_builder bld = *this;
105 bld._dispatch_width = n;
106 bld._group += i * n;
107 return bld;
108 }
109
110 /**
111 * Alias for group() with width equal to eight.
112 */
113 fs_builder
114 half(unsigned i) const
115 {
116 return group(8, i);
117 }
118
119 /**
120 * Construct a builder with per-channel control flow execution masking
121 * disabled if \p b is true. If control flow execution masking is
122 * already disabled this has no effect.
123 */
124 fs_builder
125 exec_all(bool b = true) const
126 {
127 fs_builder bld = *this;
128 if (b)
129 bld.force_writemask_all = true;
130 return bld;
131 }
132
133 /**
134 * Construct a builder with the given debug annotation info.
135 */
136 fs_builder
137 annotate(const char *str, const void *ir = NULL) const
138 {
139 fs_builder bld = *this;
140 bld.annotation.str = str;
141 bld.annotation.ir = ir;
142 return bld;
143 }
144
145 /**
146 * Get the SIMD width in use.
147 */
148 unsigned
149 dispatch_width() const
150 {
151 return _dispatch_width;
152 }
153
154 /**
155 * Allocate a virtual register of natural vector size (one for this IR)
156 * and SIMD width. \p n gives the amount of space to allocate in
157 * dispatch_width units (which is just enough space for one logical
158 * component in this IR).
159 */
160 dst_reg
161 vgrf(enum brw_reg_type type, unsigned n = 1) const
162 {
163 return dst_reg(GRF, shader->alloc.allocate(
164 DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
165 REG_SIZE)),
166 type, dispatch_width());
167 }
168
169 /**
170 * Create a null register of floating type.
171 */
172 dst_reg
173 null_reg_f() const
174 {
175 return dst_reg(retype(brw_null_vec(dispatch_width()),
176 BRW_REGISTER_TYPE_F));
177 }
178
179 /**
180 * Create a null register of signed integer type.
181 */
182 dst_reg
183 null_reg_d() const
184 {
185 return dst_reg(retype(brw_null_vec(dispatch_width()),
186 BRW_REGISTER_TYPE_D));
187 }
188
189 /**
190 * Create a null register of unsigned integer type.
191 */
192 dst_reg
193 null_reg_ud() const
194 {
195 return dst_reg(retype(brw_null_vec(dispatch_width()),
196 BRW_REGISTER_TYPE_UD));
197 }
198
199 /**
200 * Get the mask of SIMD channels enabled by dispatch and not yet
201 * disabled by discard.
202 */
203 src_reg
204 sample_mask_reg() const
205 {
206 const bool uses_kill =
207 (shader->stage == MESA_SHADER_FRAGMENT &&
208 ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill);
209 return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) :
210 uses_kill ? brw_flag_reg(0, 1) :
211 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
212 }
213
214 /**
215 * Insert an instruction into the program.
216 */
217 instruction *
218 emit(const instruction &inst) const
219 {
220 return emit(new(shader->mem_ctx) instruction(inst));
221 }
222
223 /**
224 * Create and insert a nullary control instruction into the program.
225 */
226 instruction *
227 emit(enum opcode opcode) const
228 {
229 return emit(instruction(opcode, dispatch_width()));
230 }
231
232 /**
233 * Create and insert a nullary instruction into the program.
234 */
235 instruction *
236 emit(enum opcode opcode, const dst_reg &dst) const
237 {
238 return emit(instruction(opcode, dst));
239 }
240
241 /**
242 * Create and insert a unary instruction into the program.
243 */
244 instruction *
245 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
246 {
247 switch (opcode) {
248 case SHADER_OPCODE_RCP:
249 case SHADER_OPCODE_RSQ:
250 case SHADER_OPCODE_SQRT:
251 case SHADER_OPCODE_EXP2:
252 case SHADER_OPCODE_LOG2:
253 case SHADER_OPCODE_SIN:
254 case SHADER_OPCODE_COS:
255 return fix_math_instruction(
256 emit(instruction(opcode, dst.width, dst,
257 fix_math_operand(src0))));
258
259 default:
260 return emit(instruction(opcode, dst.width, dst, src0));
261 }
262 }
263
264 /**
265 * Create and insert a binary instruction into the program.
266 */
267 instruction *
268 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
269 const src_reg &src1) const
270 {
271 switch (opcode) {
272 case SHADER_OPCODE_POW:
273 case SHADER_OPCODE_INT_QUOTIENT:
274 case SHADER_OPCODE_INT_REMAINDER:
275 return fix_math_instruction(
276 emit(instruction(opcode, dst.width, dst,
277 fix_math_operand(src0),
278 fix_math_operand(src1))));
279
280 default:
281 return emit(instruction(opcode, dst.width, dst, src0, src1));
282
283 }
284 }
285
286 /**
287 * Create and insert a ternary instruction into the program.
288 */
289 instruction *
290 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
291 const src_reg &src1, const src_reg &src2) const
292 {
293 switch (opcode) {
294 case BRW_OPCODE_BFE:
295 case BRW_OPCODE_BFI2:
296 case BRW_OPCODE_MAD:
297 case BRW_OPCODE_LRP:
298 return emit(instruction(opcode, dst.width, dst,
299 fix_3src_operand(src0),
300 fix_3src_operand(src1),
301 fix_3src_operand(src2)));
302
303 default:
304 return emit(instruction(opcode, dst.width, dst, src0, src1, src2));
305 }
306 }
307
308 /**
309 * Insert a preallocated instruction into the program.
310 */
311 instruction *
312 emit(instruction *inst) const
313 {
314 assert(inst->exec_size == dispatch_width() ||
315 force_writemask_all);
316 assert(_group == 0 || _group == 8);
317
318 inst->force_sechalf = (_group == 8);
319 inst->force_writemask_all = force_writemask_all;
320 inst->annotation = annotation.str;
321 inst->ir = annotation.ir;
322
323 if (block)
324 static_cast<instruction *>(cursor)->insert_before(block, inst);
325 else
326 cursor->insert_before(inst);
327
328 return inst;
329 }
330
331 /**
332 * Select \p src0 if the comparison of both sources with the given
333 * conditional mod evaluates to true, otherwise select \p src1.
334 *
335 * Generally useful to get the minimum or maximum of two values.
336 */
337 void
338 emit_minmax(const dst_reg &dst, const src_reg &src0,
339 const src_reg &src1, brw_conditional_mod mod) const
340 {
341 if (shader->devinfo->gen >= 6) {
342 set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
343 fix_unsigned_negate(src1)));
344 } else {
345 CMP(null_reg_d(), src0, src1, mod);
346 set_predicate(BRW_PREDICATE_NORMAL,
347 SEL(dst, src0, src1));
348 }
349 }
350
351 /**
352 * Copy any live channel from \p src to the first channel of \p dst.
353 */
354 void
355 emit_uniformize(const dst_reg &dst, const src_reg &src) const
356 {
357 const fs_builder ubld = exec_all();
358 const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
359
360 ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0));
361 ubld.emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
362 src, component(chan_index, 0));
363 }
364
365 /**
366 * Assorted arithmetic ops.
367 * @{
368 */
369 #define ALU1(op) \
370 instruction * \
371 op(const dst_reg &dst, const src_reg &src0) const \
372 { \
373 return emit(BRW_OPCODE_##op, dst, src0); \
374 }
375
376 #define ALU2(op) \
377 instruction * \
378 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
379 { \
380 return emit(BRW_OPCODE_##op, dst, src0, src1); \
381 }
382
383 #define ALU2_ACC(op) \
384 instruction * \
385 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
386 { \
387 instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
388 inst->writes_accumulator = true; \
389 return inst; \
390 }
391
392 #define ALU3(op) \
393 instruction * \
394 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
395 const src_reg &src2) const \
396 { \
397 return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
398 }
399
400 ALU2(ADD)
401 ALU2_ACC(ADDC)
402 ALU2(AND)
403 ALU2(ASR)
404 ALU2(AVG)
405 ALU3(BFE)
406 ALU2(BFI1)
407 ALU3(BFI2)
408 ALU1(BFREV)
409 ALU1(CBIT)
410 ALU2(CMPN)
411 ALU3(CSEL)
412 ALU2(DP2)
413 ALU2(DP3)
414 ALU2(DP4)
415 ALU2(DPH)
416 ALU1(F16TO32)
417 ALU1(F32TO16)
418 ALU1(FBH)
419 ALU1(FBL)
420 ALU1(FRC)
421 ALU2(LINE)
422 ALU1(LZD)
423 ALU2(MAC)
424 ALU2_ACC(MACH)
425 ALU3(MAD)
426 ALU1(MOV)
427 ALU2(MUL)
428 ALU1(NOT)
429 ALU2(OR)
430 ALU2(PLN)
431 ALU1(RNDD)
432 ALU1(RNDE)
433 ALU1(RNDU)
434 ALU1(RNDZ)
435 ALU2(SAD2)
436 ALU2_ACC(SADA2)
437 ALU2(SEL)
438 ALU2(SHL)
439 ALU2(SHR)
440 ALU2_ACC(SUBB)
441 ALU2(XOR)
442
443 #undef ALU3
444 #undef ALU2_ACC
445 #undef ALU2
446 #undef ALU1
447 /** @} */
448
449 /**
450 * CMP: Sets the low bit of the destination channels with the result
451 * of the comparison, while the upper bits are undefined, and updates
452 * the flag register with the packed 16 bits of the result.
453 */
454 instruction *
455 CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
456 brw_conditional_mod condition) const
457 {
458 /* Take the instruction:
459 *
460 * CMP null<d> src0<f> src1<f>
461 *
462 * Original gen4 does type conversion to the destination type
463 * before comparison, producing garbage results for floating
464 * point comparisons.
465 *
466 * The destination type doesn't matter on newer generations,
467 * so we set the type to match src0 so we can compact the
468 * instruction.
469 */
470 return set_condmod(condition,
471 emit(BRW_OPCODE_CMP, retype(dst, src0.type),
472 fix_unsigned_negate(src0),
473 fix_unsigned_negate(src1)));
474 }
475
476 /**
477 * Gen4 predicated IF.
478 */
479 instruction *
480 IF(brw_predicate predicate) const
481 {
482 return set_predicate(predicate, emit(BRW_OPCODE_IF));
483 }
484
485 /**
486 * Emit a linear interpolation instruction.
487 */
488 instruction *
489 LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
490 const src_reg &a) const
491 {
492 if (shader->devinfo->gen >= 6) {
493 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
494 * we need to reorder the operands.
495 */
496 return emit(BRW_OPCODE_LRP, dst, a, y, x);
497
498 } else {
499 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
500 const dst_reg y_times_a = vgrf(dst.type);
501 const dst_reg one_minus_a = vgrf(dst.type);
502 const dst_reg x_times_one_minus_a = vgrf(dst.type);
503
504 MUL(y_times_a, y, a);
505 ADD(one_minus_a, negate(a), src_reg(1.0f));
506 MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
507 return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
508 }
509 }
510
511 /**
512 * Collect a number of registers in a contiguous range of registers.
513 */
514 instruction *
515 LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
516 unsigned sources, unsigned header_size) const
517 {
518 assert(dst.width % 8 == 0);
519 instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD,
520 dst.width, dst, src, sources));
521 inst->header_size = header_size;
522
523 for (unsigned i = 0; i < header_size; i++)
524 assert(src[i].file != GRF ||
525 src[i].width * type_sz(src[i].type) == 32);
526 inst->regs_written = header_size;
527
528 for (unsigned i = header_size; i < sources; ++i)
529 assert(src[i].file != GRF ||
530 src[i].width == dst.width);
531 inst->regs_written += (sources - header_size) * (dst.width / 8);
532
533 return inst;
534 }
535
536 backend_shader *shader;
537
538 private:
539 /**
540 * Workaround for negation of UD registers. See comment in
541 * fs_generator::generate_code() for more details.
542 */
543 src_reg
544 fix_unsigned_negate(const src_reg &src) const
545 {
546 if (src.type == BRW_REGISTER_TYPE_UD &&
547 src.negate) {
548 dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
549 MOV(temp, src);
550 return src_reg(temp);
551 } else {
552 return src;
553 }
554 }
555
556 /**
557 * Workaround for source register modes not supported by the ternary
558 * instruction encoding.
559 */
560 src_reg
561 fix_3src_operand(const src_reg &src) const
562 {
563 if (src.file == GRF || src.file == UNIFORM || src.stride > 1) {
564 return src;
565 } else {
566 dst_reg expanded = vgrf(src.type);
567 MOV(expanded, src);
568 return expanded;
569 }
570 }
571
572 /**
573 * Workaround for source register modes not supported by the math
574 * instruction.
575 */
576 src_reg
577 fix_math_operand(const src_reg &src) const
578 {
579 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
580 * might be able to do better by doing execsize = 1 math and then
581 * expanding that result out, but we would need to be careful with
582 * masking.
583 *
584 * Gen6 hardware ignores source modifiers (negate and abs) on math
585 * instructions, so we also move to a temp to set those up.
586 *
587 * Gen7 relaxes most of the above restrictions, but still can't use IMM
588 * operands to math
589 */
590 if ((shader->devinfo->gen == 6 &&
591 (src.file == IMM || src.file == UNIFORM ||
592 src.abs || src.negate)) ||
593 (shader->devinfo->gen == 7 && src.file == IMM)) {
594 const dst_reg tmp = vgrf(src.type);
595 MOV(tmp, src);
596 return tmp;
597 } else {
598 return src;
599 }
600 }
601
602 /**
603 * Workaround other weirdness of the math instruction.
604 */
605 instruction *
606 fix_math_instruction(instruction *inst) const
607 {
608 if (shader->devinfo->gen < 6) {
609 inst->base_mrf = 2;
610 inst->mlen = inst->sources * dispatch_width() / 8;
611
612 if (inst->sources > 1) {
613 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
614 * "Message Payload":
615 *
616 * "Operand0[7]. For the INT DIV functions, this operand is the
617 * denominator."
618 * ...
619 * "Operand1[7]. For the INT DIV functions, this operand is the
620 * numerator."
621 */
622 const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
623 const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
624 const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
625
626 inst->resize_sources(1);
627 inst->src[0] = src0;
628
629 at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type,
630 dispatch_width()), src1);
631 }
632 }
633
634 return inst;
635 }
636
637 bblock_t *block;
638 exec_node *cursor;
639
640 unsigned _dispatch_width;
641 unsigned _group;
642 bool force_writemask_all;
643
644 /** Debug annotation info. */
645 struct {
646 const char *str;
647 const void *ir;
648 } annotation;
649 };
650 }
651
652 #endif