i965: Make emit_minmax return an instruction*.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs_builder.h
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"
#include "brw_context.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder. They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
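    *
    * Illustrative usage sketch (the shader pointer, register names and
    * immediate values below are assumptions made for this example, not part
    * of the interface):
    *
    *    const fs_builder bld = fs_builder(shader, 16).at_end();
    *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
    *    bld.ADD(tmp, some_input, brw_imm_f(1.0f));
    *    bld.emit_minmax(result, src_reg(tmp), brw_imm_f(8.0f),
    *                    BRW_CONDITIONAL_L);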
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block. The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->force_sechalf ? 8 : 0),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
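       *
       * For example (a sketch assuming a SIMD16 builder named "bld"),
       * bld.group(8, 1) yields a builder whose instructions execute only on
       * the second group of eight channels of the SIMD16 program.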
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
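       *
       * E.g. vgrf(BRW_REGISTER_TYPE_F, 2) allocates enough VGRF space for
       * two dispatch_width-wide floating point values.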
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            return brw_imm_d(0xffff);
         } else if (((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill) {
            return brw_flag_reg(0, 1);
         } else {
            return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
         }
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return fix_math_instruction(
               emit(instruction(opcode, dispatch_width(), dst,
                                fix_math_operand(src0))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return fix_math_instruction(
               emit(instruction(opcode, dispatch_width(), dst,
                                fix_math_operand(src0),
                                fix_math_operand(src1))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);
         assert(_group == 0 || _group == 8);

         inst->force_sechalf = (_group == 8);
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
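       *
       * E.g. (sketch) emit_minmax(dst, a, b, BRW_CONDITIONAL_GE) writes the
       * per-channel maximum of a and b to dst, while BRW_CONDITIONAL_L gives
       * the minimum.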
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
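       *
       * Typical use (a sketch; "surface" stands for an assumed, possibly
       * non-uniform index register):
       *
       *    const src_reg usurface = bld.emit_uniformize(surface);
       *
       * The returned value is uniform across all enabled channels, e.g. for
       * use as a surface or sampler index of a send message.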
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                                \
      instruction *                                             \
      op(const dst_reg &dst, const src_reg &src0) const         \
      {                                                         \
         return emit(BRW_OPCODE_##op, dst, src0);               \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
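       *
       * E.g. (sketch) CMP(null_reg_f(), x, y, BRW_CONDITIONAL_GE) discards
       * the per-channel destination result but still updates the flag
       * register with the outcome of x >= y.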
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Emit a linear interpolation instruction.
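       *
       * Computes dst = x * (1 - a) + y * a (i.e. GLSL mix()), either with the
       * hardware LRP instruction or, before gen6, with the fallback sequence
       * emitted below.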
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
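       *
       * E.g. (sketch; "payload" and "srcs" are assumed registers)
       * LOAD_PAYLOAD(payload, srcs, 4, 1) copies srcs[0] as a single header
       * register followed by three dispatch_width-wide sources into the
       * contiguous range starting at payload.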
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->regs_written = header_size +
                              (sources - header_size) * (dispatch_width() / 8);

         return inst;
      }

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      /**
       * Workaround other weirdness of the math instruction.
       */
      instruction *
      fix_math_instruction(instruction *inst) const
      {
         if (shader->devinfo->gen < 6) {
            inst->base_mrf = 2;
            inst->mlen = inst->sources * dispatch_width() / 8;

            if (inst->sources > 1) {
               /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
                * "Message Payload":
                *
                * "Operand0[7]. For the INT DIV functions, this operand is the
                *  denominator."
                * ...
                * "Operand1[7]. For the INT DIV functions, this operand is the
                *  numerator."
                */
               const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
               const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
               const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];

               inst->resize_sources(1);
               inst->src[0] = src0;

               at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type),
                                   src1);
            }
         }

         return inst;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif