i965/fs: Switch opt_cse() to the fs_builder constructor from instruction.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_builder.h
1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef BRW_FS_BUILDER_H
26 #define BRW_FS_BUILDER_H
27
28 #include "brw_ir_fs.h"
29 #include "brw_shader.h"
30 #include "brw_context.h"
31
32 namespace brw {
33 /**
34 * Toolbox to assemble an FS IR program out of individual instructions.
35 *
36 * This object is meant to have an interface consistent with
37 * brw::vec4_builder. They cannot be fully interchangeable because
38 * brw::fs_builder generates scalar code while brw::vec4_builder generates
39 * vector code.
40 */
41 class fs_builder {
42 public:
43 /** Type used in this IR to represent a source of an instruction. */
44 typedef fs_reg src_reg;
45
46 /** Type used in this IR to represent the destination of an instruction. */
47 typedef fs_reg dst_reg;
48
49 /** Type used in this IR to represent an instruction. */
50 typedef fs_inst instruction;
51
52 /**
53 * Construct an fs_builder that inserts instructions into \p shader.
54 * \p dispatch_width gives the native execution width of the program.
55 */
56 fs_builder(backend_shader *shader,
57 unsigned dispatch_width) :
58 shader(shader), block(NULL), cursor(NULL),
59 _dispatch_width(dispatch_width),
60 _group(0),
61 force_writemask_all(false),
62 annotation()
63 {
64 }
65
66 /**
67 * Construct an fs_builder that inserts instructions into \p shader
68 * before instruction \p inst in basic block \p block. The default
69 * execution controls and debug annotation are initialized from the
70 * instruction passed as argument.
71 */
72 fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
73 shader(shader), block(block), cursor(inst),
74 _dispatch_width(inst->exec_size),
75 _group(inst->force_sechalf ? 8 : 0),
76 force_writemask_all(inst->force_writemask_all)
77 {
78 annotation.str = inst->annotation;
79 annotation.ir = inst->ir;
80 }
81
82 /**
83 * Construct an fs_builder that inserts instructions before \p cursor in
84 * basic block \p block, inheriting other code generation parameters
85 * from this.
86 */
87 fs_builder
88 at(bblock_t *block, exec_node *cursor) const
89 {
90 fs_builder bld = *this;
91 bld.block = block;
92 bld.cursor = cursor;
93 return bld;
94 }
95
96 /**
97 * Construct an fs_builder appending instructions at the end of the
98 * instruction list of the shader, inheriting other code generation
99 * parameters from this.
100 */
101 fs_builder
102 at_end() const
103 {
104 return at(NULL, (exec_node *)&shader->instructions.tail);
105 }
106
107 /**
108 * Construct a builder specifying the default SIMD width and group of
109 * channel enable signals, inheriting other code generation parameters
110 * from this.
111 *
112 * \p n gives the default SIMD width, \p i gives the slot group used for
113 * predication and control flow masking in multiples of \p n channels.
114 */
115 fs_builder
116 group(unsigned n, unsigned i) const
117 {
118 assert(force_writemask_all ||
119 (n <= dispatch_width() && i < dispatch_width() / n));
120 fs_builder bld = *this;
121 bld._dispatch_width = n;
122 bld._group += i * n;
123 return bld;
124 }
125
126 /**
127 * Alias for group() with width equal to eight.
128 */
129 fs_builder
130 half(unsigned i) const
131 {
132 return group(8, i);
133 }
134
135 /**
136 * Construct a builder with per-channel control flow execution masking
137 * disabled if \p b is true. If control flow execution masking is
138 * already disabled this has no effect.
139 */
140 fs_builder
141 exec_all(bool b = true) const
142 {
143 fs_builder bld = *this;
144 if (b)
145 bld.force_writemask_all = true;
146 return bld;
147 }
148
149 /**
150 * Construct a builder with the given debug annotation info.
151 */
152 fs_builder
153 annotate(const char *str, const void *ir = NULL) const
154 {
155 fs_builder bld = *this;
156 bld.annotation.str = str;
157 bld.annotation.ir = ir;
158 return bld;
159 }
160
161 /**
162 * Get the SIMD width in use.
163 */
164 unsigned
165 dispatch_width() const
166 {
167 return _dispatch_width;
168 }
169
170 /**
171 * Allocate a virtual register of natural vector size (one for this IR)
172 * and SIMD width. \p n gives the amount of space to allocate in
173 * dispatch_width units (which is just enough space for one logical
174 * component in this IR).
175 */
176 dst_reg
177 vgrf(enum brw_reg_type type, unsigned n = 1) const
178 {
179 if (n > 0)
180 return dst_reg(GRF, shader->alloc.allocate(
181 DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
182 REG_SIZE)),
183 type);
184 else
185 return retype(null_reg_ud(), type);
186 }
187
188 /**
189 * Create a null register of floating type.
190 */
191 dst_reg
192 null_reg_f() const
193 {
194 return dst_reg(retype(brw_null_vec(dispatch_width()),
195 BRW_REGISTER_TYPE_F));
196 }
197
198 /**
199 * Create a null register of signed integer type.
200 */
201 dst_reg
202 null_reg_d() const
203 {
204 return dst_reg(retype(brw_null_vec(dispatch_width()),
205 BRW_REGISTER_TYPE_D));
206 }
207
208 /**
209 * Create a null register of unsigned integer type.
210 */
211 dst_reg
212 null_reg_ud() const
213 {
214 return dst_reg(retype(brw_null_vec(dispatch_width()),
215 BRW_REGISTER_TYPE_UD));
216 }
217
218 /**
219 * Get the mask of SIMD channels enabled by dispatch and not yet
220 * disabled by discard.
221 */
222 src_reg
223 sample_mask_reg() const
224 {
225 const bool uses_kill =
226 (shader->stage == MESA_SHADER_FRAGMENT &&
227 ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill);
228 return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) :
229 uses_kill ? brw_flag_reg(0, 1) :
230 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
231 }
232
233 /**
234 * Insert an instruction into the program.
235 */
236 instruction *
237 emit(const instruction &inst) const
238 {
239 return emit(new(shader->mem_ctx) instruction(inst));
240 }
241
242 /**
243 * Create and insert a nullary control instruction into the program.
244 */
245 instruction *
246 emit(enum opcode opcode) const
247 {
248 return emit(instruction(opcode, dispatch_width()));
249 }
250
251 /**
252 * Create and insert a nullary instruction into the program.
253 */
254 instruction *
255 emit(enum opcode opcode, const dst_reg &dst) const
256 {
257 return emit(instruction(opcode, dispatch_width(), dst));
258 }
259
260 /**
261 * Create and insert a unary instruction into the program.
262 */
263 instruction *
264 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
265 {
266 switch (opcode) {
267 case SHADER_OPCODE_RCP:
268 case SHADER_OPCODE_RSQ:
269 case SHADER_OPCODE_SQRT:
270 case SHADER_OPCODE_EXP2:
271 case SHADER_OPCODE_LOG2:
272 case SHADER_OPCODE_SIN:
273 case SHADER_OPCODE_COS:
274 return fix_math_instruction(
275 emit(instruction(opcode, dispatch_width(), dst,
276 fix_math_operand(src0))));
277
278 default:
279 return emit(instruction(opcode, dispatch_width(), dst, src0));
280 }
281 }
282
283 /**
284 * Create and insert a binary instruction into the program.
285 */
286 instruction *
287 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
288 const src_reg &src1) const
289 {
290 switch (opcode) {
291 case SHADER_OPCODE_POW:
292 case SHADER_OPCODE_INT_QUOTIENT:
293 case SHADER_OPCODE_INT_REMAINDER:
294 return fix_math_instruction(
295 emit(instruction(opcode, dispatch_width(), dst,
296 fix_math_operand(src0),
297 fix_math_operand(src1))));
298
299 default:
300 return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
301
302 }
303 }
304
305 /**
306 * Create and insert a ternary instruction into the program.
307 */
308 instruction *
309 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
310 const src_reg &src1, const src_reg &src2) const
311 {
312 switch (opcode) {
313 case BRW_OPCODE_BFE:
314 case BRW_OPCODE_BFI2:
315 case BRW_OPCODE_MAD:
316 case BRW_OPCODE_LRP:
317 return emit(instruction(opcode, dispatch_width(), dst,
318 fix_3src_operand(src0),
319 fix_3src_operand(src1),
320 fix_3src_operand(src2)));
321
322 default:
323 return emit(instruction(opcode, dispatch_width(), dst,
324 src0, src1, src2));
325 }
326 }
327
328 /**
329 * Create and insert an instruction with a variable number of sources
330 * into the program.
331 */
332 instruction *
333 emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
334 unsigned n) const
335 {
336 return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
337 }
338
339 /**
340 * Insert a preallocated instruction into the program.
341 */
342 instruction *
343 emit(instruction *inst) const
344 {
345 assert(inst->exec_size == dispatch_width() ||
346 force_writemask_all);
347 assert(_group == 0 || _group == 8);
348
349 inst->force_sechalf = (_group == 8);
350 inst->force_writemask_all = force_writemask_all;
351 inst->annotation = annotation.str;
352 inst->ir = annotation.ir;
353
354 if (block)
355 static_cast<instruction *>(cursor)->insert_before(block, inst);
356 else
357 cursor->insert_before(inst);
358
359 return inst;
360 }
361
362 /**
363 * Select \p src0 if the comparison of both sources with the given
364 * conditional mod evaluates to true, otherwise select \p src1.
365 *
366 * Generally useful to get the minimum or maximum of two values.
367 */
368 void
369 emit_minmax(const dst_reg &dst, const src_reg &src0,
370 const src_reg &src1, brw_conditional_mod mod) const
371 {
372 if (shader->devinfo->gen >= 6) {
373 set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
374 fix_unsigned_negate(src1)));
375 } else {
376 CMP(null_reg_d(), src0, src1, mod);
377 set_predicate(BRW_PREDICATE_NORMAL,
378 SEL(dst, src0, src1));
379 }
380 }
381
382 /**
383 * Copy any live channel from \p src to the first channel of the result.
384 */
385 src_reg
386 emit_uniformize(const src_reg &src) const
387 {
388 const fs_builder ubld = exec_all();
389 const dst_reg chan_index = component(vgrf(BRW_REGISTER_TYPE_UD), 0);
390 const dst_reg dst = component(vgrf(src.type), 0);
391
392 ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
393 ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index);
394
395 return src_reg(dst);
396 }
397
398 /**
399 * Assorted arithmetic ops.
400 * @{
401 */
402 #define ALU1(op) \
403 instruction * \
404 op(const dst_reg &dst, const src_reg &src0) const \
405 { \
406 return emit(BRW_OPCODE_##op, dst, src0); \
407 }
408
409 #define ALU2(op) \
410 instruction * \
411 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
412 { \
413 return emit(BRW_OPCODE_##op, dst, src0, src1); \
414 }
415
416 #define ALU2_ACC(op) \
417 instruction * \
418 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
419 { \
420 instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
421 inst->writes_accumulator = true; \
422 return inst; \
423 }
424
425 #define ALU3(op) \
426 instruction * \
427 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
428 const src_reg &src2) const \
429 { \
430 return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
431 }
432
433 ALU2(ADD)
434 ALU2_ACC(ADDC)
435 ALU2(AND)
436 ALU2(ASR)
437 ALU2(AVG)
438 ALU3(BFE)
439 ALU2(BFI1)
440 ALU3(BFI2)
441 ALU1(BFREV)
442 ALU1(CBIT)
443 ALU2(CMPN)
444 ALU3(CSEL)
445 ALU2(DP2)
446 ALU2(DP3)
447 ALU2(DP4)
448 ALU2(DPH)
449 ALU1(F16TO32)
450 ALU1(F32TO16)
451 ALU1(FBH)
452 ALU1(FBL)
453 ALU1(FRC)
454 ALU2(LINE)
455 ALU1(LZD)
456 ALU2(MAC)
457 ALU2_ACC(MACH)
458 ALU3(MAD)
459 ALU1(MOV)
460 ALU2(MUL)
461 ALU1(NOT)
462 ALU2(OR)
463 ALU2(PLN)
464 ALU1(RNDD)
465 ALU1(RNDE)
466 ALU1(RNDU)
467 ALU1(RNDZ)
468 ALU2(SAD2)
469 ALU2_ACC(SADA2)
470 ALU2(SEL)
471 ALU2(SHL)
472 ALU2(SHR)
473 ALU2_ACC(SUBB)
474 ALU2(XOR)
475
476 #undef ALU3
477 #undef ALU2_ACC
478 #undef ALU2
479 #undef ALU1
480 /** @} */
481
482 /**
483 * CMP: Sets the low bit of the destination channels with the result
484 * of the comparison, while the upper bits are undefined, and updates
485 * the flag register with the packed 16 bits of the result.
486 */
487 instruction *
488 CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
489 brw_conditional_mod condition) const
490 {
491 /* Take the instruction:
492 *
493 * CMP null<d> src0<f> src1<f>
494 *
495 * Original gen4 does type conversion to the destination type
496 * before comparison, producing garbage results for floating
497 * point comparisons.
498 *
499 * The destination type doesn't matter on newer generations,
500 * so we set the type to match src0 so we can compact the
501 * instruction.
502 */
503 return set_condmod(condition,
504 emit(BRW_OPCODE_CMP, retype(dst, src0.type),
505 fix_unsigned_negate(src0),
506 fix_unsigned_negate(src1)));
507 }
508
509 /**
510 * Gen4 predicated IF.
511 */
512 instruction *
513 IF(brw_predicate predicate) const
514 {
515 return set_predicate(predicate, emit(BRW_OPCODE_IF));
516 }
517
518 /**
519 * Emit a linear interpolation instruction.
520 */
521 instruction *
522 LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
523 const src_reg &a) const
524 {
525 if (shader->devinfo->gen >= 6) {
526 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
527 * we need to reorder the operands.
528 */
529 return emit(BRW_OPCODE_LRP, dst, a, y, x);
530
531 } else {
532 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
533 const dst_reg y_times_a = vgrf(dst.type);
534 const dst_reg one_minus_a = vgrf(dst.type);
535 const dst_reg x_times_one_minus_a = vgrf(dst.type);
536
537 MUL(y_times_a, y, a);
538 ADD(one_minus_a, negate(a), src_reg(1.0f));
539 MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
540 return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
541 }
542 }
543
544 /**
545 * Collect a number of registers in a contiguous range of registers.
546 */
547 instruction *
548 LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
549 unsigned sources, unsigned header_size) const
550 {
551 instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
552 inst->header_size = header_size;
553 inst->regs_written = header_size +
554 (sources - header_size) * (dispatch_width() / 8);
555
556 return inst;
557 }
558
559 backend_shader *shader;
560
561 private:
562 /**
563 * Workaround for negation of UD registers. See comment in
564 * fs_generator::generate_code() for more details.
565 */
566 src_reg
567 fix_unsigned_negate(const src_reg &src) const
568 {
569 if (src.type == BRW_REGISTER_TYPE_UD &&
570 src.negate) {
571 dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
572 MOV(temp, src);
573 return src_reg(temp);
574 } else {
575 return src;
576 }
577 }
578
579 /**
580 * Workaround for source register modes not supported by the ternary
581 * instruction encoding.
582 */
583 src_reg
584 fix_3src_operand(const src_reg &src) const
585 {
586 if (src.file == GRF || src.file == UNIFORM || src.stride > 1) {
587 return src;
588 } else {
589 dst_reg expanded = vgrf(src.type);
590 MOV(expanded, src);
591 return expanded;
592 }
593 }
594
595 /**
596 * Workaround for source register modes not supported by the math
597 * instruction.
598 */
599 src_reg
600 fix_math_operand(const src_reg &src) const
601 {
602 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
603 * might be able to do better by doing execsize = 1 math and then
604 * expanding that result out, but we would need to be careful with
605 * masking.
606 *
607 * Gen6 hardware ignores source modifiers (negate and abs) on math
608 * instructions, so we also move to a temp to set those up.
609 *
610 * Gen7 relaxes most of the above restrictions, but still can't use IMM
611 * operands to math
612 */
613 if ((shader->devinfo->gen == 6 &&
614 (src.file == IMM || src.file == UNIFORM ||
615 src.abs || src.negate)) ||
616 (shader->devinfo->gen == 7 && src.file == IMM)) {
617 const dst_reg tmp = vgrf(src.type);
618 MOV(tmp, src);
619 return tmp;
620 } else {
621 return src;
622 }
623 }
624
625 /**
626 * Workaround other weirdness of the math instruction.
627 */
628 instruction *
629 fix_math_instruction(instruction *inst) const
630 {
631 if (shader->devinfo->gen < 6) {
632 inst->base_mrf = 2;
633 inst->mlen = inst->sources * dispatch_width() / 8;
634
635 if (inst->sources > 1) {
636 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
637 * "Message Payload":
638 *
639 * "Operand0[7]. For the INT DIV functions, this operand is the
640 * denominator."
641 * ...
642 * "Operand1[7]. For the INT DIV functions, this operand is the
643 * numerator."
644 */
645 const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
646 const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
647 const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
648
649 inst->resize_sources(1);
650 inst->src[0] = src0;
651
652 at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type),
653 src1);
654 }
655 }
656
657 return inst;
658 }
659
660 bblock_t *block;
661 exec_node *cursor;
662
663 unsigned _dispatch_width;
664 unsigned _group;
665 bool force_writemask_all;
666
667 /** Debug annotation info. */
668 struct {
669 const char *str;
670 const void *ir;
671 } annotation;
672 };
673 }
674
675 #endif