i965/fs: Lower SHADER_OPCODE_TXF_UMS/MCS_LOGICAL too on Gen7+.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_builder.h
1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef BRW_FS_BUILDER_H
26 #define BRW_FS_BUILDER_H
27
28 #include "brw_ir_fs.h"
29 #include "brw_shader.h"
30 #include "brw_context.h"
31
32 namespace brw {
33 /**
34 * Toolbox to assemble an FS IR program out of individual instructions.
35 *
36 * This object is meant to have an interface consistent with
37 * brw::vec4_builder. They cannot be fully interchangeable because
38 * brw::fs_builder generates scalar code while brw::vec4_builder generates
39 * vector code.
40 */
41 class fs_builder {
42 public:
43 /** Type used in this IR to represent a source of an instruction. */
44 typedef fs_reg src_reg;
45
46 /** Type used in this IR to represent the destination of an instruction. */
47 typedef fs_reg dst_reg;
48
49 /** Type used in this IR to represent an instruction. */
50 typedef fs_inst instruction;
51
52 /**
53 * Construct an fs_builder that inserts instructions into \p shader.
54 * \p dispatch_width gives the native execution width of the program.
55 */
56 fs_builder(backend_shader *shader,
57 unsigned dispatch_width) :
58 shader(shader), block(NULL), cursor(NULL),
59 _dispatch_width(dispatch_width),
60 _group(0),
61 force_writemask_all(false),
62 annotation()
63 {
64 }
65
66 /**
67 * Construct an fs_builder that inserts instructions before \p cursor in
68 * basic block \p block, inheriting other code generation parameters
69 * from this.
70 */
71 fs_builder
72 at(bblock_t *block, exec_node *cursor) const
73 {
74 fs_builder bld = *this;
75 bld.block = block;
76 bld.cursor = cursor;
77 return bld;
78 }
79
80 /**
81 * Construct an fs_builder appending instructions at the end of the
82 * instruction list of the shader, inheriting other code generation
83 * parameters from this.
84 */
85 fs_builder
86 at_end() const
87 {
88 return at(NULL, (exec_node *)&shader->instructions.tail);
89 }
90
91 /**
92 * Construct a builder specifying the default SIMD width and group of
93 * channel enable signals, inheriting other code generation parameters
94 * from this.
95 *
96 * \p n gives the default SIMD width, \p i gives the slot group used for
97 * predication and control flow masking in multiples of \p n channels.
98 */
99 fs_builder
100 group(unsigned n, unsigned i) const
101 {
102 assert(force_writemask_all ||
103 (n <= dispatch_width() && i < dispatch_width() / n));
104 fs_builder bld = *this;
105 bld._dispatch_width = n;
106 bld._group += i * n;
107 return bld;
108 }
109
110 /**
111 * Alias for group() with width equal to eight.
112 */
113 fs_builder
114 half(unsigned i) const
115 {
116 return group(8, i);
117 }
118
119 /**
120 * Construct a builder with per-channel control flow execution masking
121 * disabled if \p b is true. If control flow execution masking is
122 * already disabled this has no effect.
123 */
124 fs_builder
125 exec_all(bool b = true) const
126 {
127 fs_builder bld = *this;
128 if (b)
129 bld.force_writemask_all = true;
130 return bld;
131 }
132
133 /**
134 * Construct a builder with the given debug annotation info.
135 */
136 fs_builder
137 annotate(const char *str, const void *ir = NULL) const
138 {
139 fs_builder bld = *this;
140 bld.annotation.str = str;
141 bld.annotation.ir = ir;
142 return bld;
143 }
144
145 /**
146 * Get the SIMD width in use.
147 */
148 unsigned
149 dispatch_width() const
150 {
151 return _dispatch_width;
152 }
153
154 /**
155 * Allocate a virtual register of natural vector size (one for this IR)
156 * and SIMD width. \p n gives the amount of space to allocate in
157 * dispatch_width units (which is just enough space for one logical
158 * component in this IR).
159 */
160 dst_reg
161 vgrf(enum brw_reg_type type, unsigned n = 1) const
162 {
163 return dst_reg(GRF, shader->alloc.allocate(
164 DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
165 REG_SIZE)),
166 type);
167 }
168
169 /**
170 * Create a null register of floating type.
171 */
172 dst_reg
173 null_reg_f() const
174 {
175 return dst_reg(retype(brw_null_vec(dispatch_width()),
176 BRW_REGISTER_TYPE_F));
177 }
178
179 /**
180 * Create a null register of signed integer type.
181 */
182 dst_reg
183 null_reg_d() const
184 {
185 return dst_reg(retype(brw_null_vec(dispatch_width()),
186 BRW_REGISTER_TYPE_D));
187 }
188
189 /**
190 * Create a null register of unsigned integer type.
191 */
192 dst_reg
193 null_reg_ud() const
194 {
195 return dst_reg(retype(brw_null_vec(dispatch_width()),
196 BRW_REGISTER_TYPE_UD));
197 }
198
199 /**
200 * Get the mask of SIMD channels enabled by dispatch and not yet
201 * disabled by discard.
202 */
203 src_reg
204 sample_mask_reg() const
205 {
206 const bool uses_kill =
207 (shader->stage == MESA_SHADER_FRAGMENT &&
208 ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill);
209 return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) :
210 uses_kill ? brw_flag_reg(0, 1) :
211 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
212 }
213
214 /**
215 * Insert an instruction into the program.
216 */
217 instruction *
218 emit(const instruction &inst) const
219 {
220 return emit(new(shader->mem_ctx) instruction(inst));
221 }
222
223 /**
224 * Create and insert a nullary control instruction into the program.
225 */
226 instruction *
227 emit(enum opcode opcode) const
228 {
229 return emit(instruction(opcode, dispatch_width()));
230 }
231
232 /**
233 * Create and insert a nullary instruction into the program.
234 */
235 instruction *
236 emit(enum opcode opcode, const dst_reg &dst) const
237 {
238 return emit(instruction(opcode, dispatch_width(), dst));
239 }
240
241 /**
242 * Create and insert a unary instruction into the program.
243 */
244 instruction *
245 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
246 {
247 switch (opcode) {
248 case SHADER_OPCODE_RCP:
249 case SHADER_OPCODE_RSQ:
250 case SHADER_OPCODE_SQRT:
251 case SHADER_OPCODE_EXP2:
252 case SHADER_OPCODE_LOG2:
253 case SHADER_OPCODE_SIN:
254 case SHADER_OPCODE_COS:
255 return fix_math_instruction(
256 emit(instruction(opcode, dispatch_width(), dst,
257 fix_math_operand(src0))));
258
259 default:
260 return emit(instruction(opcode, dispatch_width(), dst, src0));
261 }
262 }
263
264 /**
265 * Create and insert a binary instruction into the program.
266 */
267 instruction *
268 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
269 const src_reg &src1) const
270 {
271 switch (opcode) {
272 case SHADER_OPCODE_POW:
273 case SHADER_OPCODE_INT_QUOTIENT:
274 case SHADER_OPCODE_INT_REMAINDER:
275 return fix_math_instruction(
276 emit(instruction(opcode, dispatch_width(), dst,
277 fix_math_operand(src0),
278 fix_math_operand(src1))));
279
280 default:
281 return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
282
283 }
284 }
285
286 /**
287 * Create and insert a ternary instruction into the program.
288 */
289 instruction *
290 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
291 const src_reg &src1, const src_reg &src2) const
292 {
293 switch (opcode) {
294 case BRW_OPCODE_BFE:
295 case BRW_OPCODE_BFI2:
296 case BRW_OPCODE_MAD:
297 case BRW_OPCODE_LRP:
298 return emit(instruction(opcode, dispatch_width(), dst,
299 fix_3src_operand(src0),
300 fix_3src_operand(src1),
301 fix_3src_operand(src2)));
302
303 default:
304 return emit(instruction(opcode, dispatch_width(), dst,
305 src0, src1, src2));
306 }
307 }
308
309 /**
310 * Create and insert an instruction with a variable number of sources
311 * into the program.
312 */
313 instruction *
314 emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
315 unsigned n) const
316 {
317 return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
318 }
319
320 /**
321 * Insert a preallocated instruction into the program.
322 */
323 instruction *
324 emit(instruction *inst) const
325 {
326 assert(inst->exec_size == dispatch_width() ||
327 force_writemask_all);
328 assert(_group == 0 || _group == 8);
329
330 inst->force_sechalf = (_group == 8);
331 inst->force_writemask_all = force_writemask_all;
332 inst->annotation = annotation.str;
333 inst->ir = annotation.ir;
334
335 if (block)
336 static_cast<instruction *>(cursor)->insert_before(block, inst);
337 else
338 cursor->insert_before(inst);
339
340 return inst;
341 }
342
343 /**
344 * Select \p src0 if the comparison of both sources with the given
345 * conditional mod evaluates to true, otherwise select \p src1.
346 *
347 * Generally useful to get the minimum or maximum of two values.
348 */
349 void
350 emit_minmax(const dst_reg &dst, const src_reg &src0,
351 const src_reg &src1, brw_conditional_mod mod) const
352 {
353 if (shader->devinfo->gen >= 6) {
354 set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
355 fix_unsigned_negate(src1)));
356 } else {
357 CMP(null_reg_d(), src0, src1, mod);
358 set_predicate(BRW_PREDICATE_NORMAL,
359 SEL(dst, src0, src1));
360 }
361 }
362
363 /**
364 * Copy any live channel from \p src to the first channel of the result.
365 */
366 src_reg
367 emit_uniformize(const src_reg &src) const
368 {
369 const fs_builder ubld = exec_all();
370 const dst_reg chan_index = component(vgrf(BRW_REGISTER_TYPE_UD), 0);
371 const dst_reg dst = component(vgrf(src.type), 0);
372
373 ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
374 ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index);
375
376 return src_reg(dst);
377 }
378
379 /**
380 * Assorted arithmetic ops.
381 * @{
382 */
383 #define ALU1(op) \
384 instruction * \
385 op(const dst_reg &dst, const src_reg &src0) const \
386 { \
387 return emit(BRW_OPCODE_##op, dst, src0); \
388 }
389
390 #define ALU2(op) \
391 instruction * \
392 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
393 { \
394 return emit(BRW_OPCODE_##op, dst, src0, src1); \
395 }
396
397 #define ALU2_ACC(op) \
398 instruction * \
399 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
400 { \
401 instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
402 inst->writes_accumulator = true; \
403 return inst; \
404 }
405
406 #define ALU3(op) \
407 instruction * \
408 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
409 const src_reg &src2) const \
410 { \
411 return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
412 }
413
414 ALU2(ADD)
415 ALU2_ACC(ADDC)
416 ALU2(AND)
417 ALU2(ASR)
418 ALU2(AVG)
419 ALU3(BFE)
420 ALU2(BFI1)
421 ALU3(BFI2)
422 ALU1(BFREV)
423 ALU1(CBIT)
424 ALU2(CMPN)
425 ALU3(CSEL)
426 ALU2(DP2)
427 ALU2(DP3)
428 ALU2(DP4)
429 ALU2(DPH)
430 ALU1(F16TO32)
431 ALU1(F32TO16)
432 ALU1(FBH)
433 ALU1(FBL)
434 ALU1(FRC)
435 ALU2(LINE)
436 ALU1(LZD)
437 ALU2(MAC)
438 ALU2_ACC(MACH)
439 ALU3(MAD)
440 ALU1(MOV)
441 ALU2(MUL)
442 ALU1(NOT)
443 ALU2(OR)
444 ALU2(PLN)
445 ALU1(RNDD)
446 ALU1(RNDE)
447 ALU1(RNDU)
448 ALU1(RNDZ)
449 ALU2(SAD2)
450 ALU2_ACC(SADA2)
451 ALU2(SEL)
452 ALU2(SHL)
453 ALU2(SHR)
454 ALU2_ACC(SUBB)
455 ALU2(XOR)
456
457 #undef ALU3
458 #undef ALU2_ACC
459 #undef ALU2
460 #undef ALU1
461 /** @} */
462
463 /**
464 * CMP: Sets the low bit of the destination channels with the result
465 * of the comparison, while the upper bits are undefined, and updates
466 * the flag register with the packed 16 bits of the result.
467 */
468 instruction *
469 CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
470 brw_conditional_mod condition) const
471 {
472 /* Take the instruction:
473 *
474 * CMP null<d> src0<f> src1<f>
475 *
476 * Original gen4 does type conversion to the destination type
477 * before comparison, producing garbage results for floating
478 * point comparisons.
479 *
480 * The destination type doesn't matter on newer generations,
481 * so we set the type to match src0 so we can compact the
482 * instruction.
483 */
484 return set_condmod(condition,
485 emit(BRW_OPCODE_CMP, retype(dst, src0.type),
486 fix_unsigned_negate(src0),
487 fix_unsigned_negate(src1)));
488 }
489
490 /**
491 * Gen4 predicated IF.
492 */
493 instruction *
494 IF(brw_predicate predicate) const
495 {
496 return set_predicate(predicate, emit(BRW_OPCODE_IF));
497 }
498
499 /**
500 * Emit a linear interpolation instruction.
501 */
502 instruction *
503 LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
504 const src_reg &a) const
505 {
506 if (shader->devinfo->gen >= 6) {
507 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
508 * we need to reorder the operands.
509 */
510 return emit(BRW_OPCODE_LRP, dst, a, y, x);
511
512 } else {
513 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
514 const dst_reg y_times_a = vgrf(dst.type);
515 const dst_reg one_minus_a = vgrf(dst.type);
516 const dst_reg x_times_one_minus_a = vgrf(dst.type);
517
518 MUL(y_times_a, y, a);
519 ADD(one_minus_a, negate(a), src_reg(1.0f));
520 MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
521 return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
522 }
523 }
524
525 /**
526 * Collect a number of registers in a contiguous range of registers.
527 */
528 instruction *
529 LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
530 unsigned sources, unsigned header_size) const
531 {
532 instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
533 inst->header_size = header_size;
534 inst->regs_written = header_size +
535 (sources - header_size) * (dispatch_width() / 8);
536
537 return inst;
538 }
539
540 backend_shader *shader;
541
542 private:
543 /**
544 * Workaround for negation of UD registers. See comment in
545 * fs_generator::generate_code() for more details.
546 */
547 src_reg
548 fix_unsigned_negate(const src_reg &src) const
549 {
550 if (src.type == BRW_REGISTER_TYPE_UD &&
551 src.negate) {
552 dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
553 MOV(temp, src);
554 return src_reg(temp);
555 } else {
556 return src;
557 }
558 }
559
560 /**
561 * Workaround for source register modes not supported by the ternary
562 * instruction encoding.
563 */
564 src_reg
565 fix_3src_operand(const src_reg &src) const
566 {
567 if (src.file == GRF || src.file == UNIFORM || src.stride > 1) {
568 return src;
569 } else {
570 dst_reg expanded = vgrf(src.type);
571 MOV(expanded, src);
572 return expanded;
573 }
574 }
575
576 /**
577 * Workaround for source register modes not supported by the math
578 * instruction.
579 */
580 src_reg
581 fix_math_operand(const src_reg &src) const
582 {
583 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
584 * might be able to do better by doing execsize = 1 math and then
585 * expanding that result out, but we would need to be careful with
586 * masking.
587 *
588 * Gen6 hardware ignores source modifiers (negate and abs) on math
589 * instructions, so we also move to a temp to set those up.
590 *
591 * Gen7 relaxes most of the above restrictions, but still can't use IMM
592 * operands to math
593 */
594 if ((shader->devinfo->gen == 6 &&
595 (src.file == IMM || src.file == UNIFORM ||
596 src.abs || src.negate)) ||
597 (shader->devinfo->gen == 7 && src.file == IMM)) {
598 const dst_reg tmp = vgrf(src.type);
599 MOV(tmp, src);
600 return tmp;
601 } else {
602 return src;
603 }
604 }
605
606 /**
607 * Workaround other weirdness of the math instruction.
608 */
609 instruction *
610 fix_math_instruction(instruction *inst) const
611 {
612 if (shader->devinfo->gen < 6) {
613 inst->base_mrf = 2;
614 inst->mlen = inst->sources * dispatch_width() / 8;
615
616 if (inst->sources > 1) {
617 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
618 * "Message Payload":
619 *
620 * "Operand0[7]. For the INT DIV functions, this operand is the
621 * denominator."
622 * ...
623 * "Operand1[7]. For the INT DIV functions, this operand is the
624 * numerator."
625 */
626 const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
627 const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
628 const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
629
630 inst->resize_sources(1);
631 inst->src[0] = src0;
632
633 at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type),
634 src1);
635 }
636 }
637
638 return inst;
639 }
640
641 bblock_t *block;
642 exec_node *cursor;
643
644 unsigned _dispatch_width;
645 unsigned _group;
646 bool force_writemask_all;
647
648 /** Debug annotation info. */
649 struct {
650 const char *str;
651 const void *ir;
652 } annotation;
653 };
654 }
655
656 #endif