/*
 * mesa.git -- src/mesa/drivers/dri/i965/brw_fs_builder.h
 * (snapshot taken at commit "i965/nir/vec4: Implement load_const intrinsic")
 */
1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef BRW_FS_BUILDER_H
26 #define BRW_FS_BUILDER_H
27
28 #include "brw_ir_fs.h"
29 #include "brw_shader.h"
30 #include "brw_context.h"
31
32 namespace brw {
33 /**
34 * Toolbox to assemble an FS IR program out of individual instructions.
35 *
36 * This object is meant to have an interface consistent with
37 * brw::vec4_builder. They cannot be fully interchangeable because
38 * brw::fs_builder generates scalar code while brw::vec4_builder generates
39 * vector code.
40 */
41 class fs_builder {
42 public:
43 /** Type used in this IR to represent a source of an instruction. */
44 typedef fs_reg src_reg;
45
46 /** Type used in this IR to represent the destination of an instruction. */
47 typedef fs_reg dst_reg;
48
49 /** Type used in this IR to represent an instruction. */
50 typedef fs_inst instruction;
51
52 /**
53 * Construct an fs_builder that inserts instructions into \p shader.
54 * \p dispatch_width gives the native execution width of the program.
55 */
56 fs_builder(backend_shader *shader,
57 unsigned dispatch_width) :
58 shader(shader), block(NULL), cursor(NULL),
59 _dispatch_width(dispatch_width),
60 _group(0),
61 force_writemask_all(false),
62 annotation()
63 {
64 }
65
66 /**
67 * Construct an fs_builder that inserts instructions into \p shader
68 * before instruction \p inst in basic block \p block. The default
69 * execution controls and debug annotation are initialized from the
70 * instruction passed as argument.
71 */
72 fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
73 shader(shader), block(block), cursor(inst),
74 _dispatch_width(inst->exec_size),
75 _group(inst->force_sechalf ? 8 : 0),
76 force_writemask_all(inst->force_writemask_all)
77 {
78 annotation.str = inst->annotation;
79 annotation.ir = inst->ir;
80 }
81
82 /**
83 * Construct an fs_builder that inserts instructions before \p cursor in
84 * basic block \p block, inheriting other code generation parameters
85 * from this.
86 */
87 fs_builder
88 at(bblock_t *block, exec_node *cursor) const
89 {
90 fs_builder bld = *this;
91 bld.block = block;
92 bld.cursor = cursor;
93 return bld;
94 }
95
96 /**
97 * Construct an fs_builder appending instructions at the end of the
98 * instruction list of the shader, inheriting other code generation
99 * parameters from this.
100 */
101 fs_builder
102 at_end() const
103 {
104 return at(NULL, (exec_node *)&shader->instructions.tail);
105 }
106
107 /**
108 * Construct a builder specifying the default SIMD width and group of
109 * channel enable signals, inheriting other code generation parameters
110 * from this.
111 *
112 * \p n gives the default SIMD width, \p i gives the slot group used for
113 * predication and control flow masking in multiples of \p n channels.
114 */
115 fs_builder
116 group(unsigned n, unsigned i) const
117 {
118 assert(force_writemask_all ||
119 (n <= dispatch_width() && i < dispatch_width() / n));
120 fs_builder bld = *this;
121 bld._dispatch_width = n;
122 bld._group += i * n;
123 return bld;
124 }
125
126 /**
127 * Alias for group() with width equal to eight.
128 */
129 fs_builder
130 half(unsigned i) const
131 {
132 return group(8, i);
133 }
134
135 /**
136 * Construct a builder with per-channel control flow execution masking
137 * disabled if \p b is true. If control flow execution masking is
138 * already disabled this has no effect.
139 */
140 fs_builder
141 exec_all(bool b = true) const
142 {
143 fs_builder bld = *this;
144 if (b)
145 bld.force_writemask_all = true;
146 return bld;
147 }
148
149 /**
150 * Construct a builder with the given debug annotation info.
151 */
152 fs_builder
153 annotate(const char *str, const void *ir = NULL) const
154 {
155 fs_builder bld = *this;
156 bld.annotation.str = str;
157 bld.annotation.ir = ir;
158 return bld;
159 }
160
161 /**
162 * Get the SIMD width in use.
163 */
164 unsigned
165 dispatch_width() const
166 {
167 return _dispatch_width;
168 }
169
170 /**
171 * Allocate a virtual register of natural vector size (one for this IR)
172 * and SIMD width. \p n gives the amount of space to allocate in
173 * dispatch_width units (which is just enough space for one logical
174 * component in this IR).
175 */
176 dst_reg
177 vgrf(enum brw_reg_type type, unsigned n = 1) const
178 {
179 assert(dispatch_width() <= 32);
180
181 if (n > 0)
182 return dst_reg(GRF, shader->alloc.allocate(
183 DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
184 REG_SIZE)),
185 type);
186 else
187 return retype(null_reg_ud(), type);
188 }
189
190 /**
191 * Create a null register of floating type.
192 */
193 dst_reg
194 null_reg_f() const
195 {
196 return dst_reg(retype(brw_null_vec(dispatch_width()),
197 BRW_REGISTER_TYPE_F));
198 }
199
200 /**
201 * Create a null register of signed integer type.
202 */
203 dst_reg
204 null_reg_d() const
205 {
206 return dst_reg(retype(brw_null_vec(dispatch_width()),
207 BRW_REGISTER_TYPE_D));
208 }
209
210 /**
211 * Create a null register of unsigned integer type.
212 */
213 dst_reg
214 null_reg_ud() const
215 {
216 return dst_reg(retype(brw_null_vec(dispatch_width()),
217 BRW_REGISTER_TYPE_UD));
218 }
219
220 /**
221 * Get the mask of SIMD channels enabled by dispatch and not yet
222 * disabled by discard.
223 */
224 src_reg
225 sample_mask_reg() const
226 {
227 const bool uses_kill =
228 (shader->stage == MESA_SHADER_FRAGMENT &&
229 ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill);
230 return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) :
231 uses_kill ? brw_flag_reg(0, 1) :
232 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
233 }
234
235 /**
236 * Insert an instruction into the program.
237 */
238 instruction *
239 emit(const instruction &inst) const
240 {
241 return emit(new(shader->mem_ctx) instruction(inst));
242 }
243
244 /**
245 * Create and insert a nullary control instruction into the program.
246 */
247 instruction *
248 emit(enum opcode opcode) const
249 {
250 return emit(instruction(opcode, dispatch_width()));
251 }
252
253 /**
254 * Create and insert a nullary instruction into the program.
255 */
256 instruction *
257 emit(enum opcode opcode, const dst_reg &dst) const
258 {
259 return emit(instruction(opcode, dispatch_width(), dst));
260 }
261
262 /**
263 * Create and insert a unary instruction into the program.
264 */
265 instruction *
266 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
267 {
268 switch (opcode) {
269 case SHADER_OPCODE_RCP:
270 case SHADER_OPCODE_RSQ:
271 case SHADER_OPCODE_SQRT:
272 case SHADER_OPCODE_EXP2:
273 case SHADER_OPCODE_LOG2:
274 case SHADER_OPCODE_SIN:
275 case SHADER_OPCODE_COS:
276 return fix_math_instruction(
277 emit(instruction(opcode, dispatch_width(), dst,
278 fix_math_operand(src0))));
279
280 default:
281 return emit(instruction(opcode, dispatch_width(), dst, src0));
282 }
283 }
284
285 /**
286 * Create and insert a binary instruction into the program.
287 */
288 instruction *
289 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
290 const src_reg &src1) const
291 {
292 switch (opcode) {
293 case SHADER_OPCODE_POW:
294 case SHADER_OPCODE_INT_QUOTIENT:
295 case SHADER_OPCODE_INT_REMAINDER:
296 return fix_math_instruction(
297 emit(instruction(opcode, dispatch_width(), dst,
298 fix_math_operand(src0),
299 fix_math_operand(src1))));
300
301 default:
302 return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
303
304 }
305 }
306
307 /**
308 * Create and insert a ternary instruction into the program.
309 */
310 instruction *
311 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
312 const src_reg &src1, const src_reg &src2) const
313 {
314 switch (opcode) {
315 case BRW_OPCODE_BFE:
316 case BRW_OPCODE_BFI2:
317 case BRW_OPCODE_MAD:
318 case BRW_OPCODE_LRP:
319 return emit(instruction(opcode, dispatch_width(), dst,
320 fix_3src_operand(src0),
321 fix_3src_operand(src1),
322 fix_3src_operand(src2)));
323
324 default:
325 return emit(instruction(opcode, dispatch_width(), dst,
326 src0, src1, src2));
327 }
328 }
329
330 /**
331 * Create and insert an instruction with a variable number of sources
332 * into the program.
333 */
334 instruction *
335 emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
336 unsigned n) const
337 {
338 return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
339 }
340
341 /**
342 * Insert a preallocated instruction into the program.
343 */
344 instruction *
345 emit(instruction *inst) const
346 {
347 assert(inst->exec_size <= 32);
348 assert(inst->exec_size == dispatch_width() ||
349 force_writemask_all);
350 assert(_group == 0 || _group == 8);
351
352 inst->force_sechalf = (_group == 8);
353 inst->force_writemask_all = force_writemask_all;
354 inst->annotation = annotation.str;
355 inst->ir = annotation.ir;
356
357 if (block)
358 static_cast<instruction *>(cursor)->insert_before(block, inst);
359 else
360 cursor->insert_before(inst);
361
362 return inst;
363 }
364
365 /**
366 * Select \p src0 if the comparison of both sources with the given
367 * conditional mod evaluates to true, otherwise select \p src1.
368 *
369 * Generally useful to get the minimum or maximum of two values.
370 */
371 void
372 emit_minmax(const dst_reg &dst, const src_reg &src0,
373 const src_reg &src1, brw_conditional_mod mod) const
374 {
375 if (shader->devinfo->gen >= 6) {
376 set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
377 fix_unsigned_negate(src1)));
378 } else {
379 CMP(null_reg_d(), src0, src1, mod);
380 set_predicate(BRW_PREDICATE_NORMAL,
381 SEL(dst, src0, src1));
382 }
383 }
384
385 /**
386 * Copy any live channel from \p src to the first channel of the result.
387 */
388 src_reg
389 emit_uniformize(const src_reg &src) const
390 {
391 const fs_builder ubld = exec_all();
392 const dst_reg chan_index = component(vgrf(BRW_REGISTER_TYPE_UD), 0);
393 const dst_reg dst = component(vgrf(src.type), 0);
394
395 ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
396 ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index);
397
398 return src_reg(dst);
399 }
400
401 /**
402 * Assorted arithmetic ops.
403 * @{
404 */
405 #define ALU1(op) \
406 instruction * \
407 op(const dst_reg &dst, const src_reg &src0) const \
408 { \
409 return emit(BRW_OPCODE_##op, dst, src0); \
410 }
411
412 #define ALU2(op) \
413 instruction * \
414 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
415 { \
416 return emit(BRW_OPCODE_##op, dst, src0, src1); \
417 }
418
419 #define ALU2_ACC(op) \
420 instruction * \
421 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
422 { \
423 instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
424 inst->writes_accumulator = true; \
425 return inst; \
426 }
427
428 #define ALU3(op) \
429 instruction * \
430 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
431 const src_reg &src2) const \
432 { \
433 return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
434 }
435
436 ALU2(ADD)
437 ALU2_ACC(ADDC)
438 ALU2(AND)
439 ALU2(ASR)
440 ALU2(AVG)
441 ALU3(BFE)
442 ALU2(BFI1)
443 ALU3(BFI2)
444 ALU1(BFREV)
445 ALU1(CBIT)
446 ALU2(CMPN)
447 ALU3(CSEL)
448 ALU2(DP2)
449 ALU2(DP3)
450 ALU2(DP4)
451 ALU2(DPH)
452 ALU1(F16TO32)
453 ALU1(F32TO16)
454 ALU1(FBH)
455 ALU1(FBL)
456 ALU1(FRC)
457 ALU2(LINE)
458 ALU1(LZD)
459 ALU2(MAC)
460 ALU2_ACC(MACH)
461 ALU3(MAD)
462 ALU1(MOV)
463 ALU2(MUL)
464 ALU1(NOT)
465 ALU2(OR)
466 ALU2(PLN)
467 ALU1(RNDD)
468 ALU1(RNDE)
469 ALU1(RNDU)
470 ALU1(RNDZ)
471 ALU2(SAD2)
472 ALU2_ACC(SADA2)
473 ALU2(SEL)
474 ALU2(SHL)
475 ALU2(SHR)
476 ALU2_ACC(SUBB)
477 ALU2(XOR)
478
479 #undef ALU3
480 #undef ALU2_ACC
481 #undef ALU2
482 #undef ALU1
483 /** @} */
484
485 /**
486 * CMP: Sets the low bit of the destination channels with the result
487 * of the comparison, while the upper bits are undefined, and updates
488 * the flag register with the packed 16 bits of the result.
489 */
490 instruction *
491 CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
492 brw_conditional_mod condition) const
493 {
494 /* Take the instruction:
495 *
496 * CMP null<d> src0<f> src1<f>
497 *
498 * Original gen4 does type conversion to the destination type
499 * before comparison, producing garbage results for floating
500 * point comparisons.
501 *
502 * The destination type doesn't matter on newer generations,
503 * so we set the type to match src0 so we can compact the
504 * instruction.
505 */
506 return set_condmod(condition,
507 emit(BRW_OPCODE_CMP, retype(dst, src0.type),
508 fix_unsigned_negate(src0),
509 fix_unsigned_negate(src1)));
510 }
511
512 /**
513 * Gen4 predicated IF.
514 */
515 instruction *
516 IF(brw_predicate predicate) const
517 {
518 return set_predicate(predicate, emit(BRW_OPCODE_IF));
519 }
520
521 /**
522 * Emit a linear interpolation instruction.
523 */
524 instruction *
525 LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
526 const src_reg &a) const
527 {
528 if (shader->devinfo->gen >= 6) {
529 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
530 * we need to reorder the operands.
531 */
532 return emit(BRW_OPCODE_LRP, dst, a, y, x);
533
534 } else {
535 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
536 const dst_reg y_times_a = vgrf(dst.type);
537 const dst_reg one_minus_a = vgrf(dst.type);
538 const dst_reg x_times_one_minus_a = vgrf(dst.type);
539
540 MUL(y_times_a, y, a);
541 ADD(one_minus_a, negate(a), src_reg(1.0f));
542 MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
543 return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
544 }
545 }
546
547 /**
548 * Collect a number of registers in a contiguous range of registers.
549 */
550 instruction *
551 LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
552 unsigned sources, unsigned header_size) const
553 {
554 instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
555 inst->header_size = header_size;
556 inst->regs_written = header_size +
557 (sources - header_size) * (dispatch_width() / 8);
558
559 return inst;
560 }
561
562 backend_shader *shader;
563
564 private:
565 /**
566 * Workaround for negation of UD registers. See comment in
567 * fs_generator::generate_code() for more details.
568 */
569 src_reg
570 fix_unsigned_negate(const src_reg &src) const
571 {
572 if (src.type == BRW_REGISTER_TYPE_UD &&
573 src.negate) {
574 dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
575 MOV(temp, src);
576 return src_reg(temp);
577 } else {
578 return src;
579 }
580 }
581
582 /**
583 * Workaround for source register modes not supported by the ternary
584 * instruction encoding.
585 */
586 src_reg
587 fix_3src_operand(const src_reg &src) const
588 {
589 if (src.file == GRF || src.file == UNIFORM || src.stride > 1) {
590 return src;
591 } else {
592 dst_reg expanded = vgrf(src.type);
593 MOV(expanded, src);
594 return expanded;
595 }
596 }
597
598 /**
599 * Workaround for source register modes not supported by the math
600 * instruction.
601 */
602 src_reg
603 fix_math_operand(const src_reg &src) const
604 {
605 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
606 * might be able to do better by doing execsize = 1 math and then
607 * expanding that result out, but we would need to be careful with
608 * masking.
609 *
610 * Gen6 hardware ignores source modifiers (negate and abs) on math
611 * instructions, so we also move to a temp to set those up.
612 *
613 * Gen7 relaxes most of the above restrictions, but still can't use IMM
614 * operands to math
615 */
616 if ((shader->devinfo->gen == 6 &&
617 (src.file == IMM || src.file == UNIFORM ||
618 src.abs || src.negate)) ||
619 (shader->devinfo->gen == 7 && src.file == IMM)) {
620 const dst_reg tmp = vgrf(src.type);
621 MOV(tmp, src);
622 return tmp;
623 } else {
624 return src;
625 }
626 }
627
628 /**
629 * Workaround other weirdness of the math instruction.
630 */
631 instruction *
632 fix_math_instruction(instruction *inst) const
633 {
634 if (shader->devinfo->gen < 6) {
635 inst->base_mrf = 2;
636 inst->mlen = inst->sources * dispatch_width() / 8;
637
638 if (inst->sources > 1) {
639 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
640 * "Message Payload":
641 *
642 * "Operand0[7]. For the INT DIV functions, this operand is the
643 * denominator."
644 * ...
645 * "Operand1[7]. For the INT DIV functions, this operand is the
646 * numerator."
647 */
648 const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
649 const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
650 const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
651
652 inst->resize_sources(1);
653 inst->src[0] = src0;
654
655 at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type),
656 src1);
657 }
658 }
659
660 return inst;
661 }
662
663 bblock_t *block;
664 exec_node *cursor;
665
666 unsigned _dispatch_width;
667 unsigned _group;
668 bool force_writemask_all;
669
670 /** Debug annotation info. */
671 struct {
672 const char *str;
673 const void *ir;
674 } annotation;
675 };
676 }
677
678 #endif