i965/fs_builder: Use the dispatch width for setting exec sizes
[mesa.git] src/mesa/drivers/dri/i965/brw_fs_builder.h
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"
#include "brw_context.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder. They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

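      /* Illustrative sketch (not part of the original header): a visitor
       * typically constructs one builder for the whole program and derives
       * modified copies from it rather than mutating it in place, roughly:
       *
       *    const fs_builder bld = fs_builder(this, dispatch_width).at_end();
       *    bld.annotate("clamp color")
       *       .MOV(dst, src);
       *
       * Here `this`, `dispatch_width`, `dst` and `src` stand in for whatever
       * backend_shader pointer, native SIMD width and registers the caller
       * actually has at hand.
       */
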
      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(n <= dispatch_width() &&
                i < dispatch_width() / n);
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

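      /* Illustrative sketch (not part of the original header): group() and
       * half() are how a wide builder is narrowed to act on a subset of the
       * channels. An operation unavailable in SIMD16 could, for instance, be
       * emitted as two SIMD8 halves roughly like:
       *
       *    for (unsigned i = 0; i < bld.dispatch_width() / 8; i++)
       *       bld.half(i).MOV(half(dst, i), half(src, i));
       *
       * where half(reg, i) stands for whatever mechanism the caller uses to
       * select the i-th SIMD8 slice of a register.
       */
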
      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         return dst_reg(GRF, shader->alloc.allocate(
                           DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                        REG_SIZE)),
                        type, dispatch_width());
      }

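      /* Illustrative sketch (not part of the original header): with a SIMD16
       * builder, a call such as
       *
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
       *
       * requests 2 components * 4 bytes * 16 channels = 128 bytes, which the
       * division by REG_SIZE above rounds up to four 32-byte GRFs.
       */
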
      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         const bool uses_kill =
            (shader->stage == MESA_SHADER_FRAGMENT &&
             ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill);
         return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) :
                 uses_kill ? brw_flag_reg(0, 1) :
                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return fix_math_instruction(
               emit(instruction(opcode, dispatch_width(), dst,
                                fix_math_operand(src0))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return fix_math_instruction(
               emit(instruction(opcode, dispatch_width(), dst,
                                fix_math_operand(src0),
                                fix_math_operand(src1))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);
         assert(_group == 0 || _group == 8);

         inst->force_sechalf = (_group == 8);
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      void
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         if (shader->devinfo->gen >= 6) {
            set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
         } else {
            CMP(null_reg_d(), src0, src1, mod);
            set_predicate(BRW_PREDICATE_NORMAL,
                          SEL(dst, src0, src1));
         }
      }

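      /* Illustrative sketch (not part of the original header): a typical use
       * is computing a maximum, e.g.
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);
       *
       * which becomes a conditional-mod SEL on Gen6+ and a CMP followed by a
       * predicated SEL on earlier hardware, as implemented above. `dst`, `a`
       * and `b` stand for registers the caller already has.
       */
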
      /**
       * Copy any live channel from \p src to the first channel of \p dst.
       */
      void
      emit_uniformize(const dst_reg &dst, const src_reg &src) const
      {
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0));
         ubld.emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
                   src, component(chan_index, 0));
      }

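      /* Illustrative sketch (not part of the original header): this is useful
       * when a per-channel value has to be handed to a unit that only accepts
       * a scalar, e.g. a dynamically indexed surface:
       *
       *    const dst_reg surface = bld.vgrf(BRW_REGISTER_TYPE_UD);
       *    bld.emit_uniformize(surface, per_channel_index);
       *
       * where per_channel_index stands for whatever divergent source the
       * caller has; only component 0 of `surface` ends up holding the value
       * broadcast from a live channel.
       */
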
      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0) const                 \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);                       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          *    CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), src_reg(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         assert(dst.width % 8 == 0);
         instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD,
                                              dispatch_width(), dst,
                                              src, sources));
         inst->header_size = header_size;

         for (unsigned i = 0; i < header_size; i++)
            assert(src[i].file != GRF ||
                   src[i].width * type_sz(src[i].type) == 32);
         inst->regs_written = header_size;

         for (unsigned i = header_size; i < sources; ++i)
            assert(src[i].file != GRF ||
                   src[i].width == dst.width);
         inst->regs_written += (sources - header_size) * (dispatch_width() / 8);

         return inst;
      }

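      /* Illustrative sketch (not part of the original header): per-channel
       * message sources might be packed into a contiguous payload roughly
       * like
       *
       *    const src_reg srcs[] = { coord_x, coord_y };
       *    const dst_reg payload =
       *       bld.vgrf(BRW_REGISTER_TYPE_F, ARRAY_SIZE(srcs));
       *    bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
       *
       * where coord_x and coord_y stand for registers the caller has already
       * computed and the final 0 means there is no message header.
       */
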
      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == GRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      /**
       * Workaround other weirdness of the math instruction.
       */
      instruction *
      fix_math_instruction(instruction *inst) const
      {
         if (shader->devinfo->gen < 6) {
            inst->base_mrf = 2;
            inst->mlen = inst->sources * dispatch_width() / 8;

            if (inst->sources > 1) {
               /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
                * "Message Payload":
                *
                * "Operand0[7]. For the INT DIV functions, this operand is the
                *  denominator."
                * ...
                * "Operand1[7]. For the INT DIV functions, this operand is the
                *  numerator."
                */
               const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
               const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
               const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];

               inst->resize_sources(1);
               inst->src[0] = src0;

               at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type,
                                          dispatch_width()), src1);
            }
         }

         return inst;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif