i965: Move the back-end compiler to src/intel/compiler
mesa.git: src/intel/compiler/brw_fs_builder.h
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder. They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block. The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
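      /* Usage sketch (an illustrative addition, not upstream code): most
       * optimization passes append at the end of the program,
       *
       *    const fs_builder bld = fs_builder(shader, 16).at_end();
       *
       * while lowering passes typically build at an existing instruction,
       * inheriting its execution controls:
       *
       *    const fs_builder ibld(shader, block, inst);
       */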

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }
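      /* Illustrative sketch (hypothetical calling code): a SIMD16 builder
       * can be narrowed to either SIMD8 half, e.g. to emit an instruction
       * that only executes for channels 8..15:
       *
       *    const fs_builder bld(shader, 16);
       *    bld.group(8, 1).MOV(dst, src);
       *
       * half(i) below is shorthand for group(8, i).
       */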

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
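      /* Worked example (added for illustration): at SIMD16,
       * vgrf(BRW_REGISTER_TYPE_F, 2) requests
       * DIV_ROUND_UP(2 * 4 * 16, REG_SIZE) = 4 GRFs with 32-byte registers,
       * i.e. two logical 32-bit components of sixteen channels each.
       */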

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         assert(shader->stage != MESA_SHADER_FRAGMENT ||
                group() + dispatch_width() <= 16);
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            return brw_imm_d(0xffffffff);
         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
            return brw_flag_reg(0, 1);
         } else {
            return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
         }
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
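      /* Usage note (added sketch): with BRW_CONDITIONAL_GE the SEL keeps
       * src0 whenever src0 >= src1, so
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);   // dst = max(a, b)
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);    // dst = min(a, b)
       */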

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16- or 32-wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
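      /* Typical use (illustrative, not from the original header): uniformize
       * a possibly divergent surface index before a send, e.g.
       *
       *    const fs_reg surface = bld.emit_uniformize(nonuniform_surface);
       *
       * so downstream message setup can treat it as a single scalar value.
       */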

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                                  \
      instruction *                                               \
      op(const dst_reg &dst, const src_reg &src0) const           \
      {                                                           \
         return emit(BRW_OPCODE_##op, dst, src0);                 \
      }

#define ALU2(op)                                                              \
      instruction *                                                           \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const  \
      {                                                                       \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                       \
      }

#define ALU2_ACC(op)                                                          \
      instruction *                                                           \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const  \
      {                                                                       \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);          \
         inst->writes_accumulator = true;                                     \
         return inst;                                                         \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
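      /* Example (added sketch): set flag register f0 where a < b, then
       * predicate a following MOV on the per-channel result:
       *
       *    bld.CMP(bld.null_reg_f(), a, b, BRW_CONDITIONAL_L);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(dst, src));
       */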

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
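      /* Both paths above compute the same affine blend,
       * dst = x * (1 - a) + y * a (the GLSL mix(x, y, a) semantics);
       * the gen6+ path only reorders operands to fit the hardware LRP.
       */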

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }
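      /* Illustrative sketch (hypothetical names; payload sizing shown for a
       * SIMD8 builder, where each per-channel source occupies one GRF):
       * gather a one-GRF header followed by two coordinates into a
       * contiguous send payload:
       *
       *    const fs_reg srcs[] = { header, u, v };
       *    const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
       *    bld.LOAD_PAYLOAD(payload, srcs, 3, 1);
       */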

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math instructions.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif