/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"
#include "brw_context.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder. They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }
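
      /*
       * Illustrative sketch only (assumes a backend_shader "s" and an
       * fs_reg "src" defined elsewhere): a builder appending at the end
       * of the program could be used roughly as
       *
       *    const fs_builder bld = fs_builder(&s, 8).at_end();
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.ADD(tmp, src, src_reg(1.0f));
       */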

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block. The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->force_sechalf ? 8 : 0),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }
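
      /*
       * For example (illustrative only): an operation restricted to the
       * second group of eight channels of a SIMD16 program could be
       * emitted through a builder obtained as
       *
       *    const fs_builder hbld = bld.group(8, 1);
       *
       * where "bld" is a hypothetical SIMD16 builder.
       */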

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }
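
      /*
       * Sketch of typical use: a builder for code that must run regardless
       * of channel enables (e.g. message header setup) can be derived as
       *
       *    const fs_builder ubld = bld.exec_all();
       *
       * with "bld" being an ordinary (hypothetical) builder.
       */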

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(GRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
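
      /*
       * For instance (illustrative, assuming the usual 32-byte REG_SIZE):
       * at SIMD16 dispatch, vgrf(BRW_REGISTER_TYPE_F, 2) allocates
       * DIV_ROUND_UP(2 * 4 * 16, 32) = 4 hardware registers, i.e. two
       * logical components of sixteen floats each.
       */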

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         const bool uses_kill =
            (shader->stage == MESA_SHADER_FRAGMENT &&
             ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill);
         return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) :
                 uses_kill ? brw_flag_reg(0, 1) :
                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return fix_math_instruction(
               emit(instruction(opcode, dispatch_width(), dst,
                                fix_math_operand(src0))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return fix_math_instruction(
               emit(instruction(opcode, dispatch_width(), dst,
                                fix_math_operand(src0),
                                fix_math_operand(src1))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);
         assert(_group == 0 || _group == 8);

         inst->force_sechalf = (_group == 8);
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      void
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         if (shader->devinfo->gen >= 6) {
            set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
         } else {
            CMP(null_reg_d(), src0, src1, mod);
            set_predicate(BRW_PREDICATE_NORMAL,
                          SEL(dst, src0, src1));
         }
      }
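
      /*
       * E.g. (sketch), taking the per-channel maximum of two registers
       * "a" and "b" defined elsewhere:
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);
       *
       * BRW_CONDITIONAL_L would give the minimum instead.
       */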

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
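
      /*
       * Typical use (illustrative): making a possibly divergent surface
       * index consumable by a single send message, assuming "surf_index"
       * is an fs_reg defined elsewhere:
       *
       *    const src_reg usurf = bld.emit_uniformize(surf_index);
       */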

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                              \
      instruction *                                                          \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                      \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                      \
      }

#define ALU2_ACC(op)                                                          \
      instruction *                                                          \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                      \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);         \
         inst->writes_accumulator = true;                                    \
         return inst;                                                        \
      }

#define ALU3(op)                                                             \
      instruction *                                                         \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,      \
         const src_reg &src2) const                                         \
      {                                                                     \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);               \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */
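
      /*
       * Each ALUn(op) line above expands to an inline helper named after
       * the opcode, so (sketch) "bld.MUL(dst, a, b)" and
       * "bld.MAD(dst, c, a, b)" are shorthand for emitting BRW_OPCODE_MUL
       * and BRW_OPCODE_MAD with the builder's default execution controls;
       * the ALU2_ACC variants additionally mark the instruction as
       * writing the accumulator.
       */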

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
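
      /*
       * For example (illustrative), setting the flag register to "a < b"
       * per channel without keeping the boolean result around:
       *
       *    bld.CMP(bld.null_reg_f(), a, b, BRW_CONDITIONAL_L);
       *
       * where "a" and "b" are float-typed registers defined elsewhere.
       */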

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), src_reg(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
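
      /*
       * E.g. (sketch), blending two colors by a factor "t" defined
       * elsewhere: bld.LRP(dst, color0, color1, t) computes
       * color0 * (1 - t) + color1 * t on every active channel,
       * regardless of which code path above is taken.
       */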

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->regs_written = header_size +
                              (sources - header_size) * (dispatch_width() / 8);

         return inst;
      }
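
      /*
       * Illustrative use: gathering a two-source message payload with no
       * header, where "payload" and "srcs" (an array of two fs_regs) are
       * hypothetical:
       *
       *    bld.LOAD_PAYLOAD(payload, srcs, 2, 0);
       *
       * At SIMD16 this marks regs_written as 0 + 2 * (16 / 8) = 4.
       */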

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == GRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      /**
       * Workaround other weirdness of the math instruction.
       */
      instruction *
      fix_math_instruction(instruction *inst) const
      {
         if (shader->devinfo->gen < 6) {
            inst->base_mrf = 2;
            inst->mlen = inst->sources * dispatch_width() / 8;

            if (inst->sources > 1) {
               /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
                * "Message Payload":
                *
                * "Operand0[7]. For the INT DIV functions, this operand is the
                *  denominator."
                * ...
                * "Operand1[7]. For the INT DIV functions, this operand is the
                *  numerator."
                */
               const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
               const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
               const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];

               inst->resize_sources(1);
               inst->src[0] = src0;

               at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type),
                                   src1);
            }
         }

         return inst;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif