i965/vec4: Fix handling of multiple register reads and writes in split_virtual_grfs().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_channel_expressions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
 25 * \file brw_fs_channel_expressions.cpp
26 *
27 * Breaks vector operations down into operations on each component.
28 *
29 * The 965 fragment shader receives 8 or 16 pixels at a time, so each
30 * channel of a vector is laid out as 1 or 2 8-float registers. Each
31 * ALU operation operates on one of those channel registers. As a
32 * result, there is no value to the 965 fragment shader in tracking
33 * "vector" expressions in the sense of GLSL fragment shaders, when
34 * doing a channel at a time may help in constant folding, algebraic
35 * simplification, and reducing the liveness of channel registers.
36 *
37 * The exception to the desire to break everything down to floats is
38 * texturing. The texture sampler returns a writemasked masked
39 * 4/8-register sequence containing the texture values. We don't want
40 * to dispatch to the sampler separately for each channel we need, so
41 * we do retain the vector types in that case.
42 */
43
44 #include "main/core.h"
45 #include "brw_wm.h"
46 #include "glsl/ir.h"
47 #include "glsl/ir_expression_flattening.h"
48 #include "glsl/glsl_types.h"
49
50 class ir_channel_expressions_visitor : public ir_hierarchical_visitor {
51 public:
52 ir_channel_expressions_visitor()
53 {
54 this->progress = false;
55 this->mem_ctx = NULL;
56 }
57
58 ir_visitor_status visit_leave(ir_assignment *);
59
60 ir_rvalue *get_element(ir_variable *var, unsigned int element);
61 void assign(ir_assignment *ir, int elem, ir_rvalue *val);
62
63 bool progress;
64 void *mem_ctx;
65 };
66
67 static bool
68 channel_expressions_predicate(ir_instruction *ir)
69 {
70 ir_expression *expr = ir->as_expression();
71 unsigned int i;
72
73 if (!expr)
74 return false;
75
76 switch (expr->operation) {
77 /* these opcodes need to act on the whole vector,
78 * just like texturing.
79 */
80 case ir_unop_interpolate_at_centroid:
81 case ir_binop_interpolate_at_offset:
82 case ir_binop_interpolate_at_sample:
83 return false;
84 default:
85 break;
86 }
87
88 for (i = 0; i < expr->get_num_operands(); i++) {
89 if (expr->operands[i]->type->is_vector())
90 return true;
91 }
92
93 return false;
94 }
95
96 bool
97 brw_do_channel_expressions(exec_list *instructions)
98 {
99 ir_channel_expressions_visitor v;
100
101 /* Pull out any matrix expression to a separate assignment to a
102 * temp. This will make our handling of the breakdown to
103 * operations on the matrix's vector components much easier.
104 */
105 do_expression_flattening(instructions, channel_expressions_predicate);
106
107 visit_list_elements(&v, instructions);
108
109 return v.progress;
110 }
111
112 ir_rvalue *
113 ir_channel_expressions_visitor::get_element(ir_variable *var, unsigned int elem)
114 {
115 ir_dereference *deref;
116
117 if (var->type->is_scalar())
118 return new(mem_ctx) ir_dereference_variable(var);
119
120 assert(elem < var->type->components());
121 deref = new(mem_ctx) ir_dereference_variable(var);
122 return new(mem_ctx) ir_swizzle(deref, elem, 0, 0, 0, 1);
123 }
124
125 void
126 ir_channel_expressions_visitor::assign(ir_assignment *ir, int elem, ir_rvalue *val)
127 {
128 ir_dereference *lhs = ir->lhs->clone(mem_ctx, NULL);
129 ir_assignment *assign;
130
131 /* This assign-of-expression should have been generated by the
132 * expression flattening visitor (since we never short circit to
133 * not flatten, even for plain assignments of variables), so the
134 * writemask is always full.
135 */
136 assert(ir->write_mask == (1 << ir->lhs->type->components()) - 1);
137
138 assign = new(mem_ctx) ir_assignment(lhs, val, NULL, (1 << elem));
139 ir->insert_before(assign);
140 }
141
142 ir_visitor_status
143 ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
144 {
145 ir_expression *expr = ir->rhs->as_expression();
146 bool found_vector = false;
147 unsigned int i, vector_elements = 1;
148 ir_variable *op_var[3];
149
150 if (!expr)
151 return visit_continue;
152
153 if (!this->mem_ctx)
154 this->mem_ctx = ralloc_parent(ir);
155
156 for (i = 0; i < expr->get_num_operands(); i++) {
157 if (expr->operands[i]->type->is_vector()) {
158 found_vector = true;
159 vector_elements = expr->operands[i]->type->vector_elements;
160 break;
161 }
162 }
163 if (!found_vector)
164 return visit_continue;
165
166 switch (expr->operation) {
167 case ir_unop_interpolate_at_centroid:
168 case ir_binop_interpolate_at_offset:
169 case ir_binop_interpolate_at_sample:
170 return visit_continue;
171
172 default:
173 break;
174 }
175
176 /* Store the expression operands in temps so we can use them
177 * multiple times.
178 */
179 for (i = 0; i < expr->get_num_operands(); i++) {
180 ir_assignment *assign;
181 ir_dereference *deref;
182
183 assert(!expr->operands[i]->type->is_matrix());
184
185 op_var[i] = new(mem_ctx) ir_variable(expr->operands[i]->type,
186 "channel_expressions",
187 ir_var_temporary);
188 ir->insert_before(op_var[i]);
189
190 deref = new(mem_ctx) ir_dereference_variable(op_var[i]);
191 assign = new(mem_ctx) ir_assignment(deref,
192 expr->operands[i],
193 NULL);
194 ir->insert_before(assign);
195 }
196
197 const glsl_type *element_type = glsl_type::get_instance(ir->lhs->type->base_type,
198 1, 1);
199
200 /* OK, time to break down this vector operation. */
201 switch (expr->operation) {
202 case ir_unop_bit_not:
203 case ir_unop_logic_not:
204 case ir_unop_neg:
205 case ir_unop_abs:
206 case ir_unop_sign:
207 case ir_unop_rcp:
208 case ir_unop_rsq:
209 case ir_unop_sqrt:
210 case ir_unop_exp:
211 case ir_unop_log:
212 case ir_unop_exp2:
213 case ir_unop_log2:
214 case ir_unop_bitcast_i2f:
215 case ir_unop_bitcast_f2i:
216 case ir_unop_bitcast_f2u:
217 case ir_unop_bitcast_u2f:
218 case ir_unop_i2u:
219 case ir_unop_u2i:
220 case ir_unop_f2i:
221 case ir_unop_f2u:
222 case ir_unop_i2f:
223 case ir_unop_f2b:
224 case ir_unop_b2f:
225 case ir_unop_i2b:
226 case ir_unop_b2i:
227 case ir_unop_u2f:
228 case ir_unop_trunc:
229 case ir_unop_ceil:
230 case ir_unop_floor:
231 case ir_unop_fract:
232 case ir_unop_round_even:
233 case ir_unop_sin:
234 case ir_unop_cos:
235 case ir_unop_sin_reduced:
236 case ir_unop_cos_reduced:
237 case ir_unop_dFdx:
238 case ir_unop_dFdx_coarse:
239 case ir_unop_dFdx_fine:
240 case ir_unop_dFdy:
241 case ir_unop_dFdy_coarse:
242 case ir_unop_dFdy_fine:
243 case ir_unop_bitfield_reverse:
244 case ir_unop_bit_count:
245 case ir_unop_find_msb:
246 case ir_unop_find_lsb:
247 case ir_unop_saturate:
248 for (i = 0; i < vector_elements; i++) {
249 ir_rvalue *op0 = get_element(op_var[0], i);
250
251 assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
252 element_type,
253 op0,
254 NULL));
255 }
256 break;
257
258 case ir_binop_add:
259 case ir_binop_sub:
260 case ir_binop_mul:
261 case ir_binop_imul_high:
262 case ir_binop_div:
263 case ir_binop_carry:
264 case ir_binop_borrow:
265 case ir_binop_mod:
266 case ir_binop_min:
267 case ir_binop_max:
268 case ir_binop_pow:
269 case ir_binop_lshift:
270 case ir_binop_rshift:
271 case ir_binop_bit_and:
272 case ir_binop_bit_xor:
273 case ir_binop_bit_or:
274 case ir_binop_less:
275 case ir_binop_greater:
276 case ir_binop_lequal:
277 case ir_binop_gequal:
278 case ir_binop_equal:
279 case ir_binop_nequal:
280 for (i = 0; i < vector_elements; i++) {
281 ir_rvalue *op0 = get_element(op_var[0], i);
282 ir_rvalue *op1 = get_element(op_var[1], i);
283
284 assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
285 element_type,
286 op0,
287 op1));
288 }
289 break;
290
291 case ir_unop_any: {
292 ir_expression *temp;
293 temp = new(mem_ctx) ir_expression(ir_binop_logic_or,
294 element_type,
295 get_element(op_var[0], 0),
296 get_element(op_var[0], 1));
297
298 for (i = 2; i < vector_elements; i++) {
299 temp = new(mem_ctx) ir_expression(ir_binop_logic_or,
300 element_type,
301 get_element(op_var[0], i),
302 temp);
303 }
304 assign(ir, 0, temp);
305 break;
306 }
307
308 case ir_binop_dot: {
309 ir_expression *last = NULL;
310 for (i = 0; i < vector_elements; i++) {
311 ir_rvalue *op0 = get_element(op_var[0], i);
312 ir_rvalue *op1 = get_element(op_var[1], i);
313 ir_expression *temp;
314
315 temp = new(mem_ctx) ir_expression(ir_binop_mul,
316 element_type,
317 op0,
318 op1);
319 if (last) {
320 last = new(mem_ctx) ir_expression(ir_binop_add,
321 element_type,
322 temp,
323 last);
324 } else {
325 last = temp;
326 }
327 }
328 assign(ir, 0, last);
329 break;
330 }
331
332 case ir_binop_logic_and:
333 case ir_binop_logic_xor:
334 case ir_binop_logic_or:
335 ir->fprint(stderr);
336 fprintf(stderr, "\n");
337 unreachable("not reached: expression operates on scalars only");
338 case ir_binop_all_equal:
339 case ir_binop_any_nequal: {
340 ir_expression *last = NULL;
341 for (i = 0; i < vector_elements; i++) {
342 ir_rvalue *op0 = get_element(op_var[0], i);
343 ir_rvalue *op1 = get_element(op_var[1], i);
344 ir_expression *temp;
345 ir_expression_operation join;
346
347 if (expr->operation == ir_binop_all_equal)
348 join = ir_binop_logic_and;
349 else
350 join = ir_binop_logic_or;
351
352 temp = new(mem_ctx) ir_expression(expr->operation,
353 element_type,
354 op0,
355 op1);
356 if (last) {
357 last = new(mem_ctx) ir_expression(join,
358 element_type,
359 temp,
360 last);
361 } else {
362 last = temp;
363 }
364 }
365 assign(ir, 0, last);
366 break;
367 }
368 case ir_unop_noise:
369 unreachable("noise should have been broken down to function call");
370
371 case ir_binop_bfm: {
372 /* Does not need to be scalarized, since its result will be identical
373 * for all channels.
374 */
375 ir_rvalue *op0 = get_element(op_var[0], 0);
376 ir_rvalue *op1 = get_element(op_var[1], 0);
377
378 assign(ir, 0, new(mem_ctx) ir_expression(expr->operation,
379 element_type,
380 op0,
381 op1));
382 break;
383 }
384
385 case ir_binop_ubo_load:
386 unreachable("not yet supported");
387
388 case ir_triop_fma:
389 case ir_triop_lrp:
390 case ir_triop_csel:
391 case ir_triop_bitfield_extract:
392 for (i = 0; i < vector_elements; i++) {
393 ir_rvalue *op0 = get_element(op_var[0], i);
394 ir_rvalue *op1 = get_element(op_var[1], i);
395 ir_rvalue *op2 = get_element(op_var[2], i);
396
397 assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
398 element_type,
399 op0,
400 op1,
401 op2));
402 }
403 break;
404
405 case ir_triop_bfi: {
406 /* Only a single BFM is needed for multiple BFIs. */
407 ir_rvalue *op0 = get_element(op_var[0], 0);
408
409 for (i = 0; i < vector_elements; i++) {
410 ir_rvalue *op1 = get_element(op_var[1], i);
411 ir_rvalue *op2 = get_element(op_var[2], i);
412
413 assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
414 element_type,
415 op0->clone(mem_ctx, NULL),
416 op1,
417 op2));
418 }
419 break;
420 }
421
422 case ir_unop_pack_snorm_2x16:
423 case ir_unop_pack_snorm_4x8:
424 case ir_unop_pack_unorm_2x16:
425 case ir_unop_pack_unorm_4x8:
426 case ir_unop_pack_half_2x16:
427 case ir_unop_unpack_snorm_2x16:
428 case ir_unop_unpack_snorm_4x8:
429 case ir_unop_unpack_unorm_2x16:
430 case ir_unop_unpack_unorm_4x8:
431 case ir_unop_unpack_half_2x16:
432 case ir_binop_ldexp:
433 case ir_binop_vector_extract:
434 case ir_triop_vector_insert:
435 case ir_quadop_bitfield_insert:
436 case ir_quadop_vector:
437 unreachable("should have been lowered");
438
439 case ir_unop_unpack_half_2x16_split_x:
440 case ir_unop_unpack_half_2x16_split_y:
441 case ir_binop_pack_half_2x16_split:
442 case ir_unop_interpolate_at_centroid:
443 case ir_binop_interpolate_at_offset:
444 case ir_binop_interpolate_at_sample:
445 unreachable("not reached: expression operates on scalars only");
446
447 case ir_unop_pack_double_2x32:
448 case ir_unop_unpack_double_2x32:
449 case ir_unop_frexp_sig:
450 case ir_unop_frexp_exp:
451 case ir_unop_d2f:
452 case ir_unop_f2d:
453 case ir_unop_d2i:
454 case ir_unop_i2d:
455 case ir_unop_d2u:
456 case ir_unop_u2d:
457 case ir_unop_d2b:
458 unreachable("no fp64 support yet");
459 }
460
461 ir->remove();
462 this->progress = true;
463
464 return visit_continue;
465 }