i965: Fix DP write channel ordering on Sandybridge.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_optimize.c
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 #include "main/macros.h"
29 #include "program/program.h"
30 #include "program/prog_print.h"
31 #include "brw_context.h"
32 #include "brw_defines.h"
33 #include "brw_eu.h"
34
35 static const struct {
36 char *name;
37 int nsrc;
38 int ndst;
39 GLboolean is_arith;
40 } inst_opcode[128] = {
41 [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1, .is_arith = 1 },
42 [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1, .is_arith = 1 },
43 [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1, .is_arith = 1 },
44 [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1, .is_arith = 1 },
45 [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1, .is_arith = 1 },
46 [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1, .is_arith = 1 },
47 [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1, .is_arith = 1 },
48 [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
49
50 [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1, .is_arith = 1 },
51 [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1, .is_arith = 1 },
52 [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1, .is_arith = 1 },
53 [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1, .is_arith = 1 },
54 [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
55 [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
56 [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
57 [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
58 [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
59 [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
60 [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
61 [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
62
63 [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1, .is_arith = 1 },
64 [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1, .is_arith = 1 },
65 [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1, .is_arith = 1 },
66 [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1, .is_arith = 1 },
67 [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1, .is_arith = 1 },
68 [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1, .is_arith = 1 },
69 [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1, .is_arith = 1 },
70 [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1, .is_arith = 1 },
71 [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
72 [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
73 [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
74
75 [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
76 [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
77 [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
78 [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
79 [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
80 [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
81 [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
82 [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
83 [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
84 [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
85 [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
86 [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
87 [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
88 [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
89 [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
90 [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
91 [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
92 };
93
94 static INLINE
95 GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst)
96 {
97 return inst_opcode[inst->header.opcode].is_arith;
98 }
99
100 static const GLuint inst_stride[7] = {
101 [0] = 0,
102 [1] = 1,
103 [2] = 2,
104 [3] = 4,
105 [4] = 8,
106 [5] = 16,
107 [6] = 32
108 };
109
110 static const GLuint inst_type_size[8] = {
111 [BRW_REGISTER_TYPE_UD] = 4,
112 [BRW_REGISTER_TYPE_D] = 4,
113 [BRW_REGISTER_TYPE_UW] = 2,
114 [BRW_REGISTER_TYPE_W] = 2,
115 [BRW_REGISTER_TYPE_UB] = 1,
116 [BRW_REGISTER_TYPE_B] = 1,
117 [BRW_REGISTER_TYPE_F] = 4
118 };
119
120 static INLINE GLboolean
121 brw_is_grf_written(const struct brw_instruction *inst,
122 int reg_index, int size,
123 int gen)
124 {
125 if (inst_opcode[inst->header.opcode].ndst == 0)
126 return GL_FALSE;
127
128 if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
129 if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
130 return GL_TRUE;
131
132 if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
133 return GL_FALSE;
134
135 const int reg_start = reg_index * REG_SIZE;
136 const int reg_end = reg_start + size;
137
138 const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
139 const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
140 + inst->bits1.da1.dest_subreg_nr;
141 int length, write_end;
142
143 /* SEND is specific */
144 if (inst->header.opcode == BRW_OPCODE_SEND) {
145 if (gen >= 5)
146 length = inst->bits3.generic_gen5.response_length*REG_SIZE;
147 else
148 length = inst->bits3.generic.response_length*REG_SIZE;
149 }
150 else {
151 length = 1 << inst->header.execution_size;
152 length *= type_size;
153 length *= inst->bits1.da1.dest_horiz_stride;
154 }
155
156 /* If the two intervals intersect, we overwrite the register */
157 write_end = write_start + length;
158 const int left = MAX2(write_start, reg_start);
159 const int right = MIN2(write_end, reg_end);
160
161 return left < right;
162 }
163
164 /* Specific path for message register since we need to handle the compr4 case */
165 static INLINE GLboolean
166 brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
167 {
168 if (inst_opcode[inst->header.opcode].ndst == 0)
169 return GL_FALSE;
170
171 if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
172 if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE)
173 return GL_TRUE;
174
175 if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE)
176 return GL_FALSE;
177
178 const int reg_start = reg_index * REG_SIZE;
179 const int reg_end = reg_start + size;
180
181 const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
182 const int is_compr4 = inst->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4;
183 const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
184
185 /* We use compr4 with a size != 16 elements. Strange, we conservatively
186 * consider that we are writing the register.
187 */
188 if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
189 return GL_TRUE;
190
191 GLboolean is_written = GL_FALSE;
192
193 /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */
194 if (is_compr4) {
195 const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride;
196
197 /* First 8-way register */
198 const int write_start0 = mrf_index*REG_SIZE
199 + inst->bits1.da1.dest_subreg_nr;
200 const int write_end0 = write_start0 + length;
201
202 /* Second 8-way register */
203 const int write_start1 = (mrf_index+4)*REG_SIZE
204 + inst->bits1.da1.dest_subreg_nr;
205 const int write_end1 = write_start1 + length;
206
207 /* If the two intervals intersect, we overwrite the register */
208 const int left0 = MAX2(write_start0, reg_start);
209 const int right0 = MIN2(write_end0, reg_end);
210 const int left1 = MAX2(write_start1, reg_start);
211 const int right1 = MIN2(write_end1, reg_end);
212
213 is_written = left0 < right0 || left1 < right1;
214 }
215 else {
216 int length;
217 length = 1 << inst->header.execution_size;
218 length *= type_size;
219 length *= inst->bits1.da1.dest_horiz_stride;
220
221 /* If the two intervals intersect, we write into the register */
222 const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
223 + inst->bits1.da1.dest_subreg_nr;
224 const int write_end = write_start + length;
225 const int left = MAX2(write_start, reg_start);
226 const int right = MIN2(write_end, reg_end);;
227
228 is_written = left < right;
229 }
230
231 /* SEND may perform an implicit mov to a mrf register */
232 if (is_written == GL_FALSE &&
233 inst->header.opcode == BRW_OPCODE_SEND &&
234 inst->bits1.da1.src0_reg_file != 0) {
235
236 const int mrf_start = inst->header.destreg__conditionalmod;
237 const int write_start = mrf_start * REG_SIZE;
238 const int write_end = write_start + REG_SIZE;
239 const int left = MAX2(write_start, reg_start);
240 const int right = MIN2(write_end, reg_end);;
241 is_written = left < right;
242 }
243
244 return is_written;
245 }
246
247 static INLINE GLboolean
248 brw_is_mrf_read(const struct brw_instruction *inst,
249 int reg_index, int size, int gen)
250 {
251 if (inst->header.opcode != BRW_OPCODE_SEND)
252 return GL_FALSE;
253 if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
254 return GL_TRUE;
255
256 const int reg_start = reg_index*REG_SIZE;
257 const int reg_end = reg_start + size;
258
259 int length, read_start, read_end;
260 if (gen >= 5)
261 length = inst->bits3.generic_gen5.msg_length*REG_SIZE;
262 else
263 length = inst->bits3.generic.msg_length*REG_SIZE;
264
265 /* Look if SEND uses an implicit mov. In that case, we read one less register
266 * (but we write it)
267 */
268 if (inst->bits1.da1.src0_reg_file != 0)
269 read_start = inst->header.destreg__conditionalmod;
270 else {
271 length--;
272 read_start = inst->header.destreg__conditionalmod + 1;
273 }
274 read_start *= REG_SIZE;
275 read_end = read_start + length;
276
277 const int left = MAX2(read_start, reg_start);
278 const int right = MIN2(read_end, reg_end);
279
280 return left < right;
281 }
282
283 static INLINE GLboolean
284 brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
285 {
286 int i, j;
287 if (inst_opcode[inst->header.opcode].nsrc == 0)
288 return GL_FALSE;
289
290 /* Look at first source. We must take into account register regions to
291 * monitor carefully the read. Note that we are a bit too conservative here
292 * since we do not take into account the fact that some complete registers
293 * may be skipped
294 */
295 if (inst_opcode[inst->header.opcode].nsrc >= 1) {
296
297 if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
298 if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
299 return GL_TRUE;
300 if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE)
301 return GL_FALSE;
302
303 const int reg_start = reg_index*REG_SIZE;
304 const int reg_end = reg_start + size;
305
306 /* See if at least one of this element intersects the interval */
307 const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type];
308 const int elem_num = 1 << inst->header.execution_size;
309 const int width = 1 << inst->bits2.da1.src0_width;
310 const int row_num = elem_num >> inst->bits2.da1.src0_width;
311 const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride];
312 const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride];
313 int row_start = inst->bits2.da1.src0_reg_nr*REG_SIZE
314 + inst->bits2.da1.src0_subreg_nr;
315 for (j = 0; j < row_num; ++j) {
316 int write_start = row_start;
317 for (i = 0; i < width; ++i) {
318 const int write_end = write_start + type_size;
319 const int left = write_start > reg_start ? write_start : reg_start;
320 const int right = write_end < reg_end ? write_end : reg_end;
321 if (left < right)
322 return GL_TRUE;
323 write_start += hs;
324 }
325 row_start += vs;
326 }
327 }
328
329 /* Second src register */
330 if (inst_opcode[inst->header.opcode].nsrc >= 2) {
331
332 if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT)
333 if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
334 return GL_TRUE;
335 if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE)
336 return GL_FALSE;
337
338 const int reg_start = reg_index*REG_SIZE;
339 const int reg_end = reg_start + size;
340
341 /* See if at least one of this element intersects the interval */
342 const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type];
343 const int elem_num = 1 << inst->header.execution_size;
344 const int width = 1 << inst->bits3.da1.src1_width;
345 const int row_num = elem_num >> inst->bits3.da1.src1_width;
346 const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride];
347 const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride];
348 int row_start = inst->bits3.da1.src1_reg_nr*REG_SIZE
349 + inst->bits3.da1.src1_subreg_nr;
350 for (j = 0; j < row_num; ++j) {
351 int write_start = row_start;
352 for (i = 0; i < width; ++i) {
353 const int write_end = write_start + type_size;
354 const int left = write_start > reg_start ? write_start : reg_start;
355 const int right = write_end < reg_end ? write_end : reg_end;
356 if (left < right)
357 return GL_TRUE;
358 write_start += hs;
359 }
360 row_start += vs;
361 }
362 }
363
364 return GL_FALSE;
365 }
366
367 static INLINE GLboolean
368 brw_is_control_done(const struct brw_instruction *mov) {
369 return
370 mov->header.dependency_control != 0 ||
371 mov->header.thread_control != 0 ||
372 mov->header.mask_control != 0 ||
373 mov->header.saturate != 0 ||
374 mov->header.debug_control != 0;
375 }
376
377 static INLINE GLboolean
378 brw_is_predicated(const struct brw_instruction *mov) {
379 return mov->header.predicate_control != 0;
380 }
381
382 static INLINE GLboolean
383 brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
384 int *mrf_index,
385 int *grf_index,
386 GLboolean *is_compr4)
387 {
388 if (brw_is_predicated(mov) ||
389 brw_is_control_done(mov) ||
390 mov->header.debug_control != 0)
391 return GL_FALSE;
392
393 if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
394 mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
395 mov->bits1.da1.dest_reg_type != BRW_REGISTER_TYPE_F ||
396 mov->bits1.da1.dest_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
397 mov->bits1.da1.dest_subreg_nr != 0)
398 return GL_FALSE;
399
400 if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
401 mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
402 mov->bits1.da1.src0_reg_type != BRW_REGISTER_TYPE_F ||
403 mov->bits2.da1.src0_width != BRW_WIDTH_8 ||
404 mov->bits2.da1.src0_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
405 mov->bits2.da1.src0_vert_stride != BRW_VERTICAL_STRIDE_8 ||
406 mov->bits2.da1.src0_subreg_nr != 0 ||
407 mov->bits2.da1.src0_abs != 0 ||
408 mov->bits2.da1.src0_negate != 0)
409 return GL_FALSE;
410
411 *grf_index = mov->bits2.da1.src0_reg_nr;
412 *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
413 *is_compr4 = (mov->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4) != 0;
414 return GL_TRUE;
415 }
416
417 static INLINE GLboolean
418 brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
419 {
420 /* remark: no problem to predicate a SEL instruction */
421 if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) &&
422 brw_is_control_done(inst) == GL_FALSE &&
423 inst->header.execution_size == 4 &&
424 inst->header.access_mode == BRW_ALIGN_1 &&
425 inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
426 inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
427 inst->bits1.da1.dest_reg_type == BRW_REGISTER_TYPE_F &&
428 inst->bits1.da1.dest_horiz_stride == BRW_HORIZONTAL_STRIDE_1 &&
429 inst->bits1.da1.dest_reg_nr == grf_index &&
430 inst->bits1.da1.dest_subreg_nr == 0 &&
431 brw_is_arithmetic_inst(inst))
432 return GL_TRUE;
433
434 return GL_FALSE;
435 }
436
437 static INLINE GLboolean
438 brw_inst_are_equal(const struct brw_instruction *src0,
439 const struct brw_instruction *src1)
440 {
441 const GLuint *field0 = (GLuint *) src0;
442 const GLuint *field1 = (GLuint *) src1;
443 return field0[0] == field1[0] &&
444 field0[1] == field1[1] &&
445 field0[2] == field1[2] &&
446 field0[3] == field1[3];
447 }
448
449 static INLINE void
450 brw_inst_copy(struct brw_instruction *dst,
451 const struct brw_instruction *src)
452 {
453 GLuint *field_dst = (GLuint *) dst;
454 const GLuint *field_src = (GLuint *) src;
455 field_dst[0] = field_src[0];
456 field_dst[1] = field_src[1];
457 field_dst[2] = field_src[2];
458 field_dst[3] = field_src[3];
459 }
460
461 static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst)
462 {
463 int i, nr_insn = 0, to = 0, from = 0;
464
465 for (from = 0; from < p->nr_insn; ++from) {
466 if (removeInst[from])
467 continue;
468 if(to != from)
469 brw_inst_copy(p->store + to, p->store + from);
470 to++;
471 }
472
473 for (i = 0; i < p->nr_insn; ++i)
474 if (removeInst[i] == GL_FALSE)
475 nr_insn++;
476 p->nr_insn = nr_insn;
477 }
478
479 /* The gen code emitter generates a lot of duplications in the
480 * grf-to-mrf moves, for example when texture sampling with the same
481 * coordinates from multiple textures.. Here, we monitor same mov
482 * grf-to-mrf instrutions and remove repeated ones where the operands
483 * and dst ahven't changed in between.
484 */
485 void brw_remove_duplicate_mrf_moves(struct brw_compile *p)
486 {
487 const int gen = p->brw->intel.gen;
488 int i, j;
489
490 GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
491 for (i = 0; i < p->nr_insn; i++) {
492 if (removeInst[i])
493 continue;
494
495 const struct brw_instruction *mov = p->store + i;
496 int mrf_index, grf_index;
497 GLboolean is_compr4;
498
499 /* Only consider _straight_ grf-to-mrf moves */
500 if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
501 continue;
502
503 const int mrf_index0 = mrf_index;
504 const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
505 const int simd16_size = 2 * REG_SIZE;
506
507 for (j = i + 1; j < p->nr_insn; j++) {
508 const struct brw_instruction *inst = p->store + j;
509
510 if (brw_inst_are_equal(mov, inst)) {
511 removeInst[j] = GL_TRUE;
512 continue;
513 }
514
515 if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
516 brw_is_mrf_written(inst, mrf_index0, REG_SIZE) ||
517 brw_is_mrf_written(inst, mrf_index1, REG_SIZE))
518 break;
519 }
520 }
521
522 brw_remove_inst(p, removeInst);
523 free(removeInst);
524 }
525
526 /* Replace moves to MRFs where the value moved is the result of a
527 * normal arithmetic operation with computation right into the MRF.
528 */
529 void brw_remove_grf_to_mrf_moves(struct brw_compile *p)
530 {
531 int i, j, prev;
532 struct brw_context *brw = p->brw;
533 const int gen = brw->intel.gen;
534 const int simd16_size = 2*REG_SIZE;
535
536 GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn);
537 assert(removeInst);
538
539 for (i = 0; i < p->nr_insn; i++) {
540 if (removeInst[i])
541 continue;
542
543 struct brw_instruction *grf_inst = NULL;
544 const struct brw_instruction *mov = p->store + i;
545 int mrf_index, grf_index;
546 GLboolean is_compr4;
547
548 /* Only consider _straight_ grf-to-mrf moves */
549 if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
550 continue;
551
552 /* Using comp4 enables a stride of 4 for this instruction */
553 const int mrf_index0 = mrf_index;
554 const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
555
556 /* Look where the register has been set */
557 prev = i;
558 GLboolean potential_remove = GL_FALSE;
559 while (prev--) {
560
561 /* If _one_ instruction writes the grf, we try to remove the mov */
562 struct brw_instruction *inst = p->store + prev;
563 if (brw_is_grf_straight_write(inst, grf_index)) {
564 potential_remove = GL_TRUE;
565 grf_inst = inst;
566 break;
567 }
568
569 }
570
571 if (potential_remove == GL_FALSE)
572 continue;
573 removeInst[i] = GL_TRUE;
574
575 /* Monitor first the section of code between the grf computation and the
576 * mov. Here we cannot read or write both mrf and grf register
577 */
578 for (j = prev + 1; j < i; ++j) {
579 struct brw_instruction *inst = p->store + j;
580 if (removeInst[j])
581 continue;
582 if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
583 brw_is_grf_read(inst, grf_index, simd16_size) ||
584 brw_is_mrf_written(inst, mrf_index0, REG_SIZE) ||
585 brw_is_mrf_written(inst, mrf_index1, REG_SIZE) ||
586 brw_is_mrf_read(inst, mrf_index0, REG_SIZE, gen) ||
587 brw_is_mrf_read(inst, mrf_index1, REG_SIZE, gen)) {
588 removeInst[i] = GL_FALSE;
589 break;
590 }
591 }
592
593 /* After the mov, we can read or write the mrf. If the grf is overwritten,
594 * we are done
595 */
596 for (j = i + 1; j < p->nr_insn; ++j) {
597 struct brw_instruction *inst = p->store + j;
598 if (removeInst[j])
599 continue;
600
601 if (brw_is_grf_read(inst, grf_index, simd16_size)) {
602 removeInst[i] = GL_FALSE;
603 break;
604 }
605
606 if (brw_is_grf_straight_write(inst, grf_index))
607 break;
608 }
609
610 /* Note that with the top down traversal, we can safely pacth the mov
611 * instruction
612 */
613 if (removeInst[i]) {
614 grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
615 grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
616 }
617 }
618
619 brw_remove_inst(p, removeInst);
620 free(removeInst);
621 }
622
623 static GLboolean
624 is_single_channel_dp4(struct brw_instruction *insn)
625 {
626 if (insn->header.opcode != BRW_OPCODE_DP4 ||
627 insn->header.execution_size != BRW_EXECUTE_8 ||
628 insn->header.access_mode != BRW_ALIGN_16 ||
629 insn->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
630 return GL_FALSE;
631
632 if (!is_power_of_two(insn->bits1.da16.dest_writemask))
633 return GL_FALSE;
634
635 return GL_TRUE;
636 }
637
638 /**
639 * Sets the dependency control fields on DP4 instructions.
640 *
641 * The hardware only tracks dependencies on a register basis, so when
642 * you do:
643 *
644 * DP4 dst.x src1 src2
645 * DP4 dst.y src1 src3
646 * DP4 dst.z src1 src4
647 * DP4 dst.w src1 src5
648 *
649 * It will wait to do the DP4 dst.y until the dst.x is resolved, etc.
650 * We can examine our instruction stream and set the dependency
651 * control fields to tell the hardware when to do it.
652 *
653 * We may want to extend this to other instructions that are used to
654 * fill in a channel at a time of the destination register.
655 */
656 static void
657 brw_set_dp4_dependency_control(struct brw_compile *p)
658 {
659 int i;
660
661 for (i = 1; i < p->nr_insn; i++) {
662 struct brw_instruction *insn = &p->store[i];
663 struct brw_instruction *prev = &p->store[i - 1];
664
665 if (!is_single_channel_dp4(prev))
666 continue;
667
668 if (!is_single_channel_dp4(insn)) {
669 i++;
670 continue;
671 }
672
673 /* Only avoid hw dep control if the write masks are different
674 * channels of one reg.
675 */
676 if (insn->bits1.da16.dest_writemask == prev->bits1.da16.dest_writemask)
677 continue;
678 if (insn->bits1.da16.dest_reg_nr != prev->bits1.da16.dest_reg_nr)
679 continue;
680
681 /* Check if the second instruction depends on the previous one
682 * for a src.
683 */
684 if (insn->bits1.da1.src0_reg_file == BRW_GENERAL_REGISTER_FILE &&
685 (insn->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
686 insn->bits2.da1.src0_reg_nr == insn->bits1.da16.dest_reg_nr))
687 continue;
688 if (insn->bits1.da1.src1_reg_file == BRW_GENERAL_REGISTER_FILE &&
689 (insn->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT ||
690 insn->bits3.da1.src1_reg_nr == insn->bits1.da16.dest_reg_nr))
691 continue;
692
693 prev->header.dependency_control |= BRW_DEPENDENCY_NOTCLEARED;
694 insn->header.dependency_control |= BRW_DEPENDENCY_NOTCHECKED;
695 }
696 }
697
/* Entry point of the post-emit optimizations run on the instruction store
 * of p.  Only the DP4 dependency control pass is run.
 */
void
brw_optimize(struct brw_compile *p)
{
   brw_set_dp4_dependency_control(p);
}