39ea95f6fc2badfe0302a04aadca701db172c403
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/**
 * Identifiers of the shared subroutines.  The values are used as indices
 * into c->subroutines[] by invoke_subroutine().
 */
enum _subroutine {
   SUB_NOISE1,
   SUB_NOISE2,
   SUB_NOISE3,
   SUB_NOISE4
};
12
13
14 /**
15 * Determine if the given fragment program uses GLSL features such
16 * as flow conditionals, loops, subroutines.
17 * Some GLSL shaders may use these features, others might not.
18 */
19 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
20 {
21 int i;
22 for (i = 0; i < fp->Base.NumInstructions; i++) {
23 const struct prog_instruction *inst = &fp->Base.Instructions[i];
24 switch (inst->Opcode) {
25 case OPCODE_IF:
26 case OPCODE_TRUNC:
27 case OPCODE_ENDIF:
28 case OPCODE_CAL:
29 case OPCODE_BRK:
30 case OPCODE_RET:
31 case OPCODE_DDX:
32 case OPCODE_DDY:
33 case OPCODE_NOISE1:
34 case OPCODE_NOISE2:
35 case OPCODE_NOISE3:
36 case OPCODE_NOISE4:
37 case OPCODE_BGNLOOP:
38 return GL_TRUE;
39 default:
40 break;
41 }
42 }
43 return GL_FALSE;
44 }
45
46
47
/* Forward declaration: alloc_grf() below calls reclaim_temps(), which is
 * defined later in this file.
 */
static void
reclaim_temps(struct brw_wm_compile *c);
50
51
52 /** Mark GRF register as used. */
53 static void
54 prealloc_grf(struct brw_wm_compile *c, int r)
55 {
56 c->used_grf[r] = GL_TRUE;
57 }
58
59
60 /** Mark given GRF register as not in use. */
61 static void
62 release_grf(struct brw_wm_compile *c, int r)
63 {
64 /*assert(c->used_grf[r]);*/
65 c->used_grf[r] = GL_FALSE;
66 c->first_free_grf = MIN2(c->first_free_grf, r);
67 }
68
69
70 /** Return index of a free GRF, mark it as used. */
71 static int
72 alloc_grf(struct brw_wm_compile *c)
73 {
74 GLuint r;
75 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
76 if (!c->used_grf[r]) {
77 c->used_grf[r] = GL_TRUE;
78 c->first_free_grf = r + 1; /* a guess */
79 return r;
80 }
81 }
82
83 /* no free temps, try to reclaim some */
84 reclaim_temps(c);
85 c->first_free_grf = 0;
86
87 /* try alloc again */
88 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
89 if (!c->used_grf[r]) {
90 c->used_grf[r] = GL_TRUE;
91 c->first_free_grf = r + 1; /* a guess */
92 return r;
93 }
94 }
95
96 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
97 assert(c->used_grf[r]);
98 }
99 /*printf("Really out of temp regs!\n");*/
100 return 60;
101 }
102
103
104 /** Return number of GRF registers used */
105 static int
106 num_grf_used(const struct brw_wm_compile *c)
107 {
108 int r;
109 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
110 if (c->used_grf[r])
111 return r + 1;
112 return 0;
113 }
114
115
116
117 /**
118 * Record the mapping of a Mesa register to a hardware register.
119 */
120 static void set_reg(struct brw_wm_compile *c, int file, int index,
121 int component, struct brw_reg reg)
122 {
123 c->wm_regs[file][index][component].reg = reg;
124 c->wm_regs[file][index][component].inited = GL_TRUE;
125 }
126
127 /**
128 * Examine instruction's write mask to find index of first component
129 * enabled for writing.
130 */
131 static int get_scalar_dst_index(const struct prog_instruction *inst)
132 {
133 int i;
134 for (i = 0; i < 4; i++)
135 if (inst->DstReg.WriteMask & (1<<i))
136 break;
137 return i;
138 }
139
140 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
141 {
142 struct brw_reg reg;
143
144 /* if we need to allocate another temp, grow the tmp_regs[] array */
145 if (c->tmp_index == c->tmp_max) {
146 c->tmp_regs[ c->tmp_max++ ] = alloc_grf(c);
147 }
148
149 /* form the GRF register */
150 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
151 /*printf("alloc_temp %d\n", reg.nr);*/
152 assert(reg.nr < BRW_WM_MAX_GRF);
153 return reg;
154
155 }
156
157 /**
158 * Save current temp register info.
159 * There must be a matching call to release_tmps().
160 */
161 static int mark_tmps(struct brw_wm_compile *c)
162 {
163 return c->tmp_index;
164 }
165
166 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
167 {
168 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
169 }
170
171 static void release_tmps(struct brw_wm_compile *c, int mark)
172 {
173 c->tmp_index = mark;
174 }
175
176 /**
177 * Convert Mesa src register to brw register.
178 *
179 * Since we're running in SOA mode each Mesa register corresponds to four
180 * hardware registers. We allocate the hardware registers as needed here.
181 *
182 * \param file register file, one of PROGRAM_x
183 * \param index register number
184 * \param component src component (X=0, Y=1, Z=2, W=3)
185 * \param nr not used?!?
186 * \param neg negate value?
187 * \param abs take absolute value?
188 */
189 static struct brw_reg
190 get_reg(struct brw_wm_compile *c, int file, int index, int component,
191 int nr, GLuint neg, GLuint abs)
192 {
193 struct brw_reg reg;
194 switch (file) {
195 case PROGRAM_STATE_VAR:
196 case PROGRAM_CONSTANT:
197 case PROGRAM_UNIFORM:
198 file = PROGRAM_STATE_VAR;
199 break;
200 case PROGRAM_UNDEFINED:
201 return brw_null_reg();
202 case PROGRAM_TEMPORARY:
203 case PROGRAM_INPUT:
204 case PROGRAM_OUTPUT:
205 case PROGRAM_PAYLOAD:
206 break;
207 default:
208 _mesa_problem(NULL, "Unexpected file in get_reg()");
209 return brw_null_reg();
210 }
211
212 assert(index < 256);
213 /* see if we've already allocated a HW register for this Mesa register */
214 if (c->wm_regs[file][index][component].inited) {
215 /* yes, re-use */
216 reg = c->wm_regs[file][index][component].reg;
217 }
218 else {
219 /* no, allocate new register */
220 int grf = alloc_grf(c);
221 if (grf < 0) {
222 /* totally out of temps */
223 grf = 70; /* XXX !!!! */
224 }
225
226 reg = brw_vec8_grf(grf, 0);
227 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
228
229 set_reg(c, file, index, component, reg);
230 }
231
232 if (neg & (1 << component)) {
233 reg = negate(reg);
234 }
235 if (abs)
236 reg = brw_abs(reg);
237 return reg;
238 }
239
240
241
242 /**
243 * This is called if we run out of GRF registers. Examine the live intervals
244 * of temp regs in the program and free those which won't be used again.
245 */
246 static void
247 reclaim_temps(struct brw_wm_compile *c)
248 {
249 GLint intBegin[MAX_PROGRAM_TEMPS];
250 GLint intEnd[MAX_PROGRAM_TEMPS];
251 int index;
252
253 /*printf("Reclaim temps:\n");*/
254
255 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
256 intBegin, intEnd);
257
258 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
259 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
260 /* program temp[i] can be freed */
261 int component;
262 /*printf(" temp[%d] is dead\n", index);*/
263 for (component = 0; component < 4; component++) {
264 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
265 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
266 release_grf(c, r);
267 /*
268 printf(" Reclaim temp %d, reg %d at inst %d\n",
269 index, r, c->cur_inst);
270 */
271 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
272 }
273 }
274 }
275 }
276 }
277
278
279
280
281 /**
282 * Preallocate registers. This sets up the Mesa to hardware register
283 * mapping for certain registers, such as constants (uniforms/state vars)
284 * and shader inputs.
285 */
286 static void prealloc_reg(struct brw_wm_compile *c)
287 {
288 int i, j;
289 struct brw_reg reg;
290 int nr_interp_regs = 0;
291 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
292 GLuint reg_index = 0;
293
294 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
295 c->first_free_grf = 0;
296
297 for (i = 0; i < 4; i++) {
298 if (i < c->key.nr_depth_regs)
299 reg = brw_vec8_grf(i * 2, 0);
300 else
301 reg = brw_vec8_grf(0, 0);
302 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
303 }
304 reg_index += 2 * c->key.nr_depth_regs;
305
306 /* constants */
307 {
308 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
309 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
310
311 /* use a real constant buffer, or just use a section of the GRF? */
312 /* XXX this heuristic may need adjustment... */
313 if ((nr_params + nr_temps) * 4 + reg_index > 80)
314 c->fp->use_const_buffer = GL_TRUE;
315 else
316 c->fp->use_const_buffer = GL_FALSE;
317 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
318
319 if (c->fp->use_const_buffer) {
320 /* We'll use a real constant buffer and fetch constants from
321 * it with a dataport read message.
322 */
323
324 /* number of float constants in CURBE */
325 c->prog_data.nr_params = 0;
326 }
327 else {
328 const struct gl_program_parameter_list *plist =
329 c->fp->program.Base.Parameters;
330 int index = 0;
331
332 /* number of float constants in CURBE */
333 c->prog_data.nr_params = 4 * nr_params;
334
335 /* loop over program constants (float[4]) */
336 for (i = 0; i < nr_params; i++) {
337 /* loop over XYZW channels */
338 for (j = 0; j < 4; j++, index++) {
339 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
340 /* Save pointer to parameter/constant value.
341 * Constants will be copied in prepare_constant_buffer()
342 */
343 c->prog_data.param[index] = &plist->ParameterValues[i][j];
344 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
345 }
346 }
347 /* number of constant regs used (each reg is float[8]) */
348 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
349 reg_index += c->nr_creg;
350 }
351 }
352
353 /* fragment shader inputs */
354 for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
355 if (inputs & (1<<i)) {
356 nr_interp_regs++;
357 reg = brw_vec8_grf(reg_index, 0);
358 for (j = 0; j < 4; j++)
359 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
360 reg_index += 2;
361 }
362 }
363
364 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
365 c->prog_data.urb_read_length = nr_interp_regs * 2;
366 c->prog_data.curb_read_length = c->nr_creg;
367 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
368 reg_index++;
369 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
370 reg_index += 2;
371
372 /* mark GRF regs [0..reg_index-1] as in-use */
373 for (i = 0; i < reg_index; i++)
374 prealloc_grf(c, i);
375
376 /* An instruction may reference up to three constants.
377 * They'll be found in these registers.
378 * XXX alloc these on demand!
379 */
380 if (c->fp->use_const_buffer) {
381 for (i = 0; i < 3; i++) {
382 c->current_const[i].index = -1;
383 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
384 }
385 }
386 #if 0
387 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
388 printf("AFTER PRE_ALLOC, reg_index = %d\n", c->reg_index);
389 #endif
390 }
391
392
393 /**
394 * Check if any of the instruction's src registers are constants, uniforms,
395 * or statevars. If so, fetch any constants that we don't already have in
396 * the three GRF slots.
397 */
398 static void fetch_constants(struct brw_wm_compile *c,
399 const struct prog_instruction *inst)
400 {
401 struct brw_compile *p = &c->func;
402 GLuint i;
403
404 /* loop over instruction src regs */
405 for (i = 0; i < 3; i++) {
406 const struct prog_src_register *src = &inst->SrcReg[i];
407 if (src->File == PROGRAM_STATE_VAR ||
408 src->File == PROGRAM_CONSTANT ||
409 src->File == PROGRAM_UNIFORM) {
410 if (c->current_const[i].index != src->Index) {
411 c->current_const[i].index = src->Index;
412
413 #if 0
414 printf(" fetch const[%d] for arg %d into reg %d\n",
415 src->Index, i, c->current_const[i].reg.nr);
416 #endif
417
418 /* need to fetch the constant now */
419 brw_dp_READ_4(p,
420 c->current_const[i].reg, /* writeback dest */
421 1, /* msg_reg */
422 src->RelAddr, /* relative indexing? */
423 16 * src->Index, /* byte offset */
424 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
425 );
426 }
427 }
428 }
429 }
430
431
432 /**
433 * Convert Mesa dst register to brw register.
434 */
435 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
436 const struct prog_instruction *inst,
437 GLuint component)
438 {
439 const int nr = 1;
440 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
441 0, 0);
442 }
443
444
445 static struct brw_reg
446 get_src_reg_const(struct brw_wm_compile *c,
447 const struct prog_instruction *inst,
448 GLuint srcRegIndex, GLuint component)
449 {
450 /* We should have already fetched the constant from the constant
451 * buffer in fetch_constants(). Now we just have to return a
452 * register description that extracts the needed component and
453 * smears it across all eight vector components.
454 */
455 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
456 struct brw_reg const_reg;
457
458 assert(component < 4);
459 assert(srcRegIndex < 3);
460 assert(c->current_const[srcRegIndex].index != -1);
461 const_reg = c->current_const[srcRegIndex].reg;
462
463 /* extract desired float from the const_reg, and smear */
464 const_reg = stride(const_reg, 0, 1, 0);
465 const_reg.subnr = component * 4;
466
467 if (src->Negate & (1 << component))
468 const_reg = negate(const_reg);
469 if (src->Abs)
470 const_reg = brw_abs(const_reg);
471
472 #if 0
473 printf(" form const[%d].%d for arg %d, reg %d\n",
474 c->current_const[srcRegIndex].index,
475 component,
476 srcRegIndex,
477 const_reg.nr);
478 #endif
479
480 return const_reg;
481 }
482
483
484 /**
485 * Convert Mesa src register to brw register.
486 */
487 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
488 const struct prog_instruction *inst,
489 GLuint srcRegIndex, GLuint channel)
490 {
491 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
492 const GLuint nr = 1;
493 const GLuint component = GET_SWZ(src->Swizzle, channel);
494
495 if (c->fp->use_const_buffer &&
496 (src->File == PROGRAM_STATE_VAR ||
497 src->File == PROGRAM_CONSTANT ||
498 src->File == PROGRAM_UNIFORM)) {
499 return get_src_reg_const(c, inst, srcRegIndex, component);
500 }
501 else {
502 /* other type of source register */
503 return get_reg(c, src->File, src->Index, component, nr,
504 src->Negate, src->Abs);
505 }
506 }
507
508
509 /**
510 * Same as \sa get_src_reg() but if the register is a literal, emit
511 * a brw_reg encoding the literal.
512 * Note that a brw instruction only allows one src operand to be a literal.
513 * For instructions with more than one operand, only the second can be a
514 * literal. This means that we treat some literals as constants/uniforms
515 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
516 *
517 */
518 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
519 const struct prog_instruction *inst,
520 GLuint srcRegIndex, GLuint channel)
521 {
522 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
523 if (src->File == PROGRAM_CONSTANT) {
524 /* a literal */
525 const int component = GET_SWZ(src->Swizzle, channel);
526 const GLfloat *param =
527 c->fp->program.Base.Parameters->ParameterValues[src->Index];
528 GLfloat value = param[component];
529 if (src->Negate & (1 << channel))
530 value = -value;
531 if (src->Abs)
532 value = FABSF(value);
533 #if 0
534 printf(" form immed value %f for chan %d\n", value, channel);
535 #endif
536 return brw_imm_f(value);
537 }
538 else {
539 return get_src_reg(c, inst, srcRegIndex, channel);
540 }
541 }
542
543
/**
 * Subroutines are minimal support for reusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 *
 * \param c           compile state; c->subroutines[] remembers where each
 *                    subroutine body starts (0 means "not emitted yet")
 * \param subroutine  which subroutine to invoke
 * \param emit        callback that emits the subroutine body; only called
 *                    the first time \p subroutine is invoked
 */
static void invoke_subroutine( struct brw_wm_compile *c,
                               enum _subroutine subroutine,
                               void (*emit)( struct brw_wm_compile * ) )
{
    struct brw_compile *p = &c->func;

    assert( subroutine < BRW_WM_MAX_SUBROUTINE );

    if( c->subroutines[ subroutine ] ) {
        /* subroutine previously emitted: reuse existing instructions */

        int mark = mark_tmps( c );
        struct brw_reg return_address = retype( alloc_tmp( c ),
                                                BRW_REGISTER_TYPE_UD );
        /* instruction count before the two ADDs below are emitted */
        int here = p->nr_insn;

        brw_push_insn_state(p);
        brw_set_mask_control(p, BRW_MASK_DISABLE);
        /* return address = IP + (2 << 4); presumably the "<< 4" converts an
         * instruction count to bytes and the result points just past the
         * jump below - confirm against the hardware docs.
         */
        brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

        /* jump: add the (scaled) distance from the jump instruction to the
         * recorded start of the subroutine body to IP
         */
        brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
                 brw_imm_d( ( c->subroutines[ subroutine ] -
                              here - 1 ) << 4 ) );
        brw_pop_insn_state(p);

        release_tmps( c, mark );
    } else {
        /* previously unused subroutine: emit, and mark for later reuse */

        int mark = mark_tmps( c );
        struct brw_reg return_address = retype( alloc_tmp( c ),
                                                BRW_REGISTER_TYPE_UD );
        struct brw_instruction *calc;
        /* index of the "calc" instruction emitted next */
        int base = p->nr_insn;

        brw_push_insn_state(p);
        brw_set_mask_control(p, BRW_MASK_DISABLE);
        /* return address = IP + <offset>; the offset is not known until the
         * body has been emitted, so 0 is used as a placeholder and patched
         * at the end of this branch
         */
        calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
        brw_pop_insn_state(p);

        /* the body starts at the next instruction; remember it for reuse */
        c->subroutines[ subroutine ] = p->nr_insn;

        emit( c );

        /* return: load the saved address back into IP */
        brw_push_insn_state(p);
        brw_set_mask_control(p, BRW_MASK_DISABLE);
        brw_MOV( p, brw_ip_reg(), return_address );
        brw_pop_insn_state(p);

        /* patch the placeholder: distance from "calc" to the instruction
         * following the return MOV, scaled like the other IP offsets
         */
        brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );

        release_tmps( c, mark );
    }
}
607
608 static void emit_abs( struct brw_wm_compile *c,
609 const struct prog_instruction *inst)
610 {
611 int i;
612 struct brw_compile *p = &c->func;
613 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
614 for (i = 0; i < 4; i++) {
615 if (inst->DstReg.WriteMask & (1<<i)) {
616 struct brw_reg src, dst;
617 dst = get_dst_reg(c, inst, i);
618 src = get_src_reg(c, inst, 0, i);
619 brw_MOV(p, dst, brw_abs(src));
620 }
621 }
622 brw_set_saturate(p, 0);
623 }
624
625 static void emit_trunc( struct brw_wm_compile *c,
626 const struct prog_instruction *inst)
627 {
628 int i;
629 struct brw_compile *p = &c->func;
630 GLuint mask = inst->DstReg.WriteMask;
631 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
632 for (i = 0; i < 4; i++) {
633 if (mask & (1<<i)) {
634 struct brw_reg src, dst;
635 dst = get_dst_reg(c, inst, i);
636 src = get_src_reg(c, inst, 0, i);
637 brw_RNDZ(p, dst, src);
638 }
639 }
640 brw_set_saturate(p, 0);
641 }
642
643 static void emit_mov( struct brw_wm_compile *c,
644 const struct prog_instruction *inst)
645 {
646 int i;
647 struct brw_compile *p = &c->func;
648 GLuint mask = inst->DstReg.WriteMask;
649 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
650 for (i = 0; i < 4; i++) {
651 if (mask & (1<<i)) {
652 struct brw_reg src, dst;
653 dst = get_dst_reg(c, inst, i);
654 /* XXX some moves from immediate value don't work reliably!!! */
655 /*src = get_src_reg_imm(c, inst, 0, i);*/
656 src = get_src_reg(c, inst, 0, i);
657 brw_MOV(p, dst, src);
658 }
659 }
660 brw_set_saturate(p, 0);
661 }
662
663 static void emit_pixel_xy(struct brw_wm_compile *c,
664 const struct prog_instruction *inst)
665 {
666 struct brw_reg r1 = brw_vec1_grf(1, 0);
667 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
668
669 struct brw_reg dst0, dst1;
670 struct brw_compile *p = &c->func;
671 GLuint mask = inst->DstReg.WriteMask;
672
673 dst0 = get_dst_reg(c, inst, 0);
674 dst1 = get_dst_reg(c, inst, 1);
675 /* Calculate pixel centers by adding 1 or 0 to each of the
676 * micro-tile coordinates passed in r1.
677 */
678 if (mask & WRITEMASK_X) {
679 brw_ADD(p,
680 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
681 stride(suboffset(r1_uw, 4), 2, 4, 0),
682 brw_imm_v(0x10101010));
683 }
684
685 if (mask & WRITEMASK_Y) {
686 brw_ADD(p,
687 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
688 stride(suboffset(r1_uw, 5), 2, 4, 0),
689 brw_imm_v(0x11001100));
690 }
691 }
692
693 static void emit_delta_xy(struct brw_wm_compile *c,
694 const struct prog_instruction *inst)
695 {
696 struct brw_reg r1 = brw_vec1_grf(1, 0);
697 struct brw_reg dst0, dst1, src0, src1;
698 struct brw_compile *p = &c->func;
699 GLuint mask = inst->DstReg.WriteMask;
700
701 dst0 = get_dst_reg(c, inst, 0);
702 dst1 = get_dst_reg(c, inst, 1);
703 src0 = get_src_reg(c, inst, 0, 0);
704 src1 = get_src_reg(c, inst, 0, 1);
705 /* Calc delta X,Y by subtracting origin in r1 from the pixel
706 * centers.
707 */
708 if (mask & WRITEMASK_X) {
709 brw_ADD(p,
710 dst0,
711 retype(src0, BRW_REGISTER_TYPE_UW),
712 negate(r1));
713 }
714
715 if (mask & WRITEMASK_Y) {
716 brw_ADD(p,
717 dst1,
718 retype(src1, BRW_REGISTER_TYPE_UW),
719 negate(suboffset(r1,1)));
720
721 }
722 }
723
724 static void fire_fb_write( struct brw_wm_compile *c,
725 GLuint base_reg,
726 GLuint nr,
727 GLuint target,
728 GLuint eot)
729 {
730 struct brw_compile *p = &c->func;
731 /* Pass through control information:
732 */
733 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
734 {
735 brw_push_insn_state(p);
736 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
737 brw_MOV(p,
738 brw_message_reg(base_reg + 1),
739 brw_vec8_grf(1, 0));
740 brw_pop_insn_state(p);
741 }
742 /* Send framebuffer write message: */
743 brw_fb_WRITE(p,
744 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
745 base_reg,
746 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
747 target,
748 nr,
749 0,
750 eot);
751 }
752
753 static void emit_fb_write(struct brw_wm_compile *c,
754 const struct prog_instruction *inst)
755 {
756 struct brw_compile *p = &c->func;
757 int nr = 2;
758 int channel;
759 GLuint target, eot;
760 struct brw_reg src0;
761
762 /* Reserve a space for AA - may not be needed:
763 */
764 if (c->key.aa_dest_stencil_reg)
765 nr += 1;
766
767 brw_push_insn_state(p);
768 for (channel = 0; channel < 4; channel++) {
769 src0 = get_src_reg(c, inst, 0, channel);
770 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
771 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
772 brw_MOV(p, brw_message_reg(nr + channel), src0);
773 }
774 /* skip over the regs populated above: */
775 nr += 8;
776 brw_pop_insn_state(p);
777
778 if (c->key.source_depth_to_render_target) {
779 if (c->key.computes_depth) {
780 src0 = get_src_reg(c, inst, 2, 2);
781 brw_MOV(p, brw_message_reg(nr), src0);
782 }
783 else {
784 src0 = get_src_reg(c, inst, 1, 1);
785 brw_MOV(p, brw_message_reg(nr), src0);
786 }
787
788 nr += 2;
789 }
790
791 if (c->key.dest_depth_reg) {
792 GLuint comp = c->key.dest_depth_reg / 2;
793 GLuint off = c->key.dest_depth_reg % 2;
794
795 assert(comp == 1);
796 assert(off == 0);
797 #if 0
798 /* XXX do we need this code? comp always 1, off always 0, it seems */
799 if (off != 0) {
800 brw_push_insn_state(p);
801 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
802
803 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
804 /* 2nd half? */
805 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
806 brw_pop_insn_state(p);
807 }
808 else
809 #endif
810 {
811 struct brw_reg src = get_src_reg(c, inst, 1, 1);
812 brw_MOV(p, brw_message_reg(nr), src);
813 }
814 nr += 2;
815 }
816
817 target = inst->Aux >> 1;
818 eot = inst->Aux & 1;
819 fire_fb_write(c, 0, nr, target, eot);
820 }
821
822 static void emit_pixel_w( struct brw_wm_compile *c,
823 const struct prog_instruction *inst)
824 {
825 struct brw_compile *p = &c->func;
826 GLuint mask = inst->DstReg.WriteMask;
827 if (mask & WRITEMASK_W) {
828 struct brw_reg dst, src0, delta0, delta1;
829 struct brw_reg interp3;
830
831 dst = get_dst_reg(c, inst, 3);
832 src0 = get_src_reg(c, inst, 0, 0);
833 delta0 = get_src_reg(c, inst, 1, 0);
834 delta1 = get_src_reg(c, inst, 1, 1);
835
836 interp3 = brw_vec1_grf(src0.nr+1, 4);
837 /* Calc 1/w - just linterp wpos[3] optimized by putting the
838 * result straight into a message reg.
839 */
840 brw_LINE(p, brw_null_reg(), interp3, delta0);
841 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
842
843 /* Calc w */
844 brw_math_16( p, dst,
845 BRW_MATH_FUNCTION_INV,
846 BRW_MATH_SATURATE_NONE,
847 2, brw_null_reg(),
848 BRW_MATH_PRECISION_FULL);
849 }
850 }
851
852 static void emit_linterp(struct brw_wm_compile *c,
853 const struct prog_instruction *inst)
854 {
855 struct brw_compile *p = &c->func;
856 GLuint mask = inst->DstReg.WriteMask;
857 struct brw_reg interp[4];
858 struct brw_reg dst, delta0, delta1;
859 struct brw_reg src0;
860 GLuint nr, i;
861
862 src0 = get_src_reg(c, inst, 0, 0);
863 delta0 = get_src_reg(c, inst, 1, 0);
864 delta1 = get_src_reg(c, inst, 1, 1);
865 nr = src0.nr;
866
867 interp[0] = brw_vec1_grf(nr, 0);
868 interp[1] = brw_vec1_grf(nr, 4);
869 interp[2] = brw_vec1_grf(nr+1, 0);
870 interp[3] = brw_vec1_grf(nr+1, 4);
871
872 for(i = 0; i < 4; i++ ) {
873 if (mask & (1<<i)) {
874 dst = get_dst_reg(c, inst, i);
875 brw_LINE(p, brw_null_reg(), interp[i], delta0);
876 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
877 }
878 }
879 }
880
881 static void emit_cinterp(struct brw_wm_compile *c,
882 const struct prog_instruction *inst)
883 {
884 struct brw_compile *p = &c->func;
885 GLuint mask = inst->DstReg.WriteMask;
886
887 struct brw_reg interp[4];
888 struct brw_reg dst, src0;
889 GLuint nr, i;
890
891 src0 = get_src_reg(c, inst, 0, 0);
892 nr = src0.nr;
893
894 interp[0] = brw_vec1_grf(nr, 0);
895 interp[1] = brw_vec1_grf(nr, 4);
896 interp[2] = brw_vec1_grf(nr+1, 0);
897 interp[3] = brw_vec1_grf(nr+1, 4);
898
899 for(i = 0; i < 4; i++ ) {
900 if (mask & (1<<i)) {
901 dst = get_dst_reg(c, inst, i);
902 brw_MOV(p, dst, suboffset(interp[i],3));
903 }
904 }
905 }
906
907 static void emit_pinterp(struct brw_wm_compile *c,
908 const struct prog_instruction *inst)
909 {
910 struct brw_compile *p = &c->func;
911 GLuint mask = inst->DstReg.WriteMask;
912
913 struct brw_reg interp[4];
914 struct brw_reg dst, delta0, delta1;
915 struct brw_reg src0, w;
916 GLuint nr, i;
917
918 src0 = get_src_reg(c, inst, 0, 0);
919 delta0 = get_src_reg(c, inst, 1, 0);
920 delta1 = get_src_reg(c, inst, 1, 1);
921 w = get_src_reg(c, inst, 2, 3);
922 nr = src0.nr;
923
924 interp[0] = brw_vec1_grf(nr, 0);
925 interp[1] = brw_vec1_grf(nr, 4);
926 interp[2] = brw_vec1_grf(nr+1, 0);
927 interp[3] = brw_vec1_grf(nr+1, 4);
928
929 for(i = 0; i < 4; i++ ) {
930 if (mask & (1<<i)) {
931 dst = get_dst_reg(c, inst, i);
932 brw_LINE(p, brw_null_reg(), interp[i], delta0);
933 brw_MAC(p, dst, suboffset(interp[i],1),
934 delta1);
935 brw_MUL(p, dst, dst, w);
936 }
937 }
938 }
939
940 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
941 static void emit_frontfacing(struct brw_wm_compile *c,
942 const struct prog_instruction *inst)
943 {
944 struct brw_compile *p = &c->func;
945 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
946 struct brw_reg dst;
947 GLuint mask = inst->DstReg.WriteMask;
948 int i;
949
950 for (i = 0; i < 4; i++) {
951 if (mask & (1<<i)) {
952 dst = get_dst_reg(c, inst, i);
953 brw_MOV(p, dst, brw_imm_f(0.0));
954 }
955 }
956
957 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
958 * us front face
959 */
960 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
961 for (i = 0; i < 4; i++) {
962 if (mask & (1<<i)) {
963 dst = get_dst_reg(c, inst, i);
964 brw_MOV(p, dst, brw_imm_f(1.0));
965 }
966 }
967 brw_set_predicate_control_flag_value(p, 0xff);
968 }
969
970 static void emit_xpd(struct brw_wm_compile *c,
971 const struct prog_instruction *inst)
972 {
973 int i;
974 struct brw_compile *p = &c->func;
975 GLuint mask = inst->DstReg.WriteMask;
976 for (i = 0; i < 4; i++) {
977 GLuint i2 = (i+2)%3;
978 GLuint i1 = (i+1)%3;
979 if (mask & (1<<i)) {
980 struct brw_reg src0, src1, dst;
981 dst = get_dst_reg(c, inst, i);
982 src0 = negate(get_src_reg(c, inst, 0, i2));
983 src1 = get_src_reg_imm(c, inst, 1, i1);
984 brw_MUL(p, brw_null_reg(), src0, src1);
985 src0 = get_src_reg(c, inst, 0, i1);
986 src1 = get_src_reg_imm(c, inst, 1, i2);
987 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
988 brw_MAC(p, dst, src0, src1);
989 brw_set_saturate(p, 0);
990 }
991 }
992 brw_set_saturate(p, 0);
993 }
994
995 static void emit_dp3(struct brw_wm_compile *c,
996 const struct prog_instruction *inst)
997 {
998 struct brw_reg src0[3], src1[3], dst;
999 int i;
1000 struct brw_compile *p = &c->func;
1001 for (i = 0; i < 3; i++) {
1002 src0[i] = get_src_reg(c, inst, 0, i);
1003 src1[i] = get_src_reg_imm(c, inst, 1, i);
1004 }
1005
1006 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1007 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1008 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1009 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1010 brw_MAC(p, dst, src0[2], src1[2]);
1011 brw_set_saturate(p, 0);
1012 }
1013
1014 static void emit_dp4(struct brw_wm_compile *c,
1015 const struct prog_instruction *inst)
1016 {
1017 struct brw_reg src0[4], src1[4], dst;
1018 int i;
1019 struct brw_compile *p = &c->func;
1020 for (i = 0; i < 4; i++) {
1021 src0[i] = get_src_reg(c, inst, 0, i);
1022 src1[i] = get_src_reg_imm(c, inst, 1, i);
1023 }
1024 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1025 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1026 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1027 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1028 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1029 brw_MAC(p, dst, src0[3], src1[3]);
1030 brw_set_saturate(p, 0);
1031 }
1032
1033 static void emit_dph(struct brw_wm_compile *c,
1034 const struct prog_instruction *inst)
1035 {
1036 struct brw_reg src0[4], src1[4], dst;
1037 int i;
1038 struct brw_compile *p = &c->func;
1039 for (i = 0; i < 4; i++) {
1040 src0[i] = get_src_reg(c, inst, 0, i);
1041 src1[i] = get_src_reg_imm(c, inst, 1, i);
1042 }
1043 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1044 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1045 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1046 brw_MAC(p, dst, src0[2], src1[2]);
1047 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1048 brw_ADD(p, dst, dst, src1[3]);
1049 brw_set_saturate(p, 0);
1050 }
1051
1052 /**
1053 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1054 * Note that the result of the function is smeared across the dest
1055 * register's X, Y, Z and W channels (subject to writemasking of course).
1056 */
1057 static void emit_math1(struct brw_wm_compile *c,
1058 const struct prog_instruction *inst, GLuint func)
1059 {
1060 struct brw_compile *p = &c->func;
1061 struct brw_reg src0, dst, tmp;
1062 const int mark = mark_tmps( c );
1063 int i;
1064
1065 tmp = alloc_tmp(c);
1066
1067 /* Get first component of source register */
1068 src0 = get_src_reg(c, inst, 0, 0);
1069
1070 /* tmp = func(src0) */
1071 brw_MOV(p, brw_message_reg(2), src0);
1072 brw_math(p,
1073 tmp,
1074 func,
1075 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1076 2,
1077 brw_null_reg(),
1078 BRW_MATH_DATA_VECTOR,
1079 BRW_MATH_PRECISION_FULL);
1080
1081 /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
1082
1083 /* replicate tmp value across enabled dest channels */
1084 for (i = 0; i < 4; i++) {
1085 if (inst->DstReg.WriteMask & (1 << i)) {
1086 dst = get_dst_reg(c, inst, i);
1087 brw_MOV(p, dst, tmp);
1088 }
1089 }
1090
1091 release_tmps(c, mark);
1092 }
1093
1094 static void emit_rcp(struct brw_wm_compile *c,
1095 const struct prog_instruction *inst)
1096 {
1097 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1098 }
1099
1100 static void emit_rsq(struct brw_wm_compile *c,
1101 const struct prog_instruction *inst)
1102 {
1103 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1104 }
1105
1106 static void emit_sin(struct brw_wm_compile *c,
1107 const struct prog_instruction *inst)
1108 {
1109 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1110 }
1111
1112 static void emit_cos(struct brw_wm_compile *c,
1113 const struct prog_instruction *inst)
1114 {
1115 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1116 }
1117
1118 static void emit_ex2(struct brw_wm_compile *c,
1119 const struct prog_instruction *inst)
1120 {
1121 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1122 }
1123
1124 static void emit_lg2(struct brw_wm_compile *c,
1125 const struct prog_instruction *inst)
1126 {
1127 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1128 }
1129
1130 static void emit_add(struct brw_wm_compile *c,
1131 const struct prog_instruction *inst)
1132 {
1133 struct brw_compile *p = &c->func;
1134 struct brw_reg src0, src1, dst;
1135 GLuint mask = inst->DstReg.WriteMask;
1136 int i;
1137 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1138 for (i = 0 ; i < 4; i++) {
1139 if (mask & (1<<i)) {
1140 dst = get_dst_reg(c, inst, i);
1141 src0 = get_src_reg(c, inst, 0, i);
1142 src1 = get_src_reg_imm(c, inst, 1, i);
1143 brw_ADD(p, dst, src0, src1);
1144 }
1145 }
1146 brw_set_saturate(p, 0);
1147 }
1148
1149 static void emit_arl(struct brw_wm_compile *c,
1150 const struct prog_instruction *inst)
1151 {
1152 struct brw_compile *p = &c->func;
1153 struct brw_reg src0, addr_reg;
1154 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1155 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1156 BRW_ARF_ADDRESS, 0);
1157 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1158 brw_MOV(p, addr_reg, src0);
1159 brw_set_saturate(p, 0);
1160 }
1161
1162 static void emit_sub(struct brw_wm_compile *c,
1163 const struct prog_instruction *inst)
1164 {
1165 struct brw_compile *p = &c->func;
1166 struct brw_reg src0, src1, dst;
1167 GLuint mask = inst->DstReg.WriteMask;
1168 int i;
1169 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1170 for (i = 0 ; i < 4; i++) {
1171 if (mask & (1<<i)) {
1172 dst = get_dst_reg(c, inst, i);
1173 src0 = get_src_reg(c, inst, 0, i);
1174 src1 = get_src_reg_imm(c, inst, 1, i);
1175 brw_ADD(p, dst, src0, negate(src1));
1176 }
1177 }
1178 brw_set_saturate(p, 0);
1179 }
1180
1181 static void emit_mul(struct brw_wm_compile *c,
1182 const struct prog_instruction *inst)
1183 {
1184 struct brw_compile *p = &c->func;
1185 struct brw_reg src0, src1, dst;
1186 GLuint mask = inst->DstReg.WriteMask;
1187 int i;
1188 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1189 for (i = 0 ; i < 4; i++) {
1190 if (mask & (1<<i)) {
1191 dst = get_dst_reg(c, inst, i);
1192 src0 = get_src_reg(c, inst, 0, i);
1193 src1 = get_src_reg_imm(c, inst, 1, i);
1194 brw_MUL(p, dst, src0, src1);
1195 }
1196 }
1197 brw_set_saturate(p, 0);
1198 }
1199
1200 static void emit_frc(struct brw_wm_compile *c,
1201 const struct prog_instruction *inst)
1202 {
1203 struct brw_compile *p = &c->func;
1204 struct brw_reg src0, dst;
1205 GLuint mask = inst->DstReg.WriteMask;
1206 int i;
1207 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1208 for (i = 0 ; i < 4; i++) {
1209 if (mask & (1<<i)) {
1210 dst = get_dst_reg(c, inst, i);
1211 src0 = get_src_reg_imm(c, inst, 0, i);
1212 brw_FRC(p, dst, src0);
1213 }
1214 }
1215 if (inst->SaturateMode != SATURATE_OFF)
1216 brw_set_saturate(p, 0);
1217 }
1218
1219 static void emit_flr(struct brw_wm_compile *c,
1220 const struct prog_instruction *inst)
1221 {
1222 struct brw_compile *p = &c->func;
1223 struct brw_reg src0, dst;
1224 GLuint mask = inst->DstReg.WriteMask;
1225 int i;
1226 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1227 for (i = 0 ; i < 4; i++) {
1228 if (mask & (1<<i)) {
1229 dst = get_dst_reg(c, inst, i);
1230 src0 = get_src_reg_imm(c, inst, 0, i);
1231 brw_RNDD(p, dst, src0);
1232 }
1233 }
1234 brw_set_saturate(p, 0);
1235 }
1236
1237
1238 static void emit_min_max(struct brw_wm_compile *c,
1239 const struct prog_instruction *inst)
1240 {
1241 struct brw_compile *p = &c->func;
1242 const GLuint mask = inst->DstReg.WriteMask;
1243 const int mark = mark_tmps(c);
1244 int i;
1245 brw_push_insn_state(p);
1246 for (i = 0; i < 4; i++) {
1247 if (mask & (1<<i)) {
1248 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1249 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1250 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1251 struct brw_reg dst;
1252 /* if dst==src0 or dst==src1 we need to use a temp reg */
1253 GLboolean use_temp = brw_same_reg(dst, src0) ||
1254 brw_same_reg(dst, src1);
1255 if (use_temp)
1256 dst = alloc_tmp(c);
1257 else
1258 dst = real_dst;
1259
1260 /*
1261 printf(" Min/max: dst %d src0 %d src1 %d\n",
1262 dst.nr, src0.nr, src1.nr);
1263 */
1264 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1265 brw_MOV(p, dst, src0);
1266 brw_set_saturate(p, 0);
1267
1268 if (inst->Opcode == OPCODE_MIN)
1269 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1270 else
1271 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1272
1273 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1274 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1275 brw_MOV(p, dst, src1);
1276 brw_set_saturate(p, 0);
1277 brw_set_predicate_control_flag_value(p, 0xff);
1278 if (use_temp)
1279 brw_MOV(p, real_dst, dst);
1280 }
1281 }
1282 brw_pop_insn_state(p);
1283 release_tmps(c, mark);
1284 }
1285
1286 static void emit_pow(struct brw_wm_compile *c,
1287 const struct prog_instruction *inst)
1288 {
1289 struct brw_compile *p = &c->func;
1290 struct brw_reg dst, src0, src1;
1291 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1292 src0 = get_src_reg_imm(c, inst, 0, 0);
1293 src1 = get_src_reg_imm(c, inst, 1, 0);
1294
1295 brw_MOV(p, brw_message_reg(2), src0);
1296 brw_MOV(p, brw_message_reg(3), src1);
1297
1298 brw_math(p,
1299 dst,
1300 BRW_MATH_FUNCTION_POW,
1301 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1302 2,
1303 brw_null_reg(),
1304 BRW_MATH_DATA_VECTOR,
1305 BRW_MATH_PRECISION_FULL);
1306 }
1307
1308 static void emit_lrp(struct brw_wm_compile *c,
1309 const struct prog_instruction *inst)
1310 {
1311 struct brw_compile *p = &c->func;
1312 GLuint mask = inst->DstReg.WriteMask;
1313 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1314 int i;
1315 int mark = mark_tmps(c);
1316 for (i = 0; i < 4; i++) {
1317 if (mask & (1<<i)) {
1318 dst = get_dst_reg(c, inst, i);
1319 src0 = get_src_reg(c, inst, 0, i);
1320
1321 src1 = get_src_reg_imm(c, inst, 1, i);
1322
1323 if (src1.nr == dst.nr) {
1324 tmp1 = alloc_tmp(c);
1325 brw_MOV(p, tmp1, src1);
1326 } else
1327 tmp1 = src1;
1328
1329 src2 = get_src_reg(c, inst, 2, i);
1330 if (src2.nr == dst.nr) {
1331 tmp2 = alloc_tmp(c);
1332 brw_MOV(p, tmp2, src2);
1333 } else
1334 tmp2 = src2;
1335
1336 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1337 brw_MUL(p, brw_null_reg(), dst, tmp2);
1338 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1339 brw_MAC(p, dst, src0, tmp1);
1340 brw_set_saturate(p, 0);
1341 }
1342 release_tmps(c, mark);
1343 }
1344 }
1345
1346 /**
1347 * For GLSL shaders, this KIL will be unconditional.
1348 * It may be contained inside an IF/ENDIF structure of course.
1349 */
1350 static void emit_kil(struct brw_wm_compile *c)
1351 {
1352 struct brw_compile *p = &c->func;
1353 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1354 brw_push_insn_state(p);
1355 brw_set_mask_control(p, BRW_MASK_DISABLE);
1356 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1357 brw_AND(p, depth, c->emit_mask_reg, depth);
1358 brw_pop_insn_state(p);
1359 }
1360
1361 static void emit_mad(struct brw_wm_compile *c,
1362 const struct prog_instruction *inst)
1363 {
1364 struct brw_compile *p = &c->func;
1365 GLuint mask = inst->DstReg.WriteMask;
1366 struct brw_reg dst, src0, src1, src2;
1367 int i;
1368
1369 for (i = 0; i < 4; i++) {
1370 if (mask & (1<<i)) {
1371 dst = get_dst_reg(c, inst, i);
1372 src0 = get_src_reg(c, inst, 0, i);
1373 src1 = get_src_reg_imm(c, inst, 1, i);
1374 src2 = get_src_reg_imm(c, inst, 2, i);
1375 brw_MUL(p, dst, src0, src1);
1376
1377 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1378 brw_ADD(p, dst, dst, src2);
1379 brw_set_saturate(p, 0);
1380 }
1381 }
1382 }
1383
1384 static void emit_sop(struct brw_wm_compile *c,
1385 const struct prog_instruction *inst, GLuint cond)
1386 {
1387 struct brw_compile *p = &c->func;
1388 GLuint mask = inst->DstReg.WriteMask;
1389 struct brw_reg dst, src0, src1;
1390 int i;
1391
1392 for (i = 0; i < 4; i++) {
1393 if (mask & (1<<i)) {
1394 dst = get_dst_reg(c, inst, i);
1395 src0 = get_src_reg(c, inst, 0, i);
1396 src1 = get_src_reg_imm(c, inst, 1, i);
1397 brw_push_insn_state(p);
1398 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1399 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1400 brw_MOV(p, dst, brw_imm_f(0.0));
1401 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1402 brw_MOV(p, dst, brw_imm_f(1.0));
1403 brw_pop_insn_state(p);
1404 }
1405 }
1406 }
1407
1408 static void emit_slt(struct brw_wm_compile *c,
1409 const struct prog_instruction *inst)
1410 {
1411 emit_sop(c, inst, BRW_CONDITIONAL_L);
1412 }
1413
1414 static void emit_sle(struct brw_wm_compile *c,
1415 const struct prog_instruction *inst)
1416 {
1417 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1418 }
1419
1420 static void emit_sgt(struct brw_wm_compile *c,
1421 const struct prog_instruction *inst)
1422 {
1423 emit_sop(c, inst, BRW_CONDITIONAL_G);
1424 }
1425
1426 static void emit_sge(struct brw_wm_compile *c,
1427 const struct prog_instruction *inst)
1428 {
1429 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1430 }
1431
1432 static void emit_seq(struct brw_wm_compile *c,
1433 const struct prog_instruction *inst)
1434 {
1435 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1436 }
1437
1438 static void emit_sne(struct brw_wm_compile *c,
1439 const struct prog_instruction *inst)
1440 {
1441 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1442 }
1443
1444 static void emit_ddx(struct brw_wm_compile *c,
1445 const struct prog_instruction *inst)
1446 {
1447 struct brw_compile *p = &c->func;
1448 GLuint mask = inst->DstReg.WriteMask;
1449 struct brw_reg interp[4];
1450 struct brw_reg dst;
1451 struct brw_reg src0, w;
1452 GLuint nr, i;
1453 src0 = get_src_reg(c, inst, 0, 0);
1454 w = get_src_reg(c, inst, 1, 3);
1455 nr = src0.nr;
1456 interp[0] = brw_vec1_grf(nr, 0);
1457 interp[1] = brw_vec1_grf(nr, 4);
1458 interp[2] = brw_vec1_grf(nr+1, 0);
1459 interp[3] = brw_vec1_grf(nr+1, 4);
1460 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1461 for(i = 0; i < 4; i++ ) {
1462 if (mask & (1<<i)) {
1463 dst = get_dst_reg(c, inst, i);
1464 brw_MOV(p, dst, interp[i]);
1465 brw_MUL(p, dst, dst, w);
1466 }
1467 }
1468 brw_set_saturate(p, 0);
1469 }
1470
1471 static void emit_ddy(struct brw_wm_compile *c,
1472 const struct prog_instruction *inst)
1473 {
1474 struct brw_compile *p = &c->func;
1475 GLuint mask = inst->DstReg.WriteMask;
1476 struct brw_reg interp[4];
1477 struct brw_reg dst;
1478 struct brw_reg src0, w;
1479 GLuint nr, i;
1480
1481 src0 = get_src_reg(c, inst, 0, 0);
1482 nr = src0.nr;
1483 w = get_src_reg(c, inst, 1, 3);
1484 interp[0] = brw_vec1_grf(nr, 0);
1485 interp[1] = brw_vec1_grf(nr, 4);
1486 interp[2] = brw_vec1_grf(nr+1, 0);
1487 interp[3] = brw_vec1_grf(nr+1, 4);
1488 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1489 for(i = 0; i < 4; i++ ) {
1490 if (mask & (1<<i)) {
1491 dst = get_dst_reg(c, inst, i);
1492 brw_MOV(p, dst, suboffset(interp[i], 1));
1493 brw_MUL(p, dst, dst, w);
1494 }
1495 }
1496 brw_set_saturate(p, 0);
1497 }
1498
1499 static INLINE struct brw_reg high_words( struct brw_reg reg )
1500 {
1501 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1502 0, 8, 2 );
1503 }
1504
1505 static INLINE struct brw_reg low_words( struct brw_reg reg )
1506 {
1507 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1508 }
1509
1510 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1511 {
1512 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1513 }
1514
1515 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1516 {
1517 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1518 0, 16, 2 );
1519 }
1520
1521 /* One-, two- and three-dimensional Perlin noise, similar to the description
1522 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1523 static void noise1_sub( struct brw_wm_compile *c ) {
1524
1525 struct brw_compile *p = &c->func;
1526 struct brw_reg param,
1527 x0, x1, /* gradients at each end */
1528 t, tmp[ 2 ], /* float temporaries */
1529 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1530 int i;
1531 int mark = mark_tmps( c );
1532
1533 x0 = alloc_tmp( c );
1534 x1 = alloc_tmp( c );
1535 t = alloc_tmp( c );
1536 tmp[ 0 ] = alloc_tmp( c );
1537 tmp[ 1 ] = alloc_tmp( c );
1538 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1539 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1540 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1541 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1542 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1543
1544 param = lookup_tmp( c, mark - 2 );
1545
1546 brw_set_access_mode( p, BRW_ALIGN_1 );
1547
1548 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1549
1550 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1551 be hashed. Also compute the remainder (offset within the unit
1552 length), interleaved to reduce register dependency penalties. */
1553 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1554 brw_FRC( p, param, param );
1555 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1556 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1557 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1558
1559 /* We're now ready to perform the hashing. The two hashes are
1560 interleaved for performance. The hash function used is
1561 designed to rapidly achieve avalanche and require only 32x16
1562 bit multiplication, and 16-bit swizzles (which we get for
1563 free). We can't use immediate operands in the multiplies,
1564 because immediates are permitted only in src1 and the 16-bit
1565 factor is permitted only in src0. */
1566 for( i = 0; i < 2; i++ )
1567 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1568 for( i = 0; i < 2; i++ )
1569 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1570 high_words( itmp[ i ] ) );
1571 for( i = 0; i < 2; i++ )
1572 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1573 for( i = 0; i < 2; i++ )
1574 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1575 high_words( itmp[ i ] ) );
1576 for( i = 0; i < 2; i++ )
1577 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1578 for( i = 0; i < 2; i++ )
1579 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1580 high_words( itmp[ i ] ) );
1581
1582 /* Now we want to initialise the two gradients based on the
1583 hashes. Format conversion from signed integer to float leaves
1584 everything scaled too high by a factor of pow( 2, 31 ), but
1585 we correct for that right at the end. */
1586 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1587 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1588 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1589
1590 brw_MUL( p, x0, x0, param );
1591 brw_MUL( p, x1, x1, t );
1592
1593 /* We interpolate between the gradients using the polynomial
1594 6t^5 - 15t^4 + 10t^3 (Perlin). */
1595 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1596 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1597 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1598 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1599 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1600 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1601 pipeline */
1602 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1603 brw_MUL( p, param, tmp[ 0 ], param );
1604 brw_MUL( p, x1, x1, param );
1605 brw_ADD( p, x0, x0, x1 );
1606 /* scale by pow( 2, -30 ), to compensate for the format conversion
1607 above and an extra factor of 2 so that a single gradient covers
1608 the [-1,1] range */
1609 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1610
1611 release_tmps( c, mark );
1612 }
1613
1614 static void emit_noise1( struct brw_wm_compile *c,
1615 const struct prog_instruction *inst )
1616 {
1617 struct brw_compile *p = &c->func;
1618 struct brw_reg src, param, dst;
1619 GLuint mask = inst->DstReg.WriteMask;
1620 int i;
1621 int mark = mark_tmps( c );
1622
1623 assert( mark == 0 );
1624
1625 src = get_src_reg( c, inst, 0, 0 );
1626
1627 param = alloc_tmp( c );
1628
1629 brw_MOV( p, param, src );
1630
1631 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1632
1633 /* Fill in the result: */
1634 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1635 for (i = 0 ; i < 4; i++) {
1636 if (mask & (1<<i)) {
1637 dst = get_dst_reg(c, inst, i);
1638 brw_MOV( p, dst, param );
1639 }
1640 }
1641 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1642 brw_set_saturate( p, 0 );
1643
1644 release_tmps( c, mark );
1645 }
1646
1647 static void noise2_sub( struct brw_wm_compile *c ) {
1648
1649 struct brw_compile *p = &c->func;
1650 struct brw_reg param0, param1,
1651 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1652 t, tmp[ 4 ], /* float temporaries */
1653 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1654 int i;
1655 int mark = mark_tmps( c );
1656
1657 x0y0 = alloc_tmp( c );
1658 x0y1 = alloc_tmp( c );
1659 x1y0 = alloc_tmp( c );
1660 x1y1 = alloc_tmp( c );
1661 t = alloc_tmp( c );
1662 for( i = 0; i < 4; i++ ) {
1663 tmp[ i ] = alloc_tmp( c );
1664 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1665 }
1666 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1667 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1668 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1669
1670 param0 = lookup_tmp( c, mark - 3 );
1671 param1 = lookup_tmp( c, mark - 2 );
1672
1673 brw_set_access_mode( p, BRW_ALIGN_1 );
1674
1675 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1676 be hashed. Also compute the remainders (offsets within the unit
1677 square), interleaved to reduce register dependency penalties. */
1678 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1679 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1680 brw_FRC( p, param0, param0 );
1681 brw_FRC( p, param1, param1 );
1682 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1683 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1684 low_words( itmp[ 1 ] ) );
1685 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1686 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1687 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1688 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1689 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1690
1691 /* We're now ready to perform the hashing. The four hashes are
1692 interleaved for performance. The hash function used is
1693 designed to rapidly achieve avalanche and require only 32x16
1694 bit multiplication, and 16-bit swizzles (which we get for
1695 free). We can't use immediate operands in the multiplies,
1696 because immediates are permitted only in src1 and the 16-bit
1697 factor is permitted only in src0. */
1698 for( i = 0; i < 4; i++ )
1699 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1700 for( i = 0; i < 4; i++ )
1701 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1702 high_words( itmp[ i ] ) );
1703 for( i = 0; i < 4; i++ )
1704 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1705 for( i = 0; i < 4; i++ )
1706 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1707 high_words( itmp[ i ] ) );
1708 for( i = 0; i < 4; i++ )
1709 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1710 for( i = 0; i < 4; i++ )
1711 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1712 high_words( itmp[ i ] ) );
1713
1714 /* Now we want to initialise the four gradients based on the
1715 hashes. Format conversion from signed integer to float leaves
1716 everything scaled too high by a factor of pow( 2, 15 ), but
1717 we correct for that right at the end. */
1718 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1719 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1720 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1721 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1722 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1723
1724 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1725 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1726 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1727 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1728
1729 brw_MUL( p, x1y0, x1y0, t );
1730 brw_MUL( p, x1y1, x1y1, t );
1731 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1732 brw_MUL( p, x0y0, x0y0, param0 );
1733 brw_MUL( p, x0y1, x0y1, param0 );
1734
1735 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1736 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1737 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1738 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1739
1740 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1741 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1742 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1743 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1744
1745 /* We interpolate between the gradients using the polynomial
1746 6t^5 - 15t^4 + 10t^3 (Perlin). */
1747 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1748 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1749 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1750 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1751 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1752 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1753 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1754 pipeline */
1755 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1756 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1757 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1758 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1759 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1760 pipeline */
1761 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1762 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1763 brw_MUL( p, param0, tmp[ 0 ], param0 );
1764 brw_MUL( p, param1, tmp[ 1 ], param1 );
1765
1766 /* Here we interpolate in the y dimension... */
1767 brw_MUL( p, x0y1, x0y1, param1 );
1768 brw_MUL( p, x1y1, x1y1, param1 );
1769 brw_ADD( p, x0y0, x0y0, x0y1 );
1770 brw_ADD( p, x1y0, x1y0, x1y1 );
1771
1772 /* And now in x. There are horrible register dependencies here,
1773 but we have nothing else to do. */
1774 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1775 brw_MUL( p, x1y0, x1y0, param0 );
1776 brw_ADD( p, x0y0, x0y0, x1y0 );
1777
1778 /* scale by pow( 2, -15 ), as described above */
1779 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1780
1781 release_tmps( c, mark );
1782 }
1783
1784 static void emit_noise2( struct brw_wm_compile *c,
1785 const struct prog_instruction *inst )
1786 {
1787 struct brw_compile *p = &c->func;
1788 struct brw_reg src0, src1, param0, param1, dst;
1789 GLuint mask = inst->DstReg.WriteMask;
1790 int i;
1791 int mark = mark_tmps( c );
1792
1793 assert( mark == 0 );
1794
1795 src0 = get_src_reg( c, inst, 0, 0 );
1796 src1 = get_src_reg( c, inst, 0, 1 );
1797
1798 param0 = alloc_tmp( c );
1799 param1 = alloc_tmp( c );
1800
1801 brw_MOV( p, param0, src0 );
1802 brw_MOV( p, param1, src1 );
1803
1804 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1805
1806 /* Fill in the result: */
1807 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1808 for (i = 0 ; i < 4; i++) {
1809 if (mask & (1<<i)) {
1810 dst = get_dst_reg(c, inst, i);
1811 brw_MOV( p, dst, param0 );
1812 }
1813 }
1814 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1815 brw_set_saturate( p, 0 );
1816
1817 release_tmps( c, mark );
1818 }
1819
1820 /**
1821 * The three-dimensional case is much like the one- and two- versions above,
1822 * but since the number of corners is rapidly growing we now pack 16 16-bit
1823 * hashes into each register to extract more parallelism from the EUs.
1824 */
1825 static void noise3_sub( struct brw_wm_compile *c ) {
1826
1827 struct brw_compile *p = &c->func;
1828 struct brw_reg param0, param1, param2,
1829 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1830 xi, yi, zi, /* interpolation coefficients */
1831 t, tmp[ 8 ], /* float temporaries */
1832 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1833 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1834 int i;
1835 int mark = mark_tmps( c );
1836
1837 x0y0 = alloc_tmp( c );
1838 x0y1 = alloc_tmp( c );
1839 x1y0 = alloc_tmp( c );
1840 x1y1 = alloc_tmp( c );
1841 xi = alloc_tmp( c );
1842 yi = alloc_tmp( c );
1843 zi = alloc_tmp( c );
1844 t = alloc_tmp( c );
1845 for( i = 0; i < 8; i++ ) {
1846 tmp[ i ] = alloc_tmp( c );
1847 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1848 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1849 }
1850
1851 param0 = lookup_tmp( c, mark - 4 );
1852 param1 = lookup_tmp( c, mark - 3 );
1853 param2 = lookup_tmp( c, mark - 2 );
1854
1855 brw_set_access_mode( p, BRW_ALIGN_1 );
1856
1857 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1858 be hashed. Also compute the remainders (offsets within the unit
1859 cube), interleaved to reduce register dependency penalties. */
1860 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1861 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1862 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1863 brw_FRC( p, param0, param0 );
1864 brw_FRC( p, param1, param1 );
1865 brw_FRC( p, param2, param2 );
1866 /* Since we now have only 16 bits of precision in the hash, we must
1867 be more careful about thorough mixing to maintain entropy as we
1868 squash the input vector into a small scalar. */
1869 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1870 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1871 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1872 brw_imm_uw( 0x9B93 ) );
1873 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1874 brw_imm_uw( 0xBC8F ) );
1875
1876 /* Temporarily disable the execution mask while we work with ExecSize=16
1877 channels (the mask is set for ExecSize=8 and is probably incorrect).
1878 Although this might cause execution of unwanted channels, the code
1879 writes only to temporary registers and has no side effects, so
1880 disabling the mask is harmless. */
1881 brw_push_insn_state( p );
1882 brw_set_mask_control( p, BRW_MASK_DISABLE );
1883 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1884 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1885 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1886
1887 /* We're now ready to perform the hashing. The eight hashes are
1888 interleaved for performance. The hash function used is
1889 designed to rapidly achieve avalanche and require only 16x16
1890 bit multiplication, and 8-bit swizzles (which we get for
1891 free). */
1892 for( i = 0; i < 4; i++ )
1893 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1894 for( i = 0; i < 4; i++ )
1895 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1896 odd_bytes( wtmp[ i ] ) );
1897 for( i = 0; i < 4; i++ )
1898 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1899 for( i = 0; i < 4; i++ )
1900 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1901 odd_bytes( wtmp[ i ] ) );
1902 brw_pop_insn_state( p );
1903
1904 /* Now we want to initialise the four rear gradients based on the
1905 hashes. Format conversion from signed integer to float leaves
1906 everything scaled too high by a factor of pow( 2, 15 ), but
1907 we correct for that right at the end. */
1908 /* x component */
1909 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1910 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1911 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1912 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1913 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1914
1915 brw_push_insn_state( p );
1916 brw_set_mask_control( p, BRW_MASK_DISABLE );
1917 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1918 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1919 brw_pop_insn_state( p );
1920
1921 brw_MUL( p, x1y0, x1y0, t );
1922 brw_MUL( p, x1y1, x1y1, t );
1923 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1924 brw_MUL( p, x0y0, x0y0, param0 );
1925 brw_MUL( p, x0y1, x0y1, param0 );
1926
1927 /* y component */
1928 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1929 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1930 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1931 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1932
1933 brw_push_insn_state( p );
1934 brw_set_mask_control( p, BRW_MASK_DISABLE );
1935 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1936 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1937 brw_pop_insn_state( p );
1938
1939 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1940 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1941 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1942 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1943 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1944
1945 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1946 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1947 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1948 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1949
1950 /* z component */
1951 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1952 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1953 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1954 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1955
1956 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1957 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1958 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1959 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1960
1961 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1962 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1963 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1964 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1965
1966 /* We interpolate between the gradients using the polynomial
1967 6t^5 - 15t^4 + 10t^3 (Perlin). */
1968 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1969 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1970 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1971 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1972 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1973 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1974 brw_MUL( p, xi, xi, param0 );
1975 brw_MUL( p, yi, yi, param1 );
1976 brw_MUL( p, zi, zi, param2 );
1977 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1978 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1979 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1980 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1981 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1982 brw_MUL( p, xi, xi, param0 );
1983 brw_MUL( p, yi, yi, param1 );
1984 brw_MUL( p, zi, zi, param2 );
1985 brw_MUL( p, xi, xi, param0 );
1986 brw_MUL( p, yi, yi, param1 );
1987 brw_MUL( p, zi, zi, param2 );
1988 brw_MUL( p, xi, xi, param0 );
1989 brw_MUL( p, yi, yi, param1 );
1990 brw_MUL( p, zi, zi, param2 );
1991
1992 /* Here we interpolate in the y dimension... */
1993 brw_MUL( p, x0y1, x0y1, yi );
1994 brw_MUL( p, x1y1, x1y1, yi );
1995 brw_ADD( p, x0y0, x0y0, x0y1 );
1996 brw_ADD( p, x1y0, x1y0, x1y1 );
1997
1998 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1999 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2000 brw_MUL( p, x1y0, x1y0, xi );
2001 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2002
2003 /* Now do the same thing for the front four gradients... */
2004 /* x component */
2005 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2006 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2007 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2008 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2009
2010 brw_push_insn_state( p );
2011 brw_set_mask_control( p, BRW_MASK_DISABLE );
2012 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2013 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2014 brw_pop_insn_state( p );
2015
2016 brw_MUL( p, x1y0, x1y0, t );
2017 brw_MUL( p, x1y1, x1y1, t );
2018 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
2019 brw_MUL( p, x0y0, x0y0, param0 );
2020 brw_MUL( p, x0y1, x0y1, param0 );
2021
2022 /* y component */
2023 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2024 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2025 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2026 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2027
2028 brw_push_insn_state( p );
2029 brw_set_mask_control( p, BRW_MASK_DISABLE );
2030 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2031 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2032 brw_pop_insn_state( p );
2033
2034 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2035 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2036 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
2037 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
2038 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
2039
2040 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2041 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2042 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2043 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2044
2045 /* z component */
2046 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2047 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2048 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2049 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2050
2051 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2052 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2053 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2054 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2055
2056 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2057 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2058 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2059 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2060
2061 /* The interpolation coefficients are still around from last time, so
2062 again interpolate in the y dimension... */
2063 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2064 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2065 brw_MUL( p, x0y1, x0y1, yi );
2066 brw_MUL( p, x1y1, x1y1, yi );
2067 brw_ADD( p, x0y0, x0y0, x0y1 );
2068 brw_ADD( p, x1y0, x1y0, x1y1 );
2069
2070 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2071 time put the front face in tmp[ 1 ] and we're nearly there... */
2072 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2073 brw_MUL( p, x1y0, x1y0, xi );
2074 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2075
2076 /* The final interpolation, in the z dimension: */
2077 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2078 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
2079 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2080
2081 /* scale by pow( 2, -15 ), as described above */
2082 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2083
2084 release_tmps( c, mark );
2085 }
2086
2087 static void emit_noise3( struct brw_wm_compile *c,
2088 const struct prog_instruction *inst )
2089 {
2090 struct brw_compile *p = &c->func;
2091 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
2092 GLuint mask = inst->DstReg.WriteMask;
2093 int i;
2094 int mark = mark_tmps( c );
2095
2096 assert( mark == 0 );
2097
2098 src0 = get_src_reg( c, inst, 0, 0 );
2099 src1 = get_src_reg( c, inst, 0, 1 );
2100 src2 = get_src_reg( c, inst, 0, 2 );
2101
2102 param0 = alloc_tmp( c );
2103 param1 = alloc_tmp( c );
2104 param2 = alloc_tmp( c );
2105
2106 brw_MOV( p, param0, src0 );
2107 brw_MOV( p, param1, src1 );
2108 brw_MOV( p, param2, src2 );
2109
2110 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2111
2112 /* Fill in the result: */
2113 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2114 for (i = 0 ; i < 4; i++) {
2115 if (mask & (1<<i)) {
2116 dst = get_dst_reg(c, inst, i);
2117 brw_MOV( p, dst, param0 );
2118 }
2119 }
2120 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2121 brw_set_saturate( p, 0 );
2122
2123 release_tmps( c, mark );
2124 }
2125
2126 /**
 * Emit the body of the four-dimensional noise subroutine.
 *
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes. Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 *
 * The four input coordinates are read from temporaries that already exist
 * when this function is entered (slots mark-5 .. mark-2, see the
 * lookup_tmp() calls below); the result is written over param[ 0 ] by the
 * last instruction.  All temporaries allocated here are released again
 * before returning.
 *
 * The generated loop is not built from DO/WHILE: it uses direct additions
 * to the IP register, with the flag register selecting between the first
 * (w=0) and second (w=1) pass.
 *
 * \param c  the fragment program compile state; code is appended to c->func
 */
2133 static void noise4_sub( struct brw_wm_compile *c )
2134 {
2135 struct brw_compile *p = &c->func;
2136 struct brw_reg param[ 4 ],
2137 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2138 w0, /* noise for the w=0 cube */
2139 floors[ 2 ], /* integer coordinates of base corner of hypercube */
2140 interp[ 4 ], /* interpolation coefficients */
2141 t, tmp[ 8 ], /* float temporaries */
2142 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2143 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2144 int i, j;
2145 int mark = mark_tmps( c );
2146 GLuint loop, origin;
2147
2148 x0y0 = alloc_tmp( c );
2149 x0y1 = alloc_tmp( c );
2150 x1y0 = alloc_tmp( c );
2151 x1y1 = alloc_tmp( c );
2152 t = alloc_tmp( c );
2153 w0 = alloc_tmp( c );
2154 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2155 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2156
/* param[] is looked up rather than allocated: the slots lie below the
   mark taken above, i.e. they were allocated before this function ran
   (emit_noise4 allocates four parameter temporaries before invoking the
   subroutine).  NOTE(review): slot mark-1 is presumably claimed by
   invoke_subroutine -- confirm against its definition. */
2157 for( i = 0; i < 4; i++ ) {
2158 param[ i ] = lookup_tmp( c, mark - 5 + i );
2159 interp[ i ] = alloc_tmp( c );
2160 }
2161
/* itmp[] and wtmp[] are differently-typed views of the same registers
   as tmp[]; writing one clobbers the others. */
2162 for( i = 0; i < 8; i++ ) {
2163 tmp[ i ] = alloc_tmp( c );
2164 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2165 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2166 }
2167
2168 brw_set_access_mode( p, BRW_ALIGN_1 );
2169
2170 /* We only want 16 bits of precision from the integral part of each
2171 co-ordinate, but unfortunately the RNDD semantics would saturate
2172 at 16 bits if we performed the operation directly to a 16-bit
2173 destination. Therefore, we round to 32-bit temporaries where
2174 appropriate, and then store only the lower 16 bits. */
2175 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2176 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2177 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2178 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2179 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2180 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2181
2182 /* Modify the flag register here, because the side effect is useful
2183 later (see below). We know for certain that all flags will be
2184 cleared, since the FRC instruction cannot possibly generate
2185 negative results. Even for exceptional inputs (infinities, denormals,
2186 NaNs), the architecture guarantees that the L conditional is false. */
2187 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2188 brw_FRC( p, param[ 0 ], param[ 0 ] );
2189 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2190 for( i = 1; i < 4; i++ )
2191 brw_FRC( p, param[ i ], param[ i ] );
2192
2193 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2194 of all. */
2195 for( i = 0; i < 4; i++ )
2196 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2197 for( i = 0; i < 4; i++ )
2198 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2199 for( i = 0; i < 4; i++ )
2200 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2201 for( i = 0; i < 4; i++ )
2202 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2203 for( j = 0; j < 3; j++ )
2204 for( i = 0; i < 4; i++ )
2205 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2206
2207 /* Mark the current address, as it will be a jump destination. The
2208 following code will be executed twice: first, with the flag
2209 register clear indicating the w=0 case, and second with flags
2210 set for w=1. */
2211 loop = p->nr_insn;
2212
2213 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2214 be hashed. Since we have only 16 bits of precision in the hash, we
2215 must be careful about thorough mixing to maintain entropy as we
2216 squash the input vector into a small scalar. */
2217 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2218 brw_imm_uw( 0xBC8F ) );
2219 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2220 brw_imm_uw( 0xD0BD ) );
2221 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2222 brw_imm_uw( 0x9B93 ) );
2223 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2224 brw_imm_uw( 0xA359 ) );
2225 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2226 brw_imm_uw( 0xBC8F ) );
2227
2228 /* Temporarily disable the execution mask while we work with ExecSize=16
2229 channels (the mask is set for ExecSize=8 and is probably incorrect).
2230 Although this might cause execution of unwanted channels, the code
2231 writes only to temporary registers and has no side effects, so
2232 disabling the mask is harmless. */
2233 brw_push_insn_state( p );
2234 brw_set_mask_control( p, BRW_MASK_DISABLE );
2235 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2236 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2237 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2238
2239 /* We're now ready to perform the hashing. The eight hashes are
2240 interleaved for performance. The hash function used is
2241 designed to rapidly achieve avalanche and require only 16x16
2242 bit multiplication, and 8-bit swizzles (which we get for
2243 free). */
2244 for( i = 0; i < 4; i++ )
2245 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2246 for( i = 0; i < 4; i++ )
2247 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2248 odd_bytes( wtmp[ i ] ) );
2249 for( i = 0; i < 4; i++ )
2250 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2251 for( i = 0; i < 4; i++ )
2252 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2253 odd_bytes( wtmp[ i ] ) );
2254 brw_pop_insn_state( p );
2255
2256 /* Now we want to initialise the four rear gradients based on the
2257 hashes. Format conversion from signed integer to float leaves
2258 everything scaled too high by a factor of pow( 2, 15 ), but
2259 we correct for that right at the end. */
2260 /* x component */
2261 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2262 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2263 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2264 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2265 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2266
/* Shift the hash words left by 4 so that the next component is converted
   from a different group of hash bits (the same shift is repeated after
   each of the x, y and z components below). */
2267 brw_push_insn_state( p );
2268 brw_set_mask_control( p, BRW_MASK_DISABLE );
2269 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2270 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2271 brw_pop_insn_state( p );
2272
2273 brw_MUL( p, x1y0, x1y0, t );
2274 brw_MUL( p, x1y1, x1y1, t );
2275 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2276 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2277 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2278
2279 /* y component */
2280 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2281 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2282 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2283 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2284
2285 brw_push_insn_state( p );
2286 brw_set_mask_control( p, BRW_MASK_DISABLE );
2287 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2288 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2289 brw_pop_insn_state( p );
2290
2291 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2292 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2293 /* prepare t for the w component (used below): w the first time through
2294 the loop; w - 1 the second time) */
2295 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2296 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2297 p->current->header.predicate_inverse = 1;
2298 brw_MOV( p, t, param[ 3 ] );
2299 p->current->header.predicate_inverse = 0;
2300 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2301 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2302 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2303
2304 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2305 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2306 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2307 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2308
2309 /* z component */
2310 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2311 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2312 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2313 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2314
2315 brw_push_insn_state( p );
2316 brw_set_mask_control( p, BRW_MASK_DISABLE );
2317 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2318 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2319 brw_pop_insn_state( p );
2320
2321 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2322 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2323 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2324 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2325
2326 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2327 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2328 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2329 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2330
2331 /* w component */
2332 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2333 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2334 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2335 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2336
2337 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2338 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2339 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2340 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
/* t is reloaded with x - 1 here, ready for the x component of the front
   four gradients further down. */
2341 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2342
2343 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2344 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2345 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2346 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2347
2348 /* Here we interpolate in the y dimension... */
2349 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2350 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2351 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2352 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2353 brw_ADD( p, x0y0, x0y0, x0y1 );
2354 brw_ADD( p, x1y0, x1y0, x1y1 );
2355
2356 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2357 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2358 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2359 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2360
2361 /* Now do the same thing for the front four gradients... */
2362 /* x component */
2363 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2364 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2365 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2366 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2367
2368 brw_push_insn_state( p );
2369 brw_set_mask_control( p, BRW_MASK_DISABLE );
2370 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2371 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2372 brw_pop_insn_state( p );
2373
2374 brw_MUL( p, x1y0, x1y0, t );
2375 brw_MUL( p, x1y1, x1y1, t );
2376 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2377 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2378 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2379
2380 /* y component */
2381 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2382 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2383 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2384 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2385
2386 brw_push_insn_state( p );
2387 brw_set_mask_control( p, BRW_MASK_DISABLE );
2388 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2389 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2390 brw_pop_insn_state( p );
2391
2392 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2393 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2394 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2395 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2396 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2397
2398 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2399 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2400 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2401 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2402
2403 /* z component */
2404 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2405 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2406 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2407 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2408
2409 brw_push_insn_state( p );
2410 brw_set_mask_control( p, BRW_MASK_DISABLE );
2411 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2412 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2413 brw_pop_insn_state( p );
2414
2415 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2416 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2417 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2418 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2419 /* prepare t for the w component (used below): w the first time through
2420 the loop; w - 1 the second time) */
2421 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2422 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2423 p->current->header.predicate_inverse = 1;
2424 brw_MOV( p, t, param[ 3 ] );
2425 p->current->header.predicate_inverse = 0;
2426 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2427
2428 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2429 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2430 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2431 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2432
2433 /* w component */
2434 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2435 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2436 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2437 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2438
2439 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2440 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2441 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2442 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2443
2444 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2445 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2446 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2447 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2448
2449 /* Interpolate in the y dimension: */
2450 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2451 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2452 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2453 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2454 brw_ADD( p, x0y0, x0y0, x0y1 );
2455 brw_ADD( p, x1y0, x1y0, x1y1 );
2456
2457 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2458 time put the front face in tmp[ 1 ] and we're nearly there... */
2459 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2460 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2461 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2462
2463 /* Another interpolation, in the z dimension: */
2464 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2465 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2466 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2467
2468 /* Exit the loop if we've computed both cubes... */
/* The branch is predicated on the flag register and is emitted with a
   zero displacement; its src1 is patched further down (brw_set_src1 on
   p->store + origin) once the exit address is known. */
2469 origin = p->nr_insn;
2470 brw_push_insn_state( p );
2471 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2472 brw_set_mask_control( p, BRW_MASK_DISABLE );
2473 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2474 brw_pop_insn_state( p );
2475
2476 /* Save the result for the w=0 case, and increment the w coordinate: */
2477 brw_MOV( p, w0, tmp[ 0 ] );
2478 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2479 brw_imm_uw( 1 ) );
2480
2481 /* Loop around for the other cube. Explicitly set the flag register
2482 (unfortunately we must spend an extra instruction to do this: we
2483 can't rely on a side effect of the previous MOV or ADD because
2484 conditional modifiers which are normally true might be false in
2485 exceptional circumstances, e.g. given a NaN input; the add to
2486 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
/* The backwards displacement is an instruction-count difference shifted
   left by 4.  NOTE(review): presumably this converts to a byte offset at
   16 bytes per instruction -- confirm. */
2487 brw_push_insn_state( p );
2488 brw_set_mask_control( p, BRW_MASK_DISABLE );
2489 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2490 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2491 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2492 brw_pop_insn_state( p );
2493
2494 /* Patch the previous conditional branch now that we know the
2495 destination address. */
2496 brw_set_src1( p->store + origin,
2497 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2498
2499 /* The very last interpolation. */
2500 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2501 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2502 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2503
2504 /* scale by pow( 2, -15 ), as described above */
/* The result is stored over the first input parameter, which is where
   emit_noise4 reads it back from after the subroutine call. */
2505 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2506
2507 release_tmps( c, mark );
2508 }
2509
2510 static void emit_noise4( struct brw_wm_compile *c,
2511 const struct prog_instruction *inst )
2512 {
2513 struct brw_compile *p = &c->func;
2514 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2515 GLuint mask = inst->DstReg.WriteMask;
2516 int i;
2517 int mark = mark_tmps( c );
2518
2519 assert( mark == 0 );
2520
2521 src0 = get_src_reg( c, inst, 0, 0 );
2522 src1 = get_src_reg( c, inst, 0, 1 );
2523 src2 = get_src_reg( c, inst, 0, 2 );
2524 src3 = get_src_reg( c, inst, 0, 3 );
2525
2526 param0 = alloc_tmp( c );
2527 param1 = alloc_tmp( c );
2528 param2 = alloc_tmp( c );
2529 param3 = alloc_tmp( c );
2530
2531 brw_MOV( p, param0, src0 );
2532 brw_MOV( p, param1, src1 );
2533 brw_MOV( p, param2, src2 );
2534 brw_MOV( p, param3, src3 );
2535
2536 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2537
2538 /* Fill in the result: */
2539 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2540 for (i = 0 ; i < 4; i++) {
2541 if (mask & (1<<i)) {
2542 dst = get_dst_reg(c, inst, i);
2543 brw_MOV( p, dst, param0 );
2544 }
2545 }
2546 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2547 brw_set_saturate( p, 0 );
2548
2549 release_tmps( c, mark );
2550 }
2551
2552 static void emit_wpos_xy(struct brw_wm_compile *c,
2553 const struct prog_instruction *inst)
2554 {
2555 struct brw_compile *p = &c->func;
2556 GLuint mask = inst->DstReg.WriteMask;
2557 struct brw_reg src0[2], dst[2];
2558
2559 dst[0] = get_dst_reg(c, inst, 0);
2560 dst[1] = get_dst_reg(c, inst, 1);
2561
2562 src0[0] = get_src_reg(c, inst, 0, 0);
2563 src0[1] = get_src_reg(c, inst, 0, 1);
2564
2565 /* Calculate the pixel offset from window bottom left into destination
2566 * X and Y channels.
2567 */
2568 if (mask & WRITEMASK_X) {
2569 /* X' = X - origin_x */
2570 brw_ADD(p,
2571 dst[0],
2572 retype(src0[0], BRW_REGISTER_TYPE_W),
2573 brw_imm_d(0 - c->key.origin_x));
2574 }
2575
2576 if (mask & WRITEMASK_Y) {
2577 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2578 brw_ADD(p,
2579 dst[1],
2580 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2581 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2582 }
2583 }
2584
2585 /* TODO
2586 BIAS on SIMD8 not working yet...
2587 */
2588 static void emit_txb(struct brw_wm_compile *c,
2589 const struct prog_instruction *inst)
2590 {
2591 struct brw_compile *p = &c->func;
2592 struct brw_reg dst[4], src[4], payload_reg;
2593 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2594 GLuint i;
2595
2596 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2597
2598 for (i = 0; i < 4; i++)
2599 dst[i] = get_dst_reg(c, inst, i);
2600 for (i = 0; i < 4; i++)
2601 src[i] = get_src_reg(c, inst, 0, i);
2602
2603 switch (inst->TexSrcTarget) {
2604 case TEXTURE_1D_INDEX:
2605 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2606 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2607 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2608 break;
2609 case TEXTURE_2D_INDEX:
2610 case TEXTURE_RECT_INDEX:
2611 brw_MOV(p, brw_message_reg(2), src[0]);
2612 brw_MOV(p, brw_message_reg(3), src[1]);
2613 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2614 break;
2615 default:
2616 brw_MOV(p, brw_message_reg(2), src[0]);
2617 brw_MOV(p, brw_message_reg(3), src[1]);
2618 brw_MOV(p, brw_message_reg(4), src[2]);
2619 break;
2620 }
2621 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2622 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2623 brw_SAMPLE(p,
2624 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2625 1, /* msg_reg_nr */
2626 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2627 SURF_INDEX_TEXTURE(unit),
2628 unit, /* sampler */
2629 inst->DstReg.WriteMask, /* writemask */
2630 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS, /* msg_type */
2631 4, /* response_length */
2632 4, /* msg_length */
2633 0); /* eot */
2634 }
2635
2636
2637 static void emit_tex(struct brw_wm_compile *c,
2638 const struct prog_instruction *inst)
2639 {
2640 struct brw_compile *p = &c->func;
2641 struct brw_reg dst[4], src[4], payload_reg;
2642 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2643 GLuint msg_len;
2644 GLuint i, nr;
2645 GLuint emit;
2646 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2647
2648 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2649
2650 for (i = 0; i < 4; i++)
2651 dst[i] = get_dst_reg(c, inst, i);
2652 for (i = 0; i < 4; i++)
2653 src[i] = get_src_reg(c, inst, 0, i);
2654
2655 switch (inst->TexSrcTarget) {
2656 case TEXTURE_1D_INDEX:
2657 emit = WRITEMASK_X;
2658 nr = 1;
2659 break;
2660 case TEXTURE_2D_INDEX:
2661 case TEXTURE_RECT_INDEX:
2662 emit = WRITEMASK_XY;
2663 nr = 2;
2664 break;
2665 default:
2666 emit = WRITEMASK_XYZ;
2667 nr = 3;
2668 break;
2669 }
2670 msg_len = 1;
2671
2672 /* move/load S, T, R coords */
2673 for (i = 0; i < nr; i++) {
2674 static const GLuint swz[4] = {0,1,2,2};
2675 if (emit & (1<<i))
2676 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2677 else
2678 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2679 msg_len += 1;
2680 }
2681
2682 if (shadow) {
2683 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2684 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2685 }
2686
2687 brw_SAMPLE(p,
2688 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2689 1, /* msg_reg_nr */
2690 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2691 SURF_INDEX_TEXTURE(unit),
2692 unit, /* sampler */
2693 inst->DstReg.WriteMask, /* writemask */
2694 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE, /* msg_type */
2695 4, /* response_length */
2696 shadow ? 6 : 4, /* msg_length */
2697 0); /* eot */
2698
2699 if (shadow)
2700 brw_MOV(p, dst[3], brw_imm_f(1.0));
2701 }
2702
2703
2704 /**
2705 * Resolve subroutine calls after code emit is done.
2706 */
2707 static void post_wm_emit( struct brw_wm_compile *c )
2708 {
2709 brw_resolve_cals(&c->func);
2710 }
2711
2712 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2713 {
2714 #define MAX_IFSN 32
2715 #define MAX_LOOP_DEPTH 32
2716 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2717 struct brw_instruction *inst0, *inst1;
2718 int i, if_insn = 0, loop_insn = 0;
2719 struct brw_compile *p = &c->func;
2720 struct brw_indirect stack_index = brw_indirect(0, 0);
2721
2722 prealloc_reg(c);
2723 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2724 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2725
2726 for (i = 0; i < c->nr_fp_insns; i++) {
2727 const struct prog_instruction *inst = &c->prog_instructions[i];
2728
2729 c->cur_inst = i;
2730
2731 #if 0
2732 _mesa_printf("Inst %d: ", i);
2733 _mesa_print_instruction(inst);
2734 #endif
2735
2736 /* fetch any constants that this instruction needs */
2737 if (c->fp->use_const_buffer)
2738 fetch_constants(c, inst);
2739
2740 if (inst->CondUpdate)
2741 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2742 else
2743 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2744
2745 switch (inst->Opcode) {
2746 case WM_PIXELXY:
2747 emit_pixel_xy(c, inst);
2748 break;
2749 case WM_DELTAXY:
2750 emit_delta_xy(c, inst);
2751 break;
2752 case WM_PIXELW:
2753 emit_pixel_w(c, inst);
2754 break;
2755 case WM_LINTERP:
2756 emit_linterp(c, inst);
2757 break;
2758 case WM_PINTERP:
2759 emit_pinterp(c, inst);
2760 break;
2761 case WM_CINTERP:
2762 emit_cinterp(c, inst);
2763 break;
2764 case WM_WPOSXY:
2765 emit_wpos_xy(c, inst);
2766 break;
2767 case WM_FB_WRITE:
2768 emit_fb_write(c, inst);
2769 break;
2770 case WM_FRONTFACING:
2771 emit_frontfacing(c, inst);
2772 break;
2773 case OPCODE_ABS:
2774 emit_abs(c, inst);
2775 break;
2776 case OPCODE_ADD:
2777 emit_add(c, inst);
2778 break;
2779 case OPCODE_ARL:
2780 emit_arl(c, inst);
2781 break;
2782 case OPCODE_SUB:
2783 emit_sub(c, inst);
2784 break;
2785 case OPCODE_FRC:
2786 emit_frc(c, inst);
2787 break;
2788 case OPCODE_FLR:
2789 emit_flr(c, inst);
2790 break;
2791 case OPCODE_LRP:
2792 emit_lrp(c, inst);
2793 break;
2794 case OPCODE_TRUNC:
2795 emit_trunc(c, inst);
2796 break;
2797 case OPCODE_MOV:
2798 emit_mov(c, inst);
2799 break;
2800 case OPCODE_DP3:
2801 emit_dp3(c, inst);
2802 break;
2803 case OPCODE_DP4:
2804 emit_dp4(c, inst);
2805 break;
2806 case OPCODE_XPD:
2807 emit_xpd(c, inst);
2808 break;
2809 case OPCODE_DPH:
2810 emit_dph(c, inst);
2811 break;
2812 case OPCODE_RCP:
2813 emit_rcp(c, inst);
2814 break;
2815 case OPCODE_RSQ:
2816 emit_rsq(c, inst);
2817 break;
2818 case OPCODE_SIN:
2819 emit_sin(c, inst);
2820 break;
2821 case OPCODE_COS:
2822 emit_cos(c, inst);
2823 break;
2824 case OPCODE_EX2:
2825 emit_ex2(c, inst);
2826 break;
2827 case OPCODE_LG2:
2828 emit_lg2(c, inst);
2829 break;
2830 case OPCODE_MIN:
2831 case OPCODE_MAX:
2832 emit_min_max(c, inst);
2833 break;
2834 case OPCODE_DDX:
2835 emit_ddx(c, inst);
2836 break;
2837 case OPCODE_DDY:
2838 emit_ddy(c, inst);
2839 break;
2840 case OPCODE_SLT:
2841 emit_slt(c, inst);
2842 break;
2843 case OPCODE_SLE:
2844 emit_sle(c, inst);
2845 break;
2846 case OPCODE_SGT:
2847 emit_sgt(c, inst);
2848 break;
2849 case OPCODE_SGE:
2850 emit_sge(c, inst);
2851 break;
2852 case OPCODE_SEQ:
2853 emit_seq(c, inst);
2854 break;
2855 case OPCODE_SNE:
2856 emit_sne(c, inst);
2857 break;
2858 case OPCODE_MUL:
2859 emit_mul(c, inst);
2860 break;
2861 case OPCODE_POW:
2862 emit_pow(c, inst);
2863 break;
2864 case OPCODE_MAD:
2865 emit_mad(c, inst);
2866 break;
2867 case OPCODE_NOISE1:
2868 emit_noise1(c, inst);
2869 break;
2870 case OPCODE_NOISE2:
2871 emit_noise2(c, inst);
2872 break;
2873 case OPCODE_NOISE3:
2874 emit_noise3(c, inst);
2875 break;
2876 case OPCODE_NOISE4:
2877 emit_noise4(c, inst);
2878 break;
2879 case OPCODE_TEX:
2880 emit_tex(c, inst);
2881 break;
2882 case OPCODE_TXB:
2883 emit_txb(c, inst);
2884 break;
2885 case OPCODE_KIL_NV:
2886 emit_kil(c);
2887 break;
2888 case OPCODE_IF:
2889 assert(if_insn < MAX_IFSN);
2890 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2891 break;
2892 case OPCODE_ELSE:
2893 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2894 break;
2895 case OPCODE_ENDIF:
2896 assert(if_insn > 0);
2897 brw_ENDIF(p, if_inst[--if_insn]);
2898 break;
2899 case OPCODE_BGNSUB:
2900 brw_save_label(p, inst->Comment, p->nr_insn);
2901 break;
2902 case OPCODE_ENDSUB:
2903 /* no-op */
2904 break;
2905 case OPCODE_CAL:
2906 brw_push_insn_state(p);
2907 brw_set_mask_control(p, BRW_MASK_DISABLE);
2908 brw_set_access_mode(p, BRW_ALIGN_1);
2909 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2910 brw_set_access_mode(p, BRW_ALIGN_16);
2911 brw_ADD(p, get_addr_reg(stack_index),
2912 get_addr_reg(stack_index), brw_imm_d(4));
2913 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2914 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2915 brw_pop_insn_state(p);
2916 break;
2917
2918 case OPCODE_RET:
2919 brw_push_insn_state(p);
2920 brw_set_mask_control(p, BRW_MASK_DISABLE);
2921 brw_ADD(p, get_addr_reg(stack_index),
2922 get_addr_reg(stack_index), brw_imm_d(-4));
2923 brw_set_access_mode(p, BRW_ALIGN_1);
2924 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2925 brw_set_access_mode(p, BRW_ALIGN_16);
2926 brw_pop_insn_state(p);
2927
2928 break;
2929 case OPCODE_BGNLOOP:
2930 /* XXX may need to invalidate the current_constant regs */
2931 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2932 break;
2933 case OPCODE_BRK:
2934 brw_BREAK(p);
2935 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2936 break;
2937 case OPCODE_CONT:
2938 brw_CONT(p);
2939 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2940 break;
2941 case OPCODE_ENDLOOP:
2942 loop_insn--;
2943 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2944 /* patch all the BREAK instructions from
2945 last BEGINLOOP */
2946 while (inst0 > loop_inst[loop_insn]) {
2947 inst0--;
2948 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2949 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2950 inst0->bits3.if_else.pop_count = 0;
2951 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2952 inst0->bits3.if_else.jump_count = inst1 - inst0;
2953 inst0->bits3.if_else.pop_count = 0;
2954 }
2955 }
2956 break;
2957 default:
2958 _mesa_printf("unsupported IR in fragment shader %d\n",
2959 inst->Opcode);
2960 }
2961
2962 if (inst->CondUpdate)
2963 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2964 else
2965 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2966 }
2967 post_wm_emit(c);
2968 }
2969
2970
2971 /**
2972 * Do GPU code generation for shaders that use GLSL features such as
2973 * flow control. Other shaders will be compiled with the
2974 */
2975 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2976 {
2977 if (INTEL_DEBUG & DEBUG_WM) {
2978 _mesa_printf("brw_wm_glsl_emit:\n");
2979 }
2980
2981 /* initial instruction translation/simplification */
2982 brw_wm_pass_fp(c);
2983
2984 /* actual code generation */
2985 brw_wm_emit_glsl(brw, c);
2986
2987 if (INTEL_DEBUG & DEBUG_WM) {
2988 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2989 }
2990
2991 c->prog_data.total_grf = num_grf_used(c);
2992 c->prog_data.total_scratch = 0;
2993 }