7ff6125dcac27972b982d37d2b3788a9f2171319
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/**
 * Identifiers for instruction sequences that are emitted once and then
 * re-entered through invoke_subroutine(); each value is used as an index
 * into c->subroutines[].
 */
enum _subroutine {
   SUB_NOISE1,
   SUB_NOISE2,
   SUB_NOISE3,
   SUB_NOISE4
};
12
13
14 /**
15 * Determine if the given fragment program uses GLSL features such
16 * as flow conditionals, loops, subroutines.
17 * Some GLSL shaders may use these features, others might not.
18 */
19 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
20 {
21 int i;
22 for (i = 0; i < fp->Base.NumInstructions; i++) {
23 const struct prog_instruction *inst = &fp->Base.Instructions[i];
24 switch (inst->Opcode) {
25 case OPCODE_ARL:
26 case OPCODE_IF:
27 case OPCODE_ENDIF:
28 case OPCODE_CAL:
29 case OPCODE_BRK:
30 case OPCODE_RET:
31 case OPCODE_DDX:
32 case OPCODE_DDY:
33 case OPCODE_NOISE1:
34 case OPCODE_NOISE2:
35 case OPCODE_NOISE3:
36 case OPCODE_NOISE4:
37 case OPCODE_BGNLOOP:
38 return GL_TRUE;
39 default:
40 break;
41 }
42 }
43 return GL_FALSE;
44 }
45
46
47
/* Forward declaration: alloc_grf() calls this before it is defined. */
static void reclaim_temps(struct brw_wm_compile *c);
50
51
52 /** Mark GRF register as used. */
53 static void
54 prealloc_grf(struct brw_wm_compile *c, int r)
55 {
56 c->used_grf[r] = GL_TRUE;
57 }
58
59
60 /** Mark given GRF register as not in use. */
61 static void
62 release_grf(struct brw_wm_compile *c, int r)
63 {
64 /*assert(c->used_grf[r]);*/
65 c->used_grf[r] = GL_FALSE;
66 c->first_free_grf = MIN2(c->first_free_grf, r);
67 }
68
69
70 /** Return index of a free GRF, mark it as used. */
71 static int
72 alloc_grf(struct brw_wm_compile *c)
73 {
74 GLuint r;
75 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
76 if (!c->used_grf[r]) {
77 c->used_grf[r] = GL_TRUE;
78 c->first_free_grf = r + 1; /* a guess */
79 return r;
80 }
81 }
82
83 /* no free temps, try to reclaim some */
84 reclaim_temps(c);
85 c->first_free_grf = 0;
86
87 /* try alloc again */
88 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
89 if (!c->used_grf[r]) {
90 c->used_grf[r] = GL_TRUE;
91 c->first_free_grf = r + 1; /* a guess */
92 return r;
93 }
94 }
95
96 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
97 assert(c->used_grf[r]);
98 }
99
100 /* really, no free GRF regs found */
101 if (!c->out_of_regs) {
102 /* print warning once per compilation */
103 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
104 c->out_of_regs = GL_TRUE;
105 }
106
107 return -1;
108 }
109
110
111 /** Return number of GRF registers used */
112 static int
113 num_grf_used(const struct brw_wm_compile *c)
114 {
115 int r;
116 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
117 if (c->used_grf[r])
118 return r + 1;
119 return 0;
120 }
121
122
123
124 /**
125 * Record the mapping of a Mesa register to a hardware register.
126 */
127 static void set_reg(struct brw_wm_compile *c, int file, int index,
128 int component, struct brw_reg reg)
129 {
130 c->wm_regs[file][index][component].reg = reg;
131 c->wm_regs[file][index][component].inited = GL_TRUE;
132 }
133
134 /**
135 * Examine instruction's write mask to find index of first component
136 * enabled for writing.
137 */
138 static int get_scalar_dst_index(const struct prog_instruction *inst)
139 {
140 int i;
141 for (i = 0; i < 4; i++)
142 if (inst->DstReg.WriteMask & (1<<i))
143 break;
144 return i;
145 }
146
147 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
148 {
149 struct brw_reg reg;
150
151 /* if we need to allocate another temp, grow the tmp_regs[] array */
152 if (c->tmp_index == c->tmp_max) {
153 int r = alloc_grf(c);
154 if (r < 0) {
155 /*printf("Out of temps in %s\n", __FUNCTION__);*/
156 r = 50; /* XXX random register! */
157 }
158 c->tmp_regs[ c->tmp_max++ ] = r;
159 }
160
161 /* form the GRF register */
162 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
163 /*printf("alloc_temp %d\n", reg.nr);*/
164 assert(reg.nr < BRW_WM_MAX_GRF);
165 return reg;
166
167 }
168
169 /**
170 * Save current temp register info.
171 * There must be a matching call to release_tmps().
172 */
173 static int mark_tmps(struct brw_wm_compile *c)
174 {
175 return c->tmp_index;
176 }
177
178 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
179 {
180 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
181 }
182
183 static void release_tmps(struct brw_wm_compile *c, int mark)
184 {
185 c->tmp_index = mark;
186 }
187
188 /**
189 * Convert Mesa src register to brw register.
190 *
191 * Since we're running in SOA mode each Mesa register corresponds to four
192 * hardware registers. We allocate the hardware registers as needed here.
193 *
194 * \param file register file, one of PROGRAM_x
195 * \param index register number
196 * \param component src component (X=0, Y=1, Z=2, W=3)
197 * \param nr not used?!?
198 * \param neg negate value?
199 * \param abs take absolute value?
200 */
201 static struct brw_reg
202 get_reg(struct brw_wm_compile *c, int file, int index, int component,
203 int nr, GLuint neg, GLuint abs)
204 {
205 struct brw_reg reg;
206 switch (file) {
207 case PROGRAM_STATE_VAR:
208 case PROGRAM_CONSTANT:
209 case PROGRAM_UNIFORM:
210 file = PROGRAM_STATE_VAR;
211 break;
212 case PROGRAM_UNDEFINED:
213 return brw_null_reg();
214 case PROGRAM_TEMPORARY:
215 case PROGRAM_INPUT:
216 case PROGRAM_OUTPUT:
217 case PROGRAM_PAYLOAD:
218 break;
219 default:
220 _mesa_problem(NULL, "Unexpected file in get_reg()");
221 return brw_null_reg();
222 }
223
224 assert(index < 256);
225 assert(component < 4);
226
227 /* see if we've already allocated a HW register for this Mesa register */
228 if (c->wm_regs[file][index][component].inited) {
229 /* yes, re-use */
230 reg = c->wm_regs[file][index][component].reg;
231 }
232 else {
233 /* no, allocate new register */
234 int grf = alloc_grf(c);
235 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
236 if (grf < 0) {
237 /* totally out of temps */
238 grf = 51; /* XXX random register! */
239 }
240
241 reg = brw_vec8_grf(grf, 0);
242 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
243
244 set_reg(c, file, index, component, reg);
245 }
246
247 if (neg & (1 << component)) {
248 reg = negate(reg);
249 }
250 if (abs)
251 reg = brw_abs(reg);
252 return reg;
253 }
254
255
256
257 /**
258 * This is called if we run out of GRF registers. Examine the live intervals
259 * of temp regs in the program and free those which won't be used again.
260 */
261 static void
262 reclaim_temps(struct brw_wm_compile *c)
263 {
264 GLint intBegin[MAX_PROGRAM_TEMPS];
265 GLint intEnd[MAX_PROGRAM_TEMPS];
266 int index;
267
268 /*printf("Reclaim temps:\n");*/
269
270 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
271 intBegin, intEnd);
272
273 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
274 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
275 /* program temp[i] can be freed */
276 int component;
277 /*printf(" temp[%d] is dead\n", index);*/
278 for (component = 0; component < 4; component++) {
279 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
280 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
281 release_grf(c, r);
282 /*
283 printf(" Reclaim temp %d, reg %d at inst %d\n",
284 index, r, c->cur_inst);
285 */
286 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
287 }
288 }
289 }
290 }
291 }
292
293
294
295
296 /**
297 * Preallocate registers. This sets up the Mesa to hardware register
298 * mapping for certain registers, such as constants (uniforms/state vars)
299 * and shader inputs.
300 */
301 static void prealloc_reg(struct brw_wm_compile *c)
302 {
303 int i, j;
304 struct brw_reg reg;
305 int urb_read_length = 0;
306 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
307 GLuint reg_index = 0;
308
309 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
310 c->first_free_grf = 0;
311
312 for (i = 0; i < 4; i++) {
313 if (i < c->key.nr_depth_regs)
314 reg = brw_vec8_grf(i * 2, 0);
315 else
316 reg = brw_vec8_grf(0, 0);
317 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
318 }
319 reg_index += 2 * c->key.nr_depth_regs;
320
321 /* constants */
322 {
323 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
324 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
325
326 /* use a real constant buffer, or just use a section of the GRF? */
327 /* XXX this heuristic may need adjustment... */
328 if ((nr_params + nr_temps) * 4 + reg_index > 80)
329 c->fp->use_const_buffer = GL_TRUE;
330 else
331 c->fp->use_const_buffer = GL_FALSE;
332 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
333
334 if (c->fp->use_const_buffer) {
335 /* We'll use a real constant buffer and fetch constants from
336 * it with a dataport read message.
337 */
338
339 /* number of float constants in CURBE */
340 c->prog_data.nr_params = 0;
341 }
342 else {
343 const struct gl_program_parameter_list *plist =
344 c->fp->program.Base.Parameters;
345 int index = 0;
346
347 /* number of float constants in CURBE */
348 c->prog_data.nr_params = 4 * nr_params;
349
350 /* loop over program constants (float[4]) */
351 for (i = 0; i < nr_params; i++) {
352 /* loop over XYZW channels */
353 for (j = 0; j < 4; j++, index++) {
354 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
355 /* Save pointer to parameter/constant value.
356 * Constants will be copied in prepare_constant_buffer()
357 */
358 c->prog_data.param[index] = &plist->ParameterValues[i][j];
359 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
360 }
361 }
362 /* number of constant regs used (each reg is float[8]) */
363 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
364 reg_index += c->nr_creg;
365 }
366 }
367
368 /* fragment shader inputs */
369 for (i = 0; i < VERT_RESULT_MAX; i++) {
370 int fp_input;
371
372 if (i >= VERT_RESULT_VAR0)
373 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
374 else if (i <= VERT_RESULT_TEX7)
375 fp_input = i;
376 else
377 fp_input = -1;
378
379 if (fp_input >= 0 && inputs & (1 << fp_input)) {
380 urb_read_length = reg_index;
381 reg = brw_vec8_grf(reg_index, 0);
382 for (j = 0; j < 4; j++)
383 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
384 }
385 if (c->key.vp_outputs_written & (1 << i)) {
386 reg_index += 2;
387 }
388 }
389
390 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
391 c->prog_data.urb_read_length = urb_read_length;
392 c->prog_data.curb_read_length = c->nr_creg;
393 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
394 reg_index++;
395 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
396 reg_index += 2;
397
398 /* mark GRF regs [0..reg_index-1] as in-use */
399 for (i = 0; i < reg_index; i++)
400 prealloc_grf(c, i);
401
402 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
403 prealloc_grf(c, 126);
404 prealloc_grf(c, 127);
405
406 /* An instruction may reference up to three constants.
407 * They'll be found in these registers.
408 * XXX alloc these on demand!
409 */
410 if (c->fp->use_const_buffer) {
411 for (i = 0; i < 3; i++) {
412 c->current_const[i].index = -1;
413 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
414 }
415 }
416 #if 0
417 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
418 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
419 #endif
420 }
421
422
423 /**
424 * Check if any of the instruction's src registers are constants, uniforms,
425 * or statevars. If so, fetch any constants that we don't already have in
426 * the three GRF slots.
427 */
428 static void fetch_constants(struct brw_wm_compile *c,
429 const struct prog_instruction *inst)
430 {
431 struct brw_compile *p = &c->func;
432 GLuint i;
433
434 /* loop over instruction src regs */
435 for (i = 0; i < 3; i++) {
436 const struct prog_src_register *src = &inst->SrcReg[i];
437 if (src->File == PROGRAM_STATE_VAR ||
438 src->File == PROGRAM_CONSTANT ||
439 src->File == PROGRAM_UNIFORM) {
440 c->current_const[i].index = src->Index;
441
442 #if 0
443 printf(" fetch const[%d] for arg %d into reg %d\n",
444 src->Index, i, c->current_const[i].reg.nr);
445 #endif
446
447 /* need to fetch the constant now */
448 brw_dp_READ_4(p,
449 c->current_const[i].reg, /* writeback dest */
450 src->RelAddr, /* relative indexing? */
451 16 * src->Index, /* byte offset */
452 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
453 );
454 }
455 }
456 }
457
458
459 /**
460 * Convert Mesa dst register to brw register.
461 */
462 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
463 const struct prog_instruction *inst,
464 GLuint component)
465 {
466 const int nr = 1;
467 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
468 0, 0);
469 }
470
471
472 static struct brw_reg
473 get_src_reg_const(struct brw_wm_compile *c,
474 const struct prog_instruction *inst,
475 GLuint srcRegIndex, GLuint component)
476 {
477 /* We should have already fetched the constant from the constant
478 * buffer in fetch_constants(). Now we just have to return a
479 * register description that extracts the needed component and
480 * smears it across all eight vector components.
481 */
482 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
483 struct brw_reg const_reg;
484
485 assert(component < 4);
486 assert(srcRegIndex < 3);
487 assert(c->current_const[srcRegIndex].index != -1);
488 const_reg = c->current_const[srcRegIndex].reg;
489
490 /* extract desired float from the const_reg, and smear */
491 const_reg = stride(const_reg, 0, 1, 0);
492 const_reg.subnr = component * 4;
493
494 if (src->Negate & (1 << component))
495 const_reg = negate(const_reg);
496 if (src->Abs)
497 const_reg = brw_abs(const_reg);
498
499 #if 0
500 printf(" form const[%d].%d for arg %d, reg %d\n",
501 c->current_const[srcRegIndex].index,
502 component,
503 srcRegIndex,
504 const_reg.nr);
505 #endif
506
507 return const_reg;
508 }
509
510
511 /**
512 * Convert Mesa src register to brw register.
513 */
514 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
515 const struct prog_instruction *inst,
516 GLuint srcRegIndex, GLuint channel)
517 {
518 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
519 const GLuint nr = 1;
520 const GLuint component = GET_SWZ(src->Swizzle, channel);
521
522 /* Extended swizzle terms */
523 if (component == SWIZZLE_ZERO) {
524 return brw_imm_f(0.0F);
525 }
526 else if (component == SWIZZLE_ONE) {
527 return brw_imm_f(1.0F);
528 }
529
530 if (c->fp->use_const_buffer &&
531 (src->File == PROGRAM_STATE_VAR ||
532 src->File == PROGRAM_CONSTANT ||
533 src->File == PROGRAM_UNIFORM)) {
534 return get_src_reg_const(c, inst, srcRegIndex, component);
535 }
536 else {
537 /* other type of source register */
538 return get_reg(c, src->File, src->Index, component, nr,
539 src->Negate, src->Abs);
540 }
541 }
542
543
544 /**
545 * Same as \sa get_src_reg() but if the register is a literal, emit
546 * a brw_reg encoding the literal.
547 * Note that a brw instruction only allows one src operand to be a literal.
548 * For instructions with more than one operand, only the second can be a
549 * literal. This means that we treat some literals as constants/uniforms
550 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
551 *
552 */
553 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
554 const struct prog_instruction *inst,
555 GLuint srcRegIndex, GLuint channel)
556 {
557 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
558 if (src->File == PROGRAM_CONSTANT) {
559 /* a literal */
560 const int component = GET_SWZ(src->Swizzle, channel);
561 const GLfloat *param =
562 c->fp->program.Base.Parameters->ParameterValues[src->Index];
563 GLfloat value = param[component];
564 if (src->Negate & (1 << channel))
565 value = -value;
566 if (src->Abs)
567 value = FABSF(value);
568 #if 0
569 printf(" form immed value %f for chan %d\n", value, channel);
570 #endif
571 return brw_imm_f(value);
572 }
573 else {
574 return get_src_reg(c, inst, srcRegIndex, channel);
575 }
576 }
577
578
579 /**
580 * Subroutines are minimal support for resusable instruction sequences.
581 * They are implemented as simply as possible to minimise overhead: there
582 * is no explicit support for communication between the caller and callee
583 * other than saving the return address in a temporary register, nor is
584 * there any automatic local storage. This implies that great care is
585 * required before attempting reentrancy or any kind of nested
586 * subroutine invocations.
587 */
588 static void invoke_subroutine( struct brw_wm_compile *c,
589 enum _subroutine subroutine,
590 void (*emit)( struct brw_wm_compile * ) )
591 {
592 struct brw_compile *p = &c->func;
593
594 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
595
596 if( c->subroutines[ subroutine ] ) {
597 /* subroutine previously emitted: reuse existing instructions */
598
599 int mark = mark_tmps( c );
600 struct brw_reg return_address = retype( alloc_tmp( c ),
601 BRW_REGISTER_TYPE_UD );
602 int here = p->nr_insn;
603
604 brw_push_insn_state(p);
605 brw_set_mask_control(p, BRW_MASK_DISABLE);
606 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
607
608 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
609 brw_imm_d( ( c->subroutines[ subroutine ] -
610 here - 1 ) << 4 ) );
611 brw_pop_insn_state(p);
612
613 release_tmps( c, mark );
614 } else {
615 /* previously unused subroutine: emit, and mark for later reuse */
616
617 int mark = mark_tmps( c );
618 struct brw_reg return_address = retype( alloc_tmp( c ),
619 BRW_REGISTER_TYPE_UD );
620 struct brw_instruction *calc;
621 int base = p->nr_insn;
622
623 brw_push_insn_state(p);
624 brw_set_mask_control(p, BRW_MASK_DISABLE);
625 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
626 brw_pop_insn_state(p);
627
628 c->subroutines[ subroutine ] = p->nr_insn;
629
630 emit( c );
631
632 brw_push_insn_state(p);
633 brw_set_mask_control(p, BRW_MASK_DISABLE);
634 brw_MOV( p, brw_ip_reg(), return_address );
635 brw_pop_insn_state(p);
636
637 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
638
639 release_tmps( c, mark );
640 }
641 }
642
643 static void emit_abs( struct brw_wm_compile *c,
644 const struct prog_instruction *inst)
645 {
646 int i;
647 struct brw_compile *p = &c->func;
648 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
649 for (i = 0; i < 4; i++) {
650 if (inst->DstReg.WriteMask & (1<<i)) {
651 struct brw_reg src, dst;
652 dst = get_dst_reg(c, inst, i);
653 src = get_src_reg(c, inst, 0, i);
654 brw_MOV(p, dst, brw_abs(src));
655 }
656 }
657 brw_set_saturate(p, 0);
658 }
659
660 static void emit_trunc( struct brw_wm_compile *c,
661 const struct prog_instruction *inst)
662 {
663 int i;
664 struct brw_compile *p = &c->func;
665 GLuint mask = inst->DstReg.WriteMask;
666 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
667 for (i = 0; i < 4; i++) {
668 if (mask & (1<<i)) {
669 struct brw_reg src, dst;
670 dst = get_dst_reg(c, inst, i);
671 src = get_src_reg(c, inst, 0, i);
672 brw_RNDZ(p, dst, src);
673 }
674 }
675 brw_set_saturate(p, 0);
676 }
677
678 static void emit_mov( struct brw_wm_compile *c,
679 const struct prog_instruction *inst)
680 {
681 int i;
682 struct brw_compile *p = &c->func;
683 GLuint mask = inst->DstReg.WriteMask;
684 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
685 for (i = 0; i < 4; i++) {
686 if (mask & (1<<i)) {
687 struct brw_reg src, dst;
688 dst = get_dst_reg(c, inst, i);
689 /* XXX some moves from immediate value don't work reliably!!! */
690 /*src = get_src_reg_imm(c, inst, 0, i);*/
691 src = get_src_reg(c, inst, 0, i);
692 brw_MOV(p, dst, src);
693 }
694 }
695 brw_set_saturate(p, 0);
696 }
697
698 static void emit_pixel_xy(struct brw_wm_compile *c,
699 const struct prog_instruction *inst)
700 {
701 struct brw_reg r1 = brw_vec1_grf(1, 0);
702 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
703
704 struct brw_reg dst0, dst1;
705 struct brw_compile *p = &c->func;
706 GLuint mask = inst->DstReg.WriteMask;
707
708 dst0 = get_dst_reg(c, inst, 0);
709 dst1 = get_dst_reg(c, inst, 1);
710 /* Calculate pixel centers by adding 1 or 0 to each of the
711 * micro-tile coordinates passed in r1.
712 */
713 if (mask & WRITEMASK_X) {
714 brw_ADD(p,
715 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
716 stride(suboffset(r1_uw, 4), 2, 4, 0),
717 brw_imm_v(0x10101010));
718 }
719
720 if (mask & WRITEMASK_Y) {
721 brw_ADD(p,
722 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
723 stride(suboffset(r1_uw, 5), 2, 4, 0),
724 brw_imm_v(0x11001100));
725 }
726 }
727
728 static void emit_delta_xy(struct brw_wm_compile *c,
729 const struct prog_instruction *inst)
730 {
731 struct brw_reg r1 = brw_vec1_grf(1, 0);
732 struct brw_reg dst0, dst1, src0, src1;
733 struct brw_compile *p = &c->func;
734 GLuint mask = inst->DstReg.WriteMask;
735
736 dst0 = get_dst_reg(c, inst, 0);
737 dst1 = get_dst_reg(c, inst, 1);
738 src0 = get_src_reg(c, inst, 0, 0);
739 src1 = get_src_reg(c, inst, 0, 1);
740 /* Calc delta X,Y by subtracting origin in r1 from the pixel
741 * centers.
742 */
743 if (mask & WRITEMASK_X) {
744 brw_ADD(p,
745 dst0,
746 retype(src0, BRW_REGISTER_TYPE_UW),
747 negate(r1));
748 }
749
750 if (mask & WRITEMASK_Y) {
751 brw_ADD(p,
752 dst1,
753 retype(src1, BRW_REGISTER_TYPE_UW),
754 negate(suboffset(r1,1)));
755
756 }
757 }
758
759 static void fire_fb_write( struct brw_wm_compile *c,
760 GLuint base_reg,
761 GLuint nr,
762 GLuint target,
763 GLuint eot)
764 {
765 struct brw_compile *p = &c->func;
766 /* Pass through control information:
767 */
768 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
769 {
770 brw_push_insn_state(p);
771 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
772 brw_MOV(p,
773 brw_message_reg(base_reg + 1),
774 brw_vec8_grf(1, 0));
775 brw_pop_insn_state(p);
776 }
777 /* Send framebuffer write message: */
778 brw_fb_WRITE(p,
779 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
780 base_reg,
781 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
782 target,
783 nr,
784 0,
785 eot);
786 }
787
788 static void emit_fb_write(struct brw_wm_compile *c,
789 const struct prog_instruction *inst)
790 {
791 struct brw_compile *p = &c->func;
792 int nr = 2;
793 int channel;
794 GLuint target, eot;
795 struct brw_reg src0;
796
797 /* Reserve a space for AA - may not be needed:
798 */
799 if (c->key.aa_dest_stencil_reg)
800 nr += 1;
801
802 brw_push_insn_state(p);
803 for (channel = 0; channel < 4; channel++) {
804 src0 = get_src_reg(c, inst, 0, channel);
805 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
806 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
807 brw_MOV(p, brw_message_reg(nr + channel), src0);
808 }
809 /* skip over the regs populated above: */
810 nr += 8;
811 brw_pop_insn_state(p);
812
813 if (c->key.source_depth_to_render_target) {
814 if (c->key.computes_depth) {
815 src0 = get_src_reg(c, inst, 2, 2);
816 brw_MOV(p, brw_message_reg(nr), src0);
817 }
818 else {
819 src0 = get_src_reg(c, inst, 1, 1);
820 brw_MOV(p, brw_message_reg(nr), src0);
821 }
822
823 nr += 2;
824 }
825
826 if (c->key.dest_depth_reg) {
827 const GLuint comp = c->key.dest_depth_reg / 2;
828 const GLuint off = c->key.dest_depth_reg % 2;
829
830 if (off != 0) {
831 /* XXX this code needs review/testing */
832 struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
833 struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
834
835 brw_push_insn_state(p);
836 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
837
838 brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
839 /* 2nd half? */
840 brw_MOV(p, brw_message_reg(nr+1), arg1_1);
841 brw_pop_insn_state(p);
842 }
843 else
844 {
845 struct brw_reg src = get_src_reg(c, inst, 1, 1);
846 brw_MOV(p, brw_message_reg(nr), src);
847 }
848 nr += 2;
849 }
850
851 target = inst->Aux >> 1;
852 eot = inst->Aux & 1;
853 fire_fb_write(c, 0, nr, target, eot);
854 }
855
856 static void emit_pixel_w( struct brw_wm_compile *c,
857 const struct prog_instruction *inst)
858 {
859 struct brw_compile *p = &c->func;
860 GLuint mask = inst->DstReg.WriteMask;
861 if (mask & WRITEMASK_W) {
862 struct brw_reg dst, src0, delta0, delta1;
863 struct brw_reg interp3;
864
865 dst = get_dst_reg(c, inst, 3);
866 src0 = get_src_reg(c, inst, 0, 0);
867 delta0 = get_src_reg(c, inst, 1, 0);
868 delta1 = get_src_reg(c, inst, 1, 1);
869
870 interp3 = brw_vec1_grf(src0.nr+1, 4);
871 /* Calc 1/w - just linterp wpos[3] optimized by putting the
872 * result straight into a message reg.
873 */
874 brw_LINE(p, brw_null_reg(), interp3, delta0);
875 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
876
877 /* Calc w */
878 brw_math_16( p, dst,
879 BRW_MATH_FUNCTION_INV,
880 BRW_MATH_SATURATE_NONE,
881 2, brw_null_reg(),
882 BRW_MATH_PRECISION_FULL);
883 }
884 }
885
886 static void emit_linterp(struct brw_wm_compile *c,
887 const struct prog_instruction *inst)
888 {
889 struct brw_compile *p = &c->func;
890 GLuint mask = inst->DstReg.WriteMask;
891 struct brw_reg interp[4];
892 struct brw_reg dst, delta0, delta1;
893 struct brw_reg src0;
894 GLuint nr, i;
895
896 src0 = get_src_reg(c, inst, 0, 0);
897 delta0 = get_src_reg(c, inst, 1, 0);
898 delta1 = get_src_reg(c, inst, 1, 1);
899 nr = src0.nr;
900
901 interp[0] = brw_vec1_grf(nr, 0);
902 interp[1] = brw_vec1_grf(nr, 4);
903 interp[2] = brw_vec1_grf(nr+1, 0);
904 interp[3] = brw_vec1_grf(nr+1, 4);
905
906 for(i = 0; i < 4; i++ ) {
907 if (mask & (1<<i)) {
908 dst = get_dst_reg(c, inst, i);
909 brw_LINE(p, brw_null_reg(), interp[i], delta0);
910 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
911 }
912 }
913 }
914
915 static void emit_cinterp(struct brw_wm_compile *c,
916 const struct prog_instruction *inst)
917 {
918 struct brw_compile *p = &c->func;
919 GLuint mask = inst->DstReg.WriteMask;
920
921 struct brw_reg interp[4];
922 struct brw_reg dst, src0;
923 GLuint nr, i;
924
925 src0 = get_src_reg(c, inst, 0, 0);
926 nr = src0.nr;
927
928 interp[0] = brw_vec1_grf(nr, 0);
929 interp[1] = brw_vec1_grf(nr, 4);
930 interp[2] = brw_vec1_grf(nr+1, 0);
931 interp[3] = brw_vec1_grf(nr+1, 4);
932
933 for(i = 0; i < 4; i++ ) {
934 if (mask & (1<<i)) {
935 dst = get_dst_reg(c, inst, i);
936 brw_MOV(p, dst, suboffset(interp[i],3));
937 }
938 }
939 }
940
941 static void emit_pinterp(struct brw_wm_compile *c,
942 const struct prog_instruction *inst)
943 {
944 struct brw_compile *p = &c->func;
945 GLuint mask = inst->DstReg.WriteMask;
946
947 struct brw_reg interp[4];
948 struct brw_reg dst, delta0, delta1;
949 struct brw_reg src0, w;
950 GLuint nr, i;
951
952 src0 = get_src_reg(c, inst, 0, 0);
953 delta0 = get_src_reg(c, inst, 1, 0);
954 delta1 = get_src_reg(c, inst, 1, 1);
955 w = get_src_reg(c, inst, 2, 3);
956 nr = src0.nr;
957
958 interp[0] = brw_vec1_grf(nr, 0);
959 interp[1] = brw_vec1_grf(nr, 4);
960 interp[2] = brw_vec1_grf(nr+1, 0);
961 interp[3] = brw_vec1_grf(nr+1, 4);
962
963 for(i = 0; i < 4; i++ ) {
964 if (mask & (1<<i)) {
965 dst = get_dst_reg(c, inst, i);
966 brw_LINE(p, brw_null_reg(), interp[i], delta0);
967 brw_MAC(p, dst, suboffset(interp[i],1),
968 delta1);
969 brw_MUL(p, dst, dst, w);
970 }
971 }
972 }
973
974 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
975 static void emit_frontfacing(struct brw_wm_compile *c,
976 const struct prog_instruction *inst)
977 {
978 struct brw_compile *p = &c->func;
979 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
980 struct brw_reg dst;
981 GLuint mask = inst->DstReg.WriteMask;
982 int i;
983
984 for (i = 0; i < 4; i++) {
985 if (mask & (1<<i)) {
986 dst = get_dst_reg(c, inst, i);
987 brw_MOV(p, dst, brw_imm_f(0.0));
988 }
989 }
990
991 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
992 * us front face
993 */
994 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
995 for (i = 0; i < 4; i++) {
996 if (mask & (1<<i)) {
997 dst = get_dst_reg(c, inst, i);
998 brw_MOV(p, dst, brw_imm_f(1.0));
999 }
1000 }
1001 brw_set_predicate_control_flag_value(p, 0xff);
1002 }
1003
1004 static void emit_xpd(struct brw_wm_compile *c,
1005 const struct prog_instruction *inst)
1006 {
1007 int i;
1008 struct brw_compile *p = &c->func;
1009 GLuint mask = inst->DstReg.WriteMask;
1010 for (i = 0; i < 4; i++) {
1011 GLuint i2 = (i+2)%3;
1012 GLuint i1 = (i+1)%3;
1013 if (mask & (1<<i)) {
1014 struct brw_reg src0, src1, dst;
1015 dst = get_dst_reg(c, inst, i);
1016 src0 = negate(get_src_reg(c, inst, 0, i2));
1017 src1 = get_src_reg_imm(c, inst, 1, i1);
1018 brw_MUL(p, brw_null_reg(), src0, src1);
1019 src0 = get_src_reg(c, inst, 0, i1);
1020 src1 = get_src_reg_imm(c, inst, 1, i2);
1021 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1022 brw_MAC(p, dst, src0, src1);
1023 brw_set_saturate(p, 0);
1024 }
1025 }
1026 brw_set_saturate(p, 0);
1027 }
1028
1029 static void emit_dp3(struct brw_wm_compile *c,
1030 const struct prog_instruction *inst)
1031 {
1032 struct brw_reg src0[3], src1[3], dst;
1033 int i;
1034 struct brw_compile *p = &c->func;
1035 for (i = 0; i < 3; i++) {
1036 src0[i] = get_src_reg(c, inst, 0, i);
1037 src1[i] = get_src_reg_imm(c, inst, 1, i);
1038 }
1039
1040 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1041 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1042 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1043 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1044 brw_MAC(p, dst, src0[2], src1[2]);
1045 brw_set_saturate(p, 0);
1046 }
1047
1048 static void emit_dp4(struct brw_wm_compile *c,
1049 const struct prog_instruction *inst)
1050 {
1051 struct brw_reg src0[4], src1[4], dst;
1052 int i;
1053 struct brw_compile *p = &c->func;
1054 for (i = 0; i < 4; i++) {
1055 src0[i] = get_src_reg(c, inst, 0, i);
1056 src1[i] = get_src_reg_imm(c, inst, 1, i);
1057 }
1058 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1059 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1060 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1061 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1062 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1063 brw_MAC(p, dst, src0[3], src1[3]);
1064 brw_set_saturate(p, 0);
1065 }
1066
1067 static void emit_dph(struct brw_wm_compile *c,
1068 const struct prog_instruction *inst)
1069 {
1070 struct brw_reg src0[4], src1[4], dst;
1071 int i;
1072 struct brw_compile *p = &c->func;
1073 for (i = 0; i < 4; i++) {
1074 src0[i] = get_src_reg(c, inst, 0, i);
1075 src1[i] = get_src_reg_imm(c, inst, 1, i);
1076 }
1077 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1078 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1079 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1080 brw_MAC(p, dst, src0[2], src1[2]);
1081 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1082 brw_ADD(p, dst, dst, src1[3]);
1083 brw_set_saturate(p, 0);
1084 }
1085
1086 /**
1087 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1088 * Note that the result of the function is smeared across the dest
1089 * register's X, Y, Z and W channels (subject to writemasking of course).
1090 */
1091 static void emit_math1(struct brw_wm_compile *c,
1092 const struct prog_instruction *inst, GLuint func)
1093 {
1094 struct brw_compile *p = &c->func;
1095 struct brw_reg src0, dst, tmp;
1096 const int mark = mark_tmps( c );
1097 int i;
1098
1099 tmp = alloc_tmp(c);
1100
1101 /* Get first component of source register */
1102 src0 = get_src_reg(c, inst, 0, 0);
1103
1104 /* tmp = func(src0) */
1105 brw_MOV(p, brw_message_reg(2), src0);
1106 brw_math(p,
1107 tmp,
1108 func,
1109 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1110 2,
1111 brw_null_reg(),
1112 BRW_MATH_DATA_VECTOR,
1113 BRW_MATH_PRECISION_FULL);
1114
1115 /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
1116
1117 /* replicate tmp value across enabled dest channels */
1118 for (i = 0; i < 4; i++) {
1119 if (inst->DstReg.WriteMask & (1 << i)) {
1120 dst = get_dst_reg(c, inst, i);
1121 brw_MOV(p, dst, tmp);
1122 }
1123 }
1124
1125 release_tmps(c, mark);
1126 }
1127
1128 static void emit_rcp(struct brw_wm_compile *c,
1129 const struct prog_instruction *inst)
1130 {
1131 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1132 }
1133
1134 static void emit_rsq(struct brw_wm_compile *c,
1135 const struct prog_instruction *inst)
1136 {
1137 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1138 }
1139
1140 static void emit_sin(struct brw_wm_compile *c,
1141 const struct prog_instruction *inst)
1142 {
1143 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1144 }
1145
1146 static void emit_cos(struct brw_wm_compile *c,
1147 const struct prog_instruction *inst)
1148 {
1149 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1150 }
1151
1152 static void emit_ex2(struct brw_wm_compile *c,
1153 const struct prog_instruction *inst)
1154 {
1155 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1156 }
1157
1158 static void emit_lg2(struct brw_wm_compile *c,
1159 const struct prog_instruction *inst)
1160 {
1161 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1162 }
1163
1164 static void emit_add(struct brw_wm_compile *c,
1165 const struct prog_instruction *inst)
1166 {
1167 struct brw_compile *p = &c->func;
1168 struct brw_reg src0, src1, dst;
1169 GLuint mask = inst->DstReg.WriteMask;
1170 int i;
1171 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1172 for (i = 0 ; i < 4; i++) {
1173 if (mask & (1<<i)) {
1174 dst = get_dst_reg(c, inst, i);
1175 src0 = get_src_reg(c, inst, 0, i);
1176 src1 = get_src_reg_imm(c, inst, 1, i);
1177 brw_ADD(p, dst, src0, src1);
1178 }
1179 }
1180 brw_set_saturate(p, 0);
1181 }
1182
1183 static void emit_arl(struct brw_wm_compile *c,
1184 const struct prog_instruction *inst)
1185 {
1186 struct brw_compile *p = &c->func;
1187 struct brw_reg src0, addr_reg;
1188 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1189 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1190 BRW_ARF_ADDRESS, 0);
1191 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1192 brw_MOV(p, addr_reg, src0);
1193 brw_set_saturate(p, 0);
1194 }
1195
1196 static void emit_sub(struct brw_wm_compile *c,
1197 const struct prog_instruction *inst)
1198 {
1199 struct brw_compile *p = &c->func;
1200 struct brw_reg src0, src1, dst;
1201 GLuint mask = inst->DstReg.WriteMask;
1202 int i;
1203 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1204 for (i = 0 ; i < 4; i++) {
1205 if (mask & (1<<i)) {
1206 dst = get_dst_reg(c, inst, i);
1207 src0 = get_src_reg(c, inst, 0, i);
1208 src1 = get_src_reg_imm(c, inst, 1, i);
1209 brw_ADD(p, dst, src0, negate(src1));
1210 }
1211 }
1212 brw_set_saturate(p, 0);
1213 }
1214
1215 static void emit_mul(struct brw_wm_compile *c,
1216 const struct prog_instruction *inst)
1217 {
1218 struct brw_compile *p = &c->func;
1219 struct brw_reg src0, src1, dst;
1220 GLuint mask = inst->DstReg.WriteMask;
1221 int i;
1222 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1223 for (i = 0 ; i < 4; i++) {
1224 if (mask & (1<<i)) {
1225 dst = get_dst_reg(c, inst, i);
1226 src0 = get_src_reg(c, inst, 0, i);
1227 src1 = get_src_reg_imm(c, inst, 1, i);
1228 brw_MUL(p, dst, src0, src1);
1229 }
1230 }
1231 brw_set_saturate(p, 0);
1232 }
1233
1234 static void emit_frc(struct brw_wm_compile *c,
1235 const struct prog_instruction *inst)
1236 {
1237 struct brw_compile *p = &c->func;
1238 struct brw_reg src0, dst;
1239 GLuint mask = inst->DstReg.WriteMask;
1240 int i;
1241 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1242 for (i = 0 ; i < 4; i++) {
1243 if (mask & (1<<i)) {
1244 dst = get_dst_reg(c, inst, i);
1245 src0 = get_src_reg_imm(c, inst, 0, i);
1246 brw_FRC(p, dst, src0);
1247 }
1248 }
1249 if (inst->SaturateMode != SATURATE_OFF)
1250 brw_set_saturate(p, 0);
1251 }
1252
1253 static void emit_flr(struct brw_wm_compile *c,
1254 const struct prog_instruction *inst)
1255 {
1256 struct brw_compile *p = &c->func;
1257 struct brw_reg src0, dst;
1258 GLuint mask = inst->DstReg.WriteMask;
1259 int i;
1260 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1261 for (i = 0 ; i < 4; i++) {
1262 if (mask & (1<<i)) {
1263 dst = get_dst_reg(c, inst, i);
1264 src0 = get_src_reg_imm(c, inst, 0, i);
1265 brw_RNDD(p, dst, src0);
1266 }
1267 }
1268 brw_set_saturate(p, 0);
1269 }
1270
1271
1272 static void emit_min_max(struct brw_wm_compile *c,
1273 const struct prog_instruction *inst)
1274 {
1275 struct brw_compile *p = &c->func;
1276 const GLuint mask = inst->DstReg.WriteMask;
1277 const int mark = mark_tmps(c);
1278 int i;
1279 brw_push_insn_state(p);
1280 for (i = 0; i < 4; i++) {
1281 if (mask & (1<<i)) {
1282 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1283 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1284 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1285 struct brw_reg dst;
1286 /* if dst==src0 or dst==src1 we need to use a temp reg */
1287 GLboolean use_temp = brw_same_reg(dst, src0) ||
1288 brw_same_reg(dst, src1);
1289 if (use_temp)
1290 dst = alloc_tmp(c);
1291 else
1292 dst = real_dst;
1293
1294 /*
1295 printf(" Min/max: dst %d src0 %d src1 %d\n",
1296 dst.nr, src0.nr, src1.nr);
1297 */
1298 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1299 brw_MOV(p, dst, src0);
1300 brw_set_saturate(p, 0);
1301
1302 if (inst->Opcode == OPCODE_MIN)
1303 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1304 else
1305 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1306
1307 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1308 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1309 brw_MOV(p, dst, src1);
1310 brw_set_saturate(p, 0);
1311 brw_set_predicate_control_flag_value(p, 0xff);
1312 if (use_temp)
1313 brw_MOV(p, real_dst, dst);
1314 }
1315 }
1316 brw_pop_insn_state(p);
1317 release_tmps(c, mark);
1318 }
1319
1320 static void emit_pow(struct brw_wm_compile *c,
1321 const struct prog_instruction *inst)
1322 {
1323 struct brw_compile *p = &c->func;
1324 struct brw_reg dst, src0, src1;
1325 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1326 src0 = get_src_reg_imm(c, inst, 0, 0);
1327 src1 = get_src_reg_imm(c, inst, 1, 0);
1328
1329 brw_MOV(p, brw_message_reg(2), src0);
1330 brw_MOV(p, brw_message_reg(3), src1);
1331
1332 brw_math(p,
1333 dst,
1334 BRW_MATH_FUNCTION_POW,
1335 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1336 2,
1337 brw_null_reg(),
1338 BRW_MATH_DATA_VECTOR,
1339 BRW_MATH_PRECISION_FULL);
1340 }
1341
1342 static void emit_lrp(struct brw_wm_compile *c,
1343 const struct prog_instruction *inst)
1344 {
1345 struct brw_compile *p = &c->func;
1346 GLuint mask = inst->DstReg.WriteMask;
1347 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1348 int i;
1349 int mark = mark_tmps(c);
1350 for (i = 0; i < 4; i++) {
1351 if (mask & (1<<i)) {
1352 dst = get_dst_reg(c, inst, i);
1353 src0 = get_src_reg(c, inst, 0, i);
1354
1355 src1 = get_src_reg_imm(c, inst, 1, i);
1356
1357 if (src1.nr == dst.nr) {
1358 tmp1 = alloc_tmp(c);
1359 brw_MOV(p, tmp1, src1);
1360 } else
1361 tmp1 = src1;
1362
1363 src2 = get_src_reg(c, inst, 2, i);
1364 if (src2.nr == dst.nr) {
1365 tmp2 = alloc_tmp(c);
1366 brw_MOV(p, tmp2, src2);
1367 } else
1368 tmp2 = src2;
1369
1370 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1371 brw_MUL(p, brw_null_reg(), dst, tmp2);
1372 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1373 brw_MAC(p, dst, src0, tmp1);
1374 brw_set_saturate(p, 0);
1375 }
1376 release_tmps(c, mark);
1377 }
1378 }
1379
1380 /**
1381 * For GLSL shaders, this KIL will be unconditional.
1382 * It may be contained inside an IF/ENDIF structure of course.
1383 */
1384 static void emit_kil(struct brw_wm_compile *c)
1385 {
1386 struct brw_compile *p = &c->func;
1387 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1388 brw_push_insn_state(p);
1389 brw_set_mask_control(p, BRW_MASK_DISABLE);
1390 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1391 brw_AND(p, depth, c->emit_mask_reg, depth);
1392 brw_pop_insn_state(p);
1393 }
1394
1395 static void emit_mad(struct brw_wm_compile *c,
1396 const struct prog_instruction *inst)
1397 {
1398 struct brw_compile *p = &c->func;
1399 GLuint mask = inst->DstReg.WriteMask;
1400 struct brw_reg dst, src0, src1, src2;
1401 int i;
1402
1403 for (i = 0; i < 4; i++) {
1404 if (mask & (1<<i)) {
1405 dst = get_dst_reg(c, inst, i);
1406 src0 = get_src_reg(c, inst, 0, i);
1407 src1 = get_src_reg_imm(c, inst, 1, i);
1408 src2 = get_src_reg_imm(c, inst, 2, i);
1409 brw_MUL(p, dst, src0, src1);
1410
1411 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1412 brw_ADD(p, dst, dst, src2);
1413 brw_set_saturate(p, 0);
1414 }
1415 }
1416 }
1417
1418 static void emit_sop(struct brw_wm_compile *c,
1419 const struct prog_instruction *inst, GLuint cond)
1420 {
1421 struct brw_compile *p = &c->func;
1422 GLuint mask = inst->DstReg.WriteMask;
1423 struct brw_reg dst, src0, src1;
1424 int i;
1425
1426 for (i = 0; i < 4; i++) {
1427 if (mask & (1<<i)) {
1428 dst = get_dst_reg(c, inst, i);
1429 src0 = get_src_reg(c, inst, 0, i);
1430 src1 = get_src_reg_imm(c, inst, 1, i);
1431 brw_push_insn_state(p);
1432 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1433 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1434 brw_MOV(p, dst, brw_imm_f(0.0));
1435 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1436 brw_MOV(p, dst, brw_imm_f(1.0));
1437 brw_pop_insn_state(p);
1438 }
1439 }
1440 }
1441
1442 static void emit_slt(struct brw_wm_compile *c,
1443 const struct prog_instruction *inst)
1444 {
1445 emit_sop(c, inst, BRW_CONDITIONAL_L);
1446 }
1447
1448 static void emit_sle(struct brw_wm_compile *c,
1449 const struct prog_instruction *inst)
1450 {
1451 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1452 }
1453
1454 static void emit_sgt(struct brw_wm_compile *c,
1455 const struct prog_instruction *inst)
1456 {
1457 emit_sop(c, inst, BRW_CONDITIONAL_G);
1458 }
1459
1460 static void emit_sge(struct brw_wm_compile *c,
1461 const struct prog_instruction *inst)
1462 {
1463 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1464 }
1465
1466 static void emit_seq(struct brw_wm_compile *c,
1467 const struct prog_instruction *inst)
1468 {
1469 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1470 }
1471
1472 static void emit_sne(struct brw_wm_compile *c,
1473 const struct prog_instruction *inst)
1474 {
1475 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1476 }
1477
1478 static void emit_ddx(struct brw_wm_compile *c,
1479 const struct prog_instruction *inst)
1480 {
1481 struct brw_compile *p = &c->func;
1482 GLuint mask = inst->DstReg.WriteMask;
1483 struct brw_reg interp[4];
1484 struct brw_reg dst;
1485 struct brw_reg src0, w;
1486 GLuint nr, i;
1487 src0 = get_src_reg(c, inst, 0, 0);
1488 w = get_src_reg(c, inst, 1, 3);
1489 nr = src0.nr;
1490 interp[0] = brw_vec1_grf(nr, 0);
1491 interp[1] = brw_vec1_grf(nr, 4);
1492 interp[2] = brw_vec1_grf(nr+1, 0);
1493 interp[3] = brw_vec1_grf(nr+1, 4);
1494 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1495 for(i = 0; i < 4; i++ ) {
1496 if (mask & (1<<i)) {
1497 dst = get_dst_reg(c, inst, i);
1498 brw_MOV(p, dst, interp[i]);
1499 brw_MUL(p, dst, dst, w);
1500 }
1501 }
1502 brw_set_saturate(p, 0);
1503 }
1504
1505 static void emit_ddy(struct brw_wm_compile *c,
1506 const struct prog_instruction *inst)
1507 {
1508 struct brw_compile *p = &c->func;
1509 GLuint mask = inst->DstReg.WriteMask;
1510 struct brw_reg interp[4];
1511 struct brw_reg dst;
1512 struct brw_reg src0, w;
1513 GLuint nr, i;
1514
1515 src0 = get_src_reg(c, inst, 0, 0);
1516 nr = src0.nr;
1517 w = get_src_reg(c, inst, 1, 3);
1518 interp[0] = brw_vec1_grf(nr, 0);
1519 interp[1] = brw_vec1_grf(nr, 4);
1520 interp[2] = brw_vec1_grf(nr+1, 0);
1521 interp[3] = brw_vec1_grf(nr+1, 4);
1522 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1523 for(i = 0; i < 4; i++ ) {
1524 if (mask & (1<<i)) {
1525 dst = get_dst_reg(c, inst, i);
1526 brw_MOV(p, dst, suboffset(interp[i], 1));
1527 brw_MUL(p, dst, dst, w);
1528 }
1529 }
1530 brw_set_saturate(p, 0);
1531 }
1532
1533 static INLINE struct brw_reg high_words( struct brw_reg reg )
1534 {
1535 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1536 0, 8, 2 );
1537 }
1538
1539 static INLINE struct brw_reg low_words( struct brw_reg reg )
1540 {
1541 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1542 }
1543
1544 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1545 {
1546 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1547 }
1548
1549 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1550 {
1551 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1552 0, 16, 2 );
1553 }
1554
1555 /* One-, two- and three-dimensional Perlin noise, similar to the description
1556 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1557 static void noise1_sub( struct brw_wm_compile *c ) {
1558
1559 struct brw_compile *p = &c->func;
1560 struct brw_reg param,
1561 x0, x1, /* gradients at each end */
1562 t, tmp[ 2 ], /* float temporaries */
1563 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1564 int i;
1565 int mark = mark_tmps( c );
1566
1567 x0 = alloc_tmp( c );
1568 x1 = alloc_tmp( c );
1569 t = alloc_tmp( c );
1570 tmp[ 0 ] = alloc_tmp( c );
1571 tmp[ 1 ] = alloc_tmp( c );
1572 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1573 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1574 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1575 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1576 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1577
1578 param = lookup_tmp( c, mark - 2 );
1579
1580 brw_set_access_mode( p, BRW_ALIGN_1 );
1581
1582 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1583
1584 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1585 be hashed. Also compute the remainder (offset within the unit
1586 length), interleaved to reduce register dependency penalties. */
1587 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1588 brw_FRC( p, param, param );
1589 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1590 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1591 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1592
1593 /* We're now ready to perform the hashing. The two hashes are
1594 interleaved for performance. The hash function used is
1595 designed to rapidly achieve avalanche and require only 32x16
1596 bit multiplication, and 16-bit swizzles (which we get for
1597 free). We can't use immediate operands in the multiplies,
1598 because immediates are permitted only in src1 and the 16-bit
1599 factor is permitted only in src0. */
1600 for( i = 0; i < 2; i++ )
1601 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1602 for( i = 0; i < 2; i++ )
1603 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1604 high_words( itmp[ i ] ) );
1605 for( i = 0; i < 2; i++ )
1606 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1607 for( i = 0; i < 2; i++ )
1608 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1609 high_words( itmp[ i ] ) );
1610 for( i = 0; i < 2; i++ )
1611 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1612 for( i = 0; i < 2; i++ )
1613 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1614 high_words( itmp[ i ] ) );
1615
1616 /* Now we want to initialise the two gradients based on the
1617 hashes. Format conversion from signed integer to float leaves
1618 everything scaled too high by a factor of pow( 2, 31 ), but
1619 we correct for that right at the end. */
1620 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1621 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1622 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1623
1624 brw_MUL( p, x0, x0, param );
1625 brw_MUL( p, x1, x1, t );
1626
1627 /* We interpolate between the gradients using the polynomial
1628 6t^5 - 15t^4 + 10t^3 (Perlin). */
1629 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1630 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1631 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1632 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1633 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1634 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1635 pipeline */
1636 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1637 brw_MUL( p, param, tmp[ 0 ], param );
1638 brw_MUL( p, x1, x1, param );
1639 brw_ADD( p, x0, x0, x1 );
1640 /* scale by pow( 2, -30 ), to compensate for the format conversion
1641 above and an extra factor of 2 so that a single gradient covers
1642 the [-1,1] range */
1643 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1644
1645 release_tmps( c, mark );
1646 }
1647
1648 static void emit_noise1( struct brw_wm_compile *c,
1649 const struct prog_instruction *inst )
1650 {
1651 struct brw_compile *p = &c->func;
1652 struct brw_reg src, param, dst;
1653 GLuint mask = inst->DstReg.WriteMask;
1654 int i;
1655 int mark = mark_tmps( c );
1656
1657 assert( mark == 0 );
1658
1659 src = get_src_reg( c, inst, 0, 0 );
1660
1661 param = alloc_tmp( c );
1662
1663 brw_MOV( p, param, src );
1664
1665 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1666
1667 /* Fill in the result: */
1668 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1669 for (i = 0 ; i < 4; i++) {
1670 if (mask & (1<<i)) {
1671 dst = get_dst_reg(c, inst, i);
1672 brw_MOV( p, dst, param );
1673 }
1674 }
1675 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1676 brw_set_saturate( p, 0 );
1677
1678 release_tmps( c, mark );
1679 }
1680
1681 static void noise2_sub( struct brw_wm_compile *c ) {
1682
1683 struct brw_compile *p = &c->func;
1684 struct brw_reg param0, param1,
1685 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1686 t, tmp[ 4 ], /* float temporaries */
1687 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1688 int i;
1689 int mark = mark_tmps( c );
1690
1691 x0y0 = alloc_tmp( c );
1692 x0y1 = alloc_tmp( c );
1693 x1y0 = alloc_tmp( c );
1694 x1y1 = alloc_tmp( c );
1695 t = alloc_tmp( c );
1696 for( i = 0; i < 4; i++ ) {
1697 tmp[ i ] = alloc_tmp( c );
1698 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1699 }
1700 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1701 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1702 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1703
1704 param0 = lookup_tmp( c, mark - 3 );
1705 param1 = lookup_tmp( c, mark - 2 );
1706
1707 brw_set_access_mode( p, BRW_ALIGN_1 );
1708
1709 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1710 be hashed. Also compute the remainders (offsets within the unit
1711 square), interleaved to reduce register dependency penalties. */
1712 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1713 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1714 brw_FRC( p, param0, param0 );
1715 brw_FRC( p, param1, param1 );
1716 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1717 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1718 low_words( itmp[ 1 ] ) );
1719 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1720 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1721 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1722 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1723 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1724
1725 /* We're now ready to perform the hashing. The four hashes are
1726 interleaved for performance. The hash function used is
1727 designed to rapidly achieve avalanche and require only 32x16
1728 bit multiplication, and 16-bit swizzles (which we get for
1729 free). We can't use immediate operands in the multiplies,
1730 because immediates are permitted only in src1 and the 16-bit
1731 factor is permitted only in src0. */
1732 for( i = 0; i < 4; i++ )
1733 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1734 for( i = 0; i < 4; i++ )
1735 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1736 high_words( itmp[ i ] ) );
1737 for( i = 0; i < 4; i++ )
1738 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1739 for( i = 0; i < 4; i++ )
1740 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1741 high_words( itmp[ i ] ) );
1742 for( i = 0; i < 4; i++ )
1743 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1744 for( i = 0; i < 4; i++ )
1745 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1746 high_words( itmp[ i ] ) );
1747
1748 /* Now we want to initialise the four gradients based on the
1749 hashes. Format conversion from signed integer to float leaves
1750 everything scaled too high by a factor of pow( 2, 15 ), but
1751 we correct for that right at the end. */
1752 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1753 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1754 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1755 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1756 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1757
1758 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1759 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1760 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1761 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1762
1763 brw_MUL( p, x1y0, x1y0, t );
1764 brw_MUL( p, x1y1, x1y1, t );
1765 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1766 brw_MUL( p, x0y0, x0y0, param0 );
1767 brw_MUL( p, x0y1, x0y1, param0 );
1768
1769 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1770 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1771 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1772 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1773
1774 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1775 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1776 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1777 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1778
1779 /* We interpolate between the gradients using the polynomial
1780 6t^5 - 15t^4 + 10t^3 (Perlin). */
1781 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1782 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1783 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1784 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1785 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1786 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1787 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1788 pipeline */
1789 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1790 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1791 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1792 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1793 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1794 pipeline */
1795 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1796 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1797 brw_MUL( p, param0, tmp[ 0 ], param0 );
1798 brw_MUL( p, param1, tmp[ 1 ], param1 );
1799
1800 /* Here we interpolate in the y dimension... */
1801 brw_MUL( p, x0y1, x0y1, param1 );
1802 brw_MUL( p, x1y1, x1y1, param1 );
1803 brw_ADD( p, x0y0, x0y0, x0y1 );
1804 brw_ADD( p, x1y0, x1y0, x1y1 );
1805
1806 /* And now in x. There are horrible register dependencies here,
1807 but we have nothing else to do. */
1808 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1809 brw_MUL( p, x1y0, x1y0, param0 );
1810 brw_ADD( p, x0y0, x0y0, x1y0 );
1811
1812 /* scale by pow( 2, -15 ), as described above */
1813 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1814
1815 release_tmps( c, mark );
1816 }
1817
1818 static void emit_noise2( struct brw_wm_compile *c,
1819 const struct prog_instruction *inst )
1820 {
1821 struct brw_compile *p = &c->func;
1822 struct brw_reg src0, src1, param0, param1, dst;
1823 GLuint mask = inst->DstReg.WriteMask;
1824 int i;
1825 int mark = mark_tmps( c );
1826
1827 assert( mark == 0 );
1828
1829 src0 = get_src_reg( c, inst, 0, 0 );
1830 src1 = get_src_reg( c, inst, 0, 1 );
1831
1832 param0 = alloc_tmp( c );
1833 param1 = alloc_tmp( c );
1834
1835 brw_MOV( p, param0, src0 );
1836 brw_MOV( p, param1, src1 );
1837
1838 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1839
1840 /* Fill in the result: */
1841 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1842 for (i = 0 ; i < 4; i++) {
1843 if (mask & (1<<i)) {
1844 dst = get_dst_reg(c, inst, i);
1845 brw_MOV( p, dst, param0 );
1846 }
1847 }
1848 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1849 brw_set_saturate( p, 0 );
1850
1851 release_tmps( c, mark );
1852 }
1853
1854 /**
1855 * The three-dimensional case is much like the one- and two- versions above,
1856 * but since the number of corners is rapidly growing we now pack 16 16-bit
1857 * hashes into each register to extract more parallelism from the EUs.
1858 */
1859 static void noise3_sub( struct brw_wm_compile *c ) {
1860
1861 struct brw_compile *p = &c->func;
1862 struct brw_reg param0, param1, param2,
1863 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1864 xi, yi, zi, /* interpolation coefficients */
1865 t, tmp[ 8 ], /* float temporaries */
1866 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1867 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1868 int i;
1869 int mark = mark_tmps( c );
1870
1871 x0y0 = alloc_tmp( c );
1872 x0y1 = alloc_tmp( c );
1873 x1y0 = alloc_tmp( c );
1874 x1y1 = alloc_tmp( c );
1875 xi = alloc_tmp( c );
1876 yi = alloc_tmp( c );
1877 zi = alloc_tmp( c );
1878 t = alloc_tmp( c );
1879 for( i = 0; i < 8; i++ ) {
1880 tmp[ i ] = alloc_tmp( c );
1881 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1882 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1883 }
1884
1885 param0 = lookup_tmp( c, mark - 4 );
1886 param1 = lookup_tmp( c, mark - 3 );
1887 param2 = lookup_tmp( c, mark - 2 );
1888
1889 brw_set_access_mode( p, BRW_ALIGN_1 );
1890
1891 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1892 be hashed. Also compute the remainders (offsets within the unit
1893 cube), interleaved to reduce register dependency penalties. */
1894 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1895 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1896 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1897 brw_FRC( p, param0, param0 );
1898 brw_FRC( p, param1, param1 );
1899 brw_FRC( p, param2, param2 );
1900 /* Since we now have only 16 bits of precision in the hash, we must
1901 be more careful about thorough mixing to maintain entropy as we
1902 squash the input vector into a small scalar. */
1903 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1904 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1905 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1906 brw_imm_uw( 0x9B93 ) );
1907 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1908 brw_imm_uw( 0xBC8F ) );
1909
1910 /* Temporarily disable the execution mask while we work with ExecSize=16
1911 channels (the mask is set for ExecSize=8 and is probably incorrect).
1912 Although this might cause execution of unwanted channels, the code
1913 writes only to temporary registers and has no side effects, so
1914 disabling the mask is harmless. */
1915 brw_push_insn_state( p );
1916 brw_set_mask_control( p, BRW_MASK_DISABLE );
1917 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1918 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1919 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1920
1921 /* We're now ready to perform the hashing. The eight hashes are
1922 interleaved for performance. The hash function used is
1923 designed to rapidly achieve avalanche and require only 16x16
1924 bit multiplication, and 8-bit swizzles (which we get for
1925 free). */
1926 for( i = 0; i < 4; i++ )
1927 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1928 for( i = 0; i < 4; i++ )
1929 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1930 odd_bytes( wtmp[ i ] ) );
1931 for( i = 0; i < 4; i++ )
1932 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1933 for( i = 0; i < 4; i++ )
1934 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1935 odd_bytes( wtmp[ i ] ) );
1936 brw_pop_insn_state( p );
1937
1938 /* Now we want to initialise the four rear gradients based on the
1939 hashes. Format conversion from signed integer to float leaves
1940 everything scaled too high by a factor of pow( 2, 15 ), but
1941 we correct for that right at the end. */
1942 /* x component */
1943 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1944 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1945 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1946 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1947 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1948
1949 brw_push_insn_state( p );
1950 brw_set_mask_control( p, BRW_MASK_DISABLE );
1951 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1952 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1953 brw_pop_insn_state( p );
1954
1955 brw_MUL( p, x1y0, x1y0, t );
1956 brw_MUL( p, x1y1, x1y1, t );
1957 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1958 brw_MUL( p, x0y0, x0y0, param0 );
1959 brw_MUL( p, x0y1, x0y1, param0 );
1960
1961 /* y component */
1962 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1963 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1964 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1965 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1966
1967 brw_push_insn_state( p );
1968 brw_set_mask_control( p, BRW_MASK_DISABLE );
1969 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1970 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1971 brw_pop_insn_state( p );
1972
1973 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1974 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1975 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1976 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1977 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1978
1979 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1980 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1981 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1982 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1983
1984 /* z component */
1985 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1986 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1987 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1988 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1989
1990 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1991 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1992 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1993 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1994
1995 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1996 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1997 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1998 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1999
2000 /* We interpolate between the gradients using the polynomial
2001 6t^5 - 15t^4 + 10t^3 (Perlin). */
2002 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
2003 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
2004 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
2005 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
2006 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
2007 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
2008 brw_MUL( p, xi, xi, param0 );
2009 brw_MUL( p, yi, yi, param1 );
2010 brw_MUL( p, zi, zi, param2 );
2011 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
2012 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
2013 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
2014 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
2015 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
2016 brw_MUL( p, xi, xi, param0 );
2017 brw_MUL( p, yi, yi, param1 );
2018 brw_MUL( p, zi, zi, param2 );
2019 brw_MUL( p, xi, xi, param0 );
2020 brw_MUL( p, yi, yi, param1 );
2021 brw_MUL( p, zi, zi, param2 );
2022 brw_MUL( p, xi, xi, param0 );
2023 brw_MUL( p, yi, yi, param1 );
2024 brw_MUL( p, zi, zi, param2 );
2025
2026 /* Here we interpolate in the y dimension... */
2027 brw_MUL( p, x0y1, x0y1, yi );
2028 brw_MUL( p, x1y1, x1y1, yi );
2029 brw_ADD( p, x0y0, x0y0, x0y1 );
2030 brw_ADD( p, x1y0, x1y0, x1y1 );
2031
2032 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2033 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2034 brw_MUL( p, x1y0, x1y0, xi );
2035 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2036
2037 /* Now do the same thing for the front four gradients... */
2038 /* x component */
2039 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2040 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2041 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2042 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2043
2044 brw_push_insn_state( p );
2045 brw_set_mask_control( p, BRW_MASK_DISABLE );
2046 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2047 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2048 brw_pop_insn_state( p );
2049
2050 brw_MUL( p, x1y0, x1y0, t );
2051 brw_MUL( p, x1y1, x1y1, t );
2052 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
2053 brw_MUL( p, x0y0, x0y0, param0 );
2054 brw_MUL( p, x0y1, x0y1, param0 );
2055
2056 /* y component */
2057 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2058 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2059 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2060 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2061
2062 brw_push_insn_state( p );
2063 brw_set_mask_control( p, BRW_MASK_DISABLE );
2064 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2065 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2066 brw_pop_insn_state( p );
2067
2068 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2069 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2070 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
2071 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
2072 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
2073
2074 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2075 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2076 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2077 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2078
2079 /* z component */
2080 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2081 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2082 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2083 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2084
2085 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2086 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2087 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2088 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2089
2090 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2091 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2092 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2093 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2094
2095 /* The interpolation coefficients are still around from last time, so
2096 again interpolate in the y dimension... */
2097 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2098 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2099 brw_MUL( p, x0y1, x0y1, yi );
2100 brw_MUL( p, x1y1, x1y1, yi );
2101 brw_ADD( p, x0y0, x0y0, x0y1 );
2102 brw_ADD( p, x1y0, x1y0, x1y1 );
2103
2104 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2105 time put the front face in tmp[ 1 ] and we're nearly there... */
2106 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2107 brw_MUL( p, x1y0, x1y0, xi );
2108 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2109
2110 /* The final interpolation, in the z dimension: */
2111 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2112 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
2113 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2114
2115 /* scale by pow( 2, -15 ), as described above */
2116 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2117
2118 release_tmps( c, mark );
2119 }
2120
2121 static void emit_noise3( struct brw_wm_compile *c,
2122 const struct prog_instruction *inst )
2123 {
2124 struct brw_compile *p = &c->func;
2125 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
2126 GLuint mask = inst->DstReg.WriteMask;
2127 int i;
2128 int mark = mark_tmps( c );
2129
2130 assert( mark == 0 );
2131
2132 src0 = get_src_reg( c, inst, 0, 0 );
2133 src1 = get_src_reg( c, inst, 0, 1 );
2134 src2 = get_src_reg( c, inst, 0, 2 );
2135
2136 param0 = alloc_tmp( c );
2137 param1 = alloc_tmp( c );
2138 param2 = alloc_tmp( c );
2139
2140 brw_MOV( p, param0, src0 );
2141 brw_MOV( p, param1, src1 );
2142 brw_MOV( p, param2, src2 );
2143
2144 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2145
2146 /* Fill in the result: */
2147 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2148 for (i = 0 ; i < 4; i++) {
2149 if (mask & (1<<i)) {
2150 dst = get_dst_reg(c, inst, i);
2151 brw_MOV( p, dst, param0 );
2152 }
2153 }
2154 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2155 brw_set_saturate( p, 0 );
2156
2157 release_tmps( c, mark );
2158 }
2159
2160 /**
2161 * For the four-dimensional case, the little micro-optimisation benefits
2162 * we obtain by unrolling all the loops aren't worth the massive bloat it
2163 * now causes. Instead, we loop twice around performing a similar operation
2164 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2165 * code to glue it all together.
2166 */
/* Register contract, as visible in this function and in emit_noise4:
 *   - inputs: the four coordinates are read from the temporaries at indices
 *     mark-5 .. mark-2, where mark is this function's own mark_tmps() value
 *     (presumably the param0..param3 temporaries that emit_noise4 allocates
 *     before invoke_subroutine() -- TODO confirm what occupies mark-1);
 *   - output: the final scaled noise value is written to param[ 0 ];
 *   - param[ 0..3 ] are overwritten in place by FRC along the way.
 * The access mode is set to BRW_ALIGN_1 below and is not changed again
 * within this function. */
2167 static void noise4_sub( struct brw_wm_compile *c )
2168 {
2169 struct brw_compile *p = &c->func;
2170 struct brw_reg param[ 4 ],
2171 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2172 w0, /* noise for the w=0 cube */
2173 floors[ 2 ], /* integer coordinates of base corner of hypercube */
2174 interp[ 4 ], /* interpolation coefficients */
2175 t, tmp[ 8 ], /* float temporaries */
2176 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2177 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2178 int i, j;
2179 int mark = mark_tmps( c );
2180 GLuint loop, origin;
2181
2182 x0y0 = alloc_tmp( c );
2183 x0y1 = alloc_tmp( c );
2184 x1y0 = alloc_tmp( c );
2185 x1y1 = alloc_tmp( c );
2186 t = alloc_tmp( c );
2187 w0 = alloc_tmp( c );
2188 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2189 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2190
/* param[] are looked up rather than allocated: they refer to temporaries
   that already exist below this function's mark.  interp[] are fresh. */
2191 for( i = 0; i < 4; i++ ) {
2192 param[ i ] = lookup_tmp( c, mark - 5 + i );
2193 interp[ i ] = alloc_tmp( c );
2194 }
2195
/* itmp[ i ] and wtmp[ i ] are built from tmp[ i ] (a retype, and a UW16
   view of register number tmp[ i ].nr), so a write through one of the
   three names changes what the other two read. */
2196 for( i = 0; i < 8; i++ ) {
2197 tmp[ i ] = alloc_tmp( c );
2198 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2199 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2200 }
2201
2202 brw_set_access_mode( p, BRW_ALIGN_1 );
2203
2204 /* We only want 16 bits of precision from the integral part of each
2205 co-ordinate, but unfortunately the RNDD semantics would saturate
2206 at 16 bits if we performed the operation directly to a 16-bit
2207 destination. Therefore, we round to 32-bit temporaries where
2208 appropriate, and then store only the lower 16 bits. */
/* Resulting packing: floors[ 0 ] = { low: x, high: y } and
   floors[ 1 ] = { low: z, high: w }, using itmp[ 0 ] / itmp[ 1 ] as
   scratch for the y and w roundings. */
2209 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2210 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2211 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2212 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2213 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2214 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2215
2216 /* Modify the flag register here, because the side effect is useful
2217 later (see below). We know for certain that all flags will be
2218 cleared, since the FRC instruction cannot possibly generate
2219 negative results. Even for exceptional inputs (infinities, denormals,
2220 NaNs), the architecture guarantees that the L conditional is false. */
/* Only the first FRC is emitted with the L conditional modifier requested.
   NOTE(review): the conditional modifier is turned off again via
   brw_set_predicate_control() rather than brw_set_conditionalmod();
   presumably the emitted instruction resets the conditional modifier
   itself -- confirm against brw_eu. */
2221 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2222 brw_FRC( p, param[ 0 ], param[ 0 ] );
2223 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2224 for( i = 1; i < 4; i++ )
2225 brw_FRC( p, param[ i ], param[ i ] );
2226
2227 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2228 of all. */
/* Evaluated as ((6t - 15)t + 10) * t * t * t, with the four coordinates
   interleaved step by step. */
2229 for( i = 0; i < 4; i++ )
2230 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2231 for( i = 0; i < 4; i++ )
2232 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2233 for( i = 0; i < 4; i++ )
2234 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2235 for( i = 0; i < 4; i++ )
2236 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2237 for( j = 0; j < 3; j++ )
2238 for( i = 0; i < 4; i++ )
2239 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2240
2241 /* Mark the current address, as it will be a jump destination. The
2242 following code will be executed twice: first, with the flag
2243 register clear indicating the w=0 case, and second with flags
2244 set for w=1. */
/* loop is an instruction index (p->nr_insn), not a byte address; it is
   turned into a relative IP offset where the backward branch is emitted. */
2245 loop = p->nr_insn;
2246
2247 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2248 be hashed. Since we have only 16 bits of precision in the hash, we
2249 must be careful about thorough mixing to maintain entropy as we
2250 squash the input vector into a small scalar. */
2251 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2252 brw_imm_uw( 0xBC8F ) );
2253 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2254 brw_imm_uw( 0xD0BD ) );
2255 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2256 brw_imm_uw( 0x9B93 ) );
2257 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2258 brw_imm_uw( 0xA359 ) );
2259 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2260 brw_imm_uw( 0xBC8F ) );
2261
2262 /* Temporarily disable the execution mask while we work with ExecSize=16
2263 channels (the mask is set for ExecSize=8 and is probably incorrect).
2264 Although this might cause execution of unwanted channels, the code
2265 writes only to temporary registers and has no side effects, so
2266 disabling the mask is harmless. */
2267 brw_push_insn_state( p );
2268 brw_set_mask_control( p, BRW_MASK_DISABLE );
2269 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2270 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2271 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2272
2273 /* We're now ready to perform the hashing. The eight hashes are
2274 interleaved for performance. The hash function used is
2275 designed to rapidly achieve avalanche and require only 16x16
2276 bit multiplication, and 8-bit swizzles (which we get for
2277 free). */
2278 for( i = 0; i < 4; i++ )
2279 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2280 for( i = 0; i < 4; i++ )
2281 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2282 odd_bytes( wtmp[ i ] ) );
2283 for( i = 0; i < 4; i++ )
2284 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2285 for( i = 0; i < 4; i++ )
2286 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2287 odd_bytes( wtmp[ i ] ) );
2288 brw_pop_insn_state( p );
2289
2290 /* Now we want to initialise the four rear gradients based on the
2291 hashes. Format conversion from signed integer to float leaves
2292 everything scaled too high by a factor of pow( 2, 15 ), but
2293 we correct for that right at the end. */
/* Pattern used for every component below: t holds (coordinate - 1) for the
   "1" corners while the raw fractional coordinate is used for the "0"
   corners, and t is reloaded for the next component as soon as the
   current one has consumed it. */
2294 /* x component */
2295 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2296 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2297 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2298 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2299 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2300
/* Shift the hash words left by 4 bits in place; presumably so that the
   next component's MOVs pick up different hash bits -- confirm. */
2301 brw_push_insn_state( p );
2302 brw_set_mask_control( p, BRW_MASK_DISABLE );
2303 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2304 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2305 brw_pop_insn_state( p );
2306
2307 brw_MUL( p, x1y0, x1y0, t );
2308 brw_MUL( p, x1y1, x1y1, t );
2309 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2310 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2311 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2312
2313 /* y component */
2314 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2315 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2316 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2317 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2318
2319 brw_push_insn_state( p );
2320 brw_set_mask_control( p, BRW_MASK_DISABLE );
2321 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2322 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2323 brw_pop_insn_state( p );
2324
2325 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2326 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2327 /* prepare t for the w component (used below): w the first time through
2328 the loop; w - 1 the second time) */
/* The ADD is emitted predicated on the flag and the MOV on its inverse, so
   t becomes w - 1 where the flag is set and w where it is clear. */
2329 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2330 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2331 p->current->header.predicate_inverse = 1;
2332 brw_MOV( p, t, param[ 3 ] );
2333 p->current->header.predicate_inverse = 0;
2334 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2335 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2336 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2337
2338 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2339 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2340 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2341 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2342
2343 /* z component */
2344 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2345 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2346 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2347 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2348
2349 brw_push_insn_state( p );
2350 brw_set_mask_control( p, BRW_MASK_DISABLE );
2351 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2352 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2353 brw_pop_insn_state( p );
2354
/* Rear face: all four corners use the unmodified z fraction. */
2355 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2356 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2357 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2358 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2359
2360 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2361 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2362 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2363 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2364
2365 /* w component */
2366 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2367 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2368 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2369 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2370
2371 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2372 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2373 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2374 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
/* t is reloaded with x - 1 here, ahead of its use in the x component of
   the front four gradients further down. */
2375 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2376
2377 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2378 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2379 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2380 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2381
2382 /* Here we interpolate in the y dimension... */
2383 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2384 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2385 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2386 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2387 brw_ADD( p, x0y0, x0y0, x0y1 );
2388 brw_ADD( p, x1y0, x1y0, x1y1 );
2389
2390 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
/* tmp[ 0 ] (and with it the hash words in wtmp[ 0 ]) is overwritten here;
   the front gradients below read only tmp[ 2 ] and tmp[ 3 ]. */
2391 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2392 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2393 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2394
2395 /* Now do the same thing for the front four gradients... */
2396 /* x component */
2397 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2398 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2399 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2400 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2401
2402 brw_push_insn_state( p );
2403 brw_set_mask_control( p, BRW_MASK_DISABLE );
2404 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2405 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2406 brw_pop_insn_state( p );
2407
2408 brw_MUL( p, x1y0, x1y0, t );
2409 brw_MUL( p, x1y1, x1y1, t );
2410 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2411 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2412 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2413
2414 /* y component */
2415 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2416 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2417 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2418 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2419
2420 brw_push_insn_state( p );
2421 brw_set_mask_control( p, BRW_MASK_DISABLE );
2422 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2423 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2424 brw_pop_insn_state( p );
2425
2426 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2427 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2428 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2429 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2430 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2431
2432 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2433 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2434 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2435 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2436
2437 /* z component */
2438 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2439 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2440 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2441 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2442
2443 brw_push_insn_state( p );
2444 brw_set_mask_control( p, BRW_MASK_DISABLE );
2445 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2446 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2447 brw_pop_insn_state( p );
2448
/* Front face: all four corners use t, which holds z - 1 at this point. */
2449 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2450 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2451 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2452 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2453 /* prepare t for the w component (used below): w the first time through
2454 the loop; w - 1 the second time) */
2455 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2456 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2457 p->current->header.predicate_inverse = 1;
2458 brw_MOV( p, t, param[ 3 ] );
2459 p->current->header.predicate_inverse = 0;
2460 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2461
2462 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2463 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2464 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2465 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2466
2467 /* w component */
2468 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2469 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2470 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2471 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2472
2473 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2474 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2475 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2476 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2477
2478 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2479 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2480 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2481 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2482
2483 /* Interpolate in the y dimension: */
2484 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2485 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2486 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2487 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2488 brw_ADD( p, x0y0, x0y0, x0y1 );
2489 brw_ADD( p, x1y0, x1y0, x1y1 );
2490
2491 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2492 time put the front face in tmp[ 1 ] and we're nearly there... */
2493 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2494 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2495 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2496
2497 /* Another interpolation, in the z dimension: */
2498 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2499 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2500 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2501
2502 /* Exit the loop if we've computed both cubes... */
/* origin records the index of the predicated IP add emitted next.  Its
   immediate is a placeholder 0 here; the brw_set_src1() call further down
   rewrites it once the first instruction after the loop is known. */
2503 origin = p->nr_insn;
2504 brw_push_insn_state( p );
2505 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2506 brw_set_mask_control( p, BRW_MASK_DISABLE );
2507 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2508 brw_pop_insn_state( p );
2509
2510 /* Save the result for the w=0 case, and increment the w coordinate: */
/* high_words( floors[ 1 ] ) is where the floored w coordinate was stored
   at the top of the function, so the second pass hashes the w+1 cube. */
2511 brw_MOV( p, w0, tmp[ 0 ] );
2512 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2513 brw_imm_uw( 1 ) );
2514
2515 /* Loop around for the other cube. Explicitly set the flag register
2516 (unfortunately we must spend an extra instruction to do this: we
2517 can't rely on a side effect of the previous MOV or ADD because
2518 conditional modifiers which are normally true might be false in
2519 exceptional circumstances, e.g. given a NaN input; the add to
2520 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
/* NOTE(review): loop is a GLuint, so ( loop - p->nr_insn ) is presumably
   evaluated in unsigned arithmetic and the backward (negative) offset
   relies on the wrapped value converting back to a negative signed
   immediate -- confirm.  The << 4 presumably scales an instruction count
   to a byte offset (16 bytes per instruction) -- confirm. */
2521 brw_push_insn_state( p );
2522 brw_set_mask_control( p, BRW_MASK_DISABLE );
2523 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2524 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2525 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2526 brw_pop_insn_state( p );
2527
2528 /* Patch the previous conditional branch now that we know the
2529 destination address. */
2530 brw_set_src1( p->store + origin,
2531 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2532
2533 /* The very last interpolation. */
/* On arrival here tmp[ 0 ] holds the second-pass (w=1) cube result and w0
   the first-pass (w=0) one: result = w0 + ( tmp[ 0 ] - w0 ) * interp[ 3 ]. */
2534 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2535 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2536 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2537
2538 /* scale by pow( 2, -15 ), as described above */
/* The result is returned in param[ 0 ], which lies below this function's
   mark and is therefore not affected by the release_tmps() call below. */
2539 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2540
2541 release_tmps( c, mark );
2542 }
2543
2544 static void emit_noise4( struct brw_wm_compile *c,
2545 const struct prog_instruction *inst )
2546 {
2547 struct brw_compile *p = &c->func;
2548 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2549 GLuint mask = inst->DstReg.WriteMask;
2550 int i;
2551 int mark = mark_tmps( c );
2552
2553 assert( mark == 0 );
2554
2555 src0 = get_src_reg( c, inst, 0, 0 );
2556 src1 = get_src_reg( c, inst, 0, 1 );
2557 src2 = get_src_reg( c, inst, 0, 2 );
2558 src3 = get_src_reg( c, inst, 0, 3 );
2559
2560 param0 = alloc_tmp( c );
2561 param1 = alloc_tmp( c );
2562 param2 = alloc_tmp( c );
2563 param3 = alloc_tmp( c );
2564
2565 brw_MOV( p, param0, src0 );
2566 brw_MOV( p, param1, src1 );
2567 brw_MOV( p, param2, src2 );
2568 brw_MOV( p, param3, src3 );
2569
2570 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2571
2572 /* Fill in the result: */
2573 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2574 for (i = 0 ; i < 4; i++) {
2575 if (mask & (1<<i)) {
2576 dst = get_dst_reg(c, inst, i);
2577 brw_MOV( p, dst, param0 );
2578 }
2579 }
2580 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2581 brw_set_saturate( p, 0 );
2582
2583 release_tmps( c, mark );
2584 }
2585
2586 static void emit_wpos_xy(struct brw_wm_compile *c,
2587 const struct prog_instruction *inst)
2588 {
2589 struct brw_compile *p = &c->func;
2590 GLuint mask = inst->DstReg.WriteMask;
2591 struct brw_reg src0[2], dst[2];
2592
2593 dst[0] = get_dst_reg(c, inst, 0);
2594 dst[1] = get_dst_reg(c, inst, 1);
2595
2596 src0[0] = get_src_reg(c, inst, 0, 0);
2597 src0[1] = get_src_reg(c, inst, 0, 1);
2598
2599 /* Calculate the pixel offset from window bottom left into destination
2600 * X and Y channels.
2601 */
2602 if (mask & WRITEMASK_X) {
2603 /* X' = X - origin_x */
2604 brw_ADD(p,
2605 dst[0],
2606 retype(src0[0], BRW_REGISTER_TYPE_W),
2607 brw_imm_d(0 - c->key.origin_x));
2608 }
2609
2610 if (mask & WRITEMASK_Y) {
2611 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2612 brw_ADD(p,
2613 dst[1],
2614 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2615 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2616 }
2617 }
2618
2619 /* TODO
2620 BIAS on SIMD8 not working yet...
2621 */
2622 static void emit_txb(struct brw_wm_compile *c,
2623 const struct prog_instruction *inst)
2624 {
2625 struct brw_compile *p = &c->func;
2626 struct brw_reg dst[4], src[4], payload_reg;
2627 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2628 GLuint i;
2629 GLuint msg_type;
2630
2631 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2632
2633 for (i = 0; i < 4; i++)
2634 dst[i] = get_dst_reg(c, inst, i);
2635 for (i = 0; i < 4; i++)
2636 src[i] = get_src_reg(c, inst, 0, i);
2637
2638 switch (inst->TexSrcTarget) {
2639 case TEXTURE_1D_INDEX:
2640 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2641 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2642 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2643 break;
2644 case TEXTURE_2D_INDEX:
2645 case TEXTURE_RECT_INDEX:
2646 brw_MOV(p, brw_message_reg(2), src[0]);
2647 brw_MOV(p, brw_message_reg(3), src[1]);
2648 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2649 break;
2650 default:
2651 brw_MOV(p, brw_message_reg(2), src[0]);
2652 brw_MOV(p, brw_message_reg(3), src[1]);
2653 brw_MOV(p, brw_message_reg(4), src[2]);
2654 break;
2655 }
2656 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2657 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2658
2659 if (BRW_IS_IGDNG(p->brw)) {
2660 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2661 } else {
2662 /* Does it work well on SIMD8? */
2663 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2664 }
2665
2666 brw_SAMPLE(p,
2667 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2668 1, /* msg_reg_nr */
2669 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2670 SURF_INDEX_TEXTURE(unit),
2671 unit, /* sampler */
2672 inst->DstReg.WriteMask, /* writemask */
2673 msg_type, /* msg_type */
2674 4, /* response_length */
2675 4, /* msg_length */
2676 0, /* eot */
2677 1,
2678 BRW_SAMPLER_SIMD_MODE_SIMD8);
2679 }
2680
2681
2682 static void emit_tex(struct brw_wm_compile *c,
2683 const struct prog_instruction *inst)
2684 {
2685 struct brw_compile *p = &c->func;
2686 struct brw_reg dst[4], src[4], payload_reg;
2687 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2688 GLuint msg_len;
2689 GLuint i, nr;
2690 GLuint emit;
2691 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2692 GLuint msg_type;
2693
2694 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2695
2696 for (i = 0; i < 4; i++)
2697 dst[i] = get_dst_reg(c, inst, i);
2698 for (i = 0; i < 4; i++)
2699 src[i] = get_src_reg(c, inst, 0, i);
2700
2701 switch (inst->TexSrcTarget) {
2702 case TEXTURE_1D_INDEX:
2703 emit = WRITEMASK_X;
2704 nr = 1;
2705 break;
2706 case TEXTURE_2D_INDEX:
2707 case TEXTURE_RECT_INDEX:
2708 emit = WRITEMASK_XY;
2709 nr = 2;
2710 break;
2711 default:
2712 emit = WRITEMASK_XYZ;
2713 nr = 3;
2714 break;
2715 }
2716 msg_len = 1;
2717
2718 /* move/load S, T, R coords */
2719 for (i = 0; i < nr; i++) {
2720 static const GLuint swz[4] = {0,1,2,2};
2721 if (emit & (1<<i))
2722 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2723 else
2724 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2725 msg_len += 1;
2726 }
2727
2728 if (shadow) {
2729 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2730 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2731 }
2732
2733 if (BRW_IS_IGDNG(p->brw)) {
2734 if (shadow)
2735 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2736 else
2737 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2738 } else {
2739 /* Does it work for shadow on SIMD8 ? */
2740 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2741 }
2742
2743 brw_SAMPLE(p,
2744 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2745 1, /* msg_reg_nr */
2746 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2747 SURF_INDEX_TEXTURE(unit),
2748 unit, /* sampler */
2749 inst->DstReg.WriteMask, /* writemask */
2750 msg_type, /* msg_type */
2751 4, /* response_length */
2752 shadow ? 6 : 4, /* msg_length */
2753 0, /* eot */
2754 1,
2755 BRW_SAMPLER_SIMD_MODE_SIMD8);
2756
2757 if (shadow)
2758 brw_MOV(p, dst[3], brw_imm_f(1.0));
2759 }
2760
2761
2762 /**
2763 * Resolve subroutine calls after code emit is done.
2764 */
2765 static void post_wm_emit( struct brw_wm_compile *c )
2766 {
2767 brw_resolve_cals(&c->func);
2768 }
2769
2770 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2771 {
2772 #define MAX_IF_DEPTH 32
2773 #define MAX_LOOP_DEPTH 32
2774 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2775 GLuint i, if_depth = 0, loop_depth = 0;
2776 struct brw_compile *p = &c->func;
2777 struct brw_indirect stack_index = brw_indirect(0, 0);
2778
2779 c->out_of_regs = GL_FALSE;
2780
2781 prealloc_reg(c);
2782 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2783 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2784
2785 for (i = 0; i < c->nr_fp_insns; i++) {
2786 const struct prog_instruction *inst = &c->prog_instructions[i];
2787
2788 c->cur_inst = i;
2789
2790 #if 0
2791 _mesa_printf("Inst %d: ", i);
2792 _mesa_print_instruction(inst);
2793 #endif
2794
2795 /* fetch any constants that this instruction needs */
2796 if (c->fp->use_const_buffer)
2797 fetch_constants(c, inst);
2798
2799 if (inst->CondUpdate)
2800 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2801 else
2802 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2803
2804 switch (inst->Opcode) {
2805 case WM_PIXELXY:
2806 emit_pixel_xy(c, inst);
2807 break;
2808 case WM_DELTAXY:
2809 emit_delta_xy(c, inst);
2810 break;
2811 case WM_PIXELW:
2812 emit_pixel_w(c, inst);
2813 break;
2814 case WM_LINTERP:
2815 emit_linterp(c, inst);
2816 break;
2817 case WM_PINTERP:
2818 emit_pinterp(c, inst);
2819 break;
2820 case WM_CINTERP:
2821 emit_cinterp(c, inst);
2822 break;
2823 case WM_WPOSXY:
2824 emit_wpos_xy(c, inst);
2825 break;
2826 case WM_FB_WRITE:
2827 emit_fb_write(c, inst);
2828 break;
2829 case WM_FRONTFACING:
2830 emit_frontfacing(c, inst);
2831 break;
2832 case OPCODE_ABS:
2833 emit_abs(c, inst);
2834 break;
2835 case OPCODE_ADD:
2836 emit_add(c, inst);
2837 break;
2838 case OPCODE_ARL:
2839 emit_arl(c, inst);
2840 break;
2841 case OPCODE_SUB:
2842 emit_sub(c, inst);
2843 break;
2844 case OPCODE_FRC:
2845 emit_frc(c, inst);
2846 break;
2847 case OPCODE_FLR:
2848 emit_flr(c, inst);
2849 break;
2850 case OPCODE_LRP:
2851 emit_lrp(c, inst);
2852 break;
2853 case OPCODE_TRUNC:
2854 emit_trunc(c, inst);
2855 break;
2856 case OPCODE_MOV:
2857 case OPCODE_SWZ:
2858 emit_mov(c, inst);
2859 break;
2860 case OPCODE_DP3:
2861 emit_dp3(c, inst);
2862 break;
2863 case OPCODE_DP4:
2864 emit_dp4(c, inst);
2865 break;
2866 case OPCODE_XPD:
2867 emit_xpd(c, inst);
2868 break;
2869 case OPCODE_DPH:
2870 emit_dph(c, inst);
2871 break;
2872 case OPCODE_RCP:
2873 emit_rcp(c, inst);
2874 break;
2875 case OPCODE_RSQ:
2876 emit_rsq(c, inst);
2877 break;
2878 case OPCODE_SIN:
2879 emit_sin(c, inst);
2880 break;
2881 case OPCODE_COS:
2882 emit_cos(c, inst);
2883 break;
2884 case OPCODE_EX2:
2885 emit_ex2(c, inst);
2886 break;
2887 case OPCODE_LG2:
2888 emit_lg2(c, inst);
2889 break;
2890 case OPCODE_MIN:
2891 case OPCODE_MAX:
2892 emit_min_max(c, inst);
2893 break;
2894 case OPCODE_DDX:
2895 emit_ddx(c, inst);
2896 break;
2897 case OPCODE_DDY:
2898 emit_ddy(c, inst);
2899 break;
2900 case OPCODE_SLT:
2901 emit_slt(c, inst);
2902 break;
2903 case OPCODE_SLE:
2904 emit_sle(c, inst);
2905 break;
2906 case OPCODE_SGT:
2907 emit_sgt(c, inst);
2908 break;
2909 case OPCODE_SGE:
2910 emit_sge(c, inst);
2911 break;
2912 case OPCODE_SEQ:
2913 emit_seq(c, inst);
2914 break;
2915 case OPCODE_SNE:
2916 emit_sne(c, inst);
2917 break;
2918 case OPCODE_MUL:
2919 emit_mul(c, inst);
2920 break;
2921 case OPCODE_POW:
2922 emit_pow(c, inst);
2923 break;
2924 case OPCODE_MAD:
2925 emit_mad(c, inst);
2926 break;
2927 case OPCODE_NOISE1:
2928 emit_noise1(c, inst);
2929 break;
2930 case OPCODE_NOISE2:
2931 emit_noise2(c, inst);
2932 break;
2933 case OPCODE_NOISE3:
2934 emit_noise3(c, inst);
2935 break;
2936 case OPCODE_NOISE4:
2937 emit_noise4(c, inst);
2938 break;
2939 case OPCODE_TEX:
2940 emit_tex(c, inst);
2941 break;
2942 case OPCODE_TXB:
2943 emit_txb(c, inst);
2944 break;
2945 case OPCODE_KIL_NV:
2946 emit_kil(c);
2947 break;
2948 case OPCODE_IF:
2949 assert(if_depth < MAX_IF_DEPTH);
2950 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2951 break;
2952 case OPCODE_ELSE:
2953 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2954 break;
2955 case OPCODE_ENDIF:
2956 assert(if_depth > 0);
2957 brw_ENDIF(p, if_inst[--if_depth]);
2958 break;
2959 case OPCODE_BGNSUB:
2960 brw_save_label(p, inst->Comment, p->nr_insn);
2961 break;
2962 case OPCODE_ENDSUB:
2963 /* no-op */
2964 break;
2965 case OPCODE_CAL:
2966 brw_push_insn_state(p);
2967 brw_set_mask_control(p, BRW_MASK_DISABLE);
2968 brw_set_access_mode(p, BRW_ALIGN_1);
2969 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2970 brw_set_access_mode(p, BRW_ALIGN_16);
2971 brw_ADD(p, get_addr_reg(stack_index),
2972 get_addr_reg(stack_index), brw_imm_d(4));
2973 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2974 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2975 brw_pop_insn_state(p);
2976 break;
2977
2978 case OPCODE_RET:
2979 brw_push_insn_state(p);
2980 brw_set_mask_control(p, BRW_MASK_DISABLE);
2981 brw_ADD(p, get_addr_reg(stack_index),
2982 get_addr_reg(stack_index), brw_imm_d(-4));
2983 brw_set_access_mode(p, BRW_ALIGN_1);
2984 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2985 brw_set_access_mode(p, BRW_ALIGN_16);
2986 brw_pop_insn_state(p);
2987
2988 break;
2989 case OPCODE_BGNLOOP:
2990 /* XXX may need to invalidate the current_constant regs */
2991 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2992 break;
2993 case OPCODE_BRK:
2994 brw_BREAK(p);
2995 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2996 break;
2997 case OPCODE_CONT:
2998 brw_CONT(p);
2999 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3000 break;
3001 case OPCODE_ENDLOOP:
3002 {
3003 struct brw_instruction *inst0, *inst1;
3004 GLuint br = 1;
3005
3006 if (BRW_IS_IGDNG(brw))
3007 br = 2;
3008
3009 loop_depth--;
3010 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
3011 /* patch all the BREAK/CONT instructions from last BGNLOOP */
3012 while (inst0 > loop_inst[loop_depth]) {
3013 inst0--;
3014 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
3015 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3016 inst0->bits3.if_else.pop_count = 0;
3017 }
3018 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
3019 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3020 inst0->bits3.if_else.pop_count = 0;
3021 }
3022 }
3023 }
3024 break;
3025 default:
3026 _mesa_printf("unsupported IR in fragment shader %d\n",
3027 inst->Opcode);
3028 }
3029
3030 if (inst->CondUpdate)
3031 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
3032 else
3033 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3034 }
3035 post_wm_emit(c);
3036
3037 if (INTEL_DEBUG & DEBUG_WM) {
3038 _mesa_printf("wm-native:\n");
3039 for (i = 0; i < p->nr_insn; i++)
3040 brw_disasm(stderr, &p->store[i]);
3041 _mesa_printf("\n");
3042 }
3043 }
3044
3045
3046 /**
3047 * Do GPU code generation for shaders that use GLSL features such as
3048 * flow control. Other shaders will be compiled with the
3049 */
3050 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
3051 {
3052 if (INTEL_DEBUG & DEBUG_WM) {
3053 _mesa_printf("brw_wm_glsl_emit:\n");
3054 }
3055
3056 /* initial instruction translation/simplification */
3057 brw_wm_pass_fp(c);
3058
3059 /* actual code generation */
3060 brw_wm_emit_glsl(brw, c);
3061
3062 if (INTEL_DEBUG & DEBUG_WM) {
3063 brw_wm_print_program(c, "brw_wm_glsl_emit done");
3064 }
3065
3066 c->prog_data.total_grf = num_grf_used(c);
3067 c->prog_data.total_scratch = 0;
3068 }