i965: new SURF_INDEX_ macros
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "brw_context.h"
4 #include "brw_eu.h"
5 #include "brw_wm.h"
6
/* Identifiers for the instruction sequences that invoke_subroutine()
 * can emit once and re-enter later; named after the NOISE1..NOISE4 opcodes.
 */
enum _subroutine {
    SUB_NOISE1,
    SUB_NOISE2,
    SUB_NOISE3,
    SUB_NOISE4
};
10
11
12 /**
13 * Determine if the given fragment program uses GLSL features such
14 * as flow conditionals, loops, subroutines.
15 * Some GLSL shaders may use these features, others might not.
16 */
17 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
18 {
19 int i;
20 for (i = 0; i < fp->Base.NumInstructions; i++) {
21 const struct prog_instruction *inst = &fp->Base.Instructions[i];
22 switch (inst->Opcode) {
23 case OPCODE_IF:
24 case OPCODE_TRUNC:
25 case OPCODE_ENDIF:
26 case OPCODE_CAL:
27 case OPCODE_BRK:
28 case OPCODE_RET:
29 case OPCODE_DDX:
30 case OPCODE_DDY:
31 case OPCODE_NOISE1:
32 case OPCODE_NOISE2:
33 case OPCODE_NOISE3:
34 case OPCODE_NOISE4:
35 case OPCODE_BGNLOOP:
36 return GL_TRUE;
37 default:
38 break;
39 }
40 }
41 return GL_FALSE;
42 }
43
44
45 /**
46 * Record the mapping of a Mesa register to a hardware register.
47 */
48 static void set_reg(struct brw_wm_compile *c, int file, int index,
49 int component, struct brw_reg reg)
50 {
51 c->wm_regs[file][index][component].reg = reg;
52 c->wm_regs[file][index][component].inited = GL_TRUE;
53 }
54
55 /**
56 * Examine instruction's write mask to find index of first component
57 * enabled for writing.
58 */
59 static int get_scalar_dst_index(struct prog_instruction *inst)
60 {
61 int i;
62 for (i = 0; i < 4; i++)
63 if (inst->DstReg.WriteMask & (1<<i))
64 break;
65 return i;
66 }
67
68 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
69 {
70 struct brw_reg reg;
71 if(c->tmp_index == c->tmp_max)
72 c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
73
74 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
75 return reg;
76 }
77
78 /**
79 * Save current temp register info.
80 * There must be a matching call to release_tmps().
81 */
82 static int mark_tmps(struct brw_wm_compile *c)
83 {
84 return c->tmp_index;
85 }
86
87 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
88 {
89 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
90 }
91
92 static void release_tmps(struct brw_wm_compile *c, int mark)
93 {
94 c->tmp_index = mark;
95 }
96
97 /**
98 * Convert Mesa src register to brw register.
99 *
100 * Since we're running in SOA mode each Mesa register corresponds to four
101 * hardware registers. We allocate the hardware registers as needed here.
102 *
103 * \param file register file, one of PROGRAM_x
104 * \param index register number
105 * \param component src component (X=0, Y=1, Z=2, W=3)
106 * \param nr not used?!?
107 * \param neg negate value?
108 * \param abs take absolute value?
109 */
110 static struct brw_reg
111 get_reg(struct brw_wm_compile *c, int file, int index, int component,
112 int nr, GLuint neg, GLuint abs)
113 {
114 struct brw_reg reg;
115 switch (file) {
116 case PROGRAM_STATE_VAR:
117 case PROGRAM_CONSTANT:
118 case PROGRAM_UNIFORM:
119 file = PROGRAM_STATE_VAR;
120 break;
121 case PROGRAM_UNDEFINED:
122 return brw_null_reg();
123 case PROGRAM_TEMPORARY:
124 case PROGRAM_INPUT:
125 case PROGRAM_OUTPUT:
126 case PROGRAM_PAYLOAD:
127 break;
128 default:
129 _mesa_problem(NULL, "Unexpected file in get_reg()");
130 return brw_null_reg();
131 }
132
133 /* see if we've already allocated a HW register for this Mesa register */
134 if (c->wm_regs[file][index][component].inited) {
135 /* yes, re-use */
136 reg = c->wm_regs[file][index][component].reg;
137 }
138 else {
139 /* no, allocate new register */
140 reg = brw_vec8_grf(c->reg_index, 0);
141 }
142
143 /* if this is a new register allocation, record it in the table */
144 if (!c->wm_regs[file][index][component].inited) {
145 set_reg(c, file, index, component, reg);
146 c->reg_index++;
147 }
148
149 if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
150 /* ran out of temporary registers! */
151 #if 1
152 /* This is a big hack for now.
153 * Return bad register index, just don't hang the GPU.
154 */
155 _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
156 c->reg_index = BRW_WM_MAX_GRF - 13;
157 #else
158 return brw_null_reg();
159 #endif
160 }
161
162 if (neg & (1 << component)) {
163 reg = negate(reg);
164 }
165 if (abs)
166 reg = brw_abs(reg);
167 return reg;
168 }
169
170
171 /**
172 * Preallocate registers. This sets up the Mesa to hardware register
173 * mapping for certain registers, such as constants (uniforms/state vars)
174 * and shader inputs.
175 */
176 static void prealloc_reg(struct brw_wm_compile *c)
177 {
178 int i, j;
179 struct brw_reg reg;
180 int nr_interp_regs = 0;
181 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
182
183 for (i = 0; i < 4; i++) {
184 if (i < c->key.nr_depth_regs)
185 reg = brw_vec8_grf(i * 2, 0);
186 else
187 reg = brw_vec8_grf(0, 0);
188 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
189 }
190 c->reg_index += 2 * c->key.nr_depth_regs;
191
192 /* constants */
193 {
194 const int nr_params = c->fp->program.Base.Parameters->NumParameters;
195
196 /* use a real constant buffer, or just use a section of the GRF? */
197 c->use_const_buffer = GL_FALSE; /* (nr_params > 8);*/
198
199 if (c->use_const_buffer) {
200 /* We'll use a real constant buffer and fetch constants from
201 * it with a dataport read message.
202 */
203
204 /* number of float constants in CURBE */
205 c->prog_data.nr_params = 0;
206 }
207 else {
208 const struct gl_program_parameter_list *plist =
209 c->fp->program.Base.Parameters;
210 int index = 0;
211
212 /* number of float constants in CURBE */
213 c->prog_data.nr_params = 4 * nr_params;
214
215 /* loop over program constants (float[4]) */
216 for (i = 0; i < nr_params; i++) {
217 /* loop over XYZW channels */
218 for (j = 0; j < 4; j++, index++) {
219 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
220 /* Save pointer to parameter/constant value.
221 * Constants will be copied in prepare_constant_buffer()
222 */
223 c->prog_data.param[index] = &plist->ParameterValues[i][j];
224 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
225 }
226 }
227 /* number of constant regs used (each reg is float[8]) */
228 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
229 c->reg_index += c->nr_creg;
230 }
231 }
232
233 /* fragment shader inputs */
234 for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
235 if (inputs & (1<<i)) {
236 nr_interp_regs++;
237 reg = brw_vec8_grf(c->reg_index, 0);
238 for (j = 0; j < 4; j++)
239 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
240 c->reg_index += 2;
241 }
242 }
243
244 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
245 c->prog_data.urb_read_length = nr_interp_regs * 2;
246 c->prog_data.curb_read_length = c->nr_creg;
247 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
248 c->reg_index++;
249 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
250 c->reg_index += 2;
251
252 /* An instruction may reference up to three constants.
253 * They'll be found in these registers.
254 * XXX alloc these on demand!
255 */
256 if (c->use_const_buffer) {
257 for (i = 0; i < 3; i++) {
258 c->current_const[i].index = -1;
259 c->current_const[i].reg = alloc_tmp(c);
260 }
261 }
262 #if 0
263 printf("USE CONST BUFFER? %d\n", c->use_const_buffer);
264 printf("AFTER PRE_ALLOC, reg_index = %d\n", c->reg_index);
265 #endif
266 }
267
268
269 /**
270 * Check if any of the instruction's src registers are constants, uniforms,
271 * or statevars. If so, fetch any constants that we don't already have in
272 * the three GRF slots.
273 */
274 static void fetch_constants(struct brw_wm_compile *c,
275 const struct prog_instruction *inst)
276 {
277 struct brw_compile *p = &c->func;
278 GLuint i;
279
280 /* loop over instruction src regs */
281 for (i = 0; i < 3; i++) {
282 const struct prog_src_register *src = &inst->SrcReg[i];
283 if (src->File == PROGRAM_STATE_VAR ||
284 src->File == PROGRAM_CONSTANT ||
285 src->File == PROGRAM_UNIFORM) {
286 if (c->current_const[i].index != src->Index) {
287 c->current_const[i].index = src->Index;
288
289 #if 0
290 printf(" fetch const[%d] for arg %d into reg %d\n",
291 src->Index, i, c->current_const[i].reg.nr);
292 #endif
293
294 /* need to fetch the constant now */
295 brw_dp_READ_4(p,
296 c->current_const[i].reg, /* writeback dest */
297 1, /* msg_reg */
298 src->RelAddr, /* relative indexing? */
299 16 * src->Index, /* byte offset */
300 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
301 );
302 }
303 }
304 }
305 }
306
307
308 /**
309 * Convert Mesa dst register to brw register.
310 */
311 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
312 const struct prog_instruction *inst,
313 GLuint component)
314 {
315 const int nr = 1;
316 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
317 0, 0);
318 }
319
320
321 static struct brw_reg
322 get_src_reg_const(struct brw_wm_compile *c,
323 const struct prog_instruction *inst,
324 GLuint srcRegIndex, GLuint component)
325 {
326 /* We should have already fetched the constant from the constant
327 * buffer in fetch_constants(). Now we just have to return a
328 * register description that extracts the needed component and
329 * smears it across all eight vector components.
330 */
331 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
332 struct brw_reg const_reg;
333
334 assert(component < 4);
335 assert(srcRegIndex < 3);
336 assert(c->current_const[srcRegIndex].index != -1);
337 const_reg = c->current_const[srcRegIndex].reg;
338
339 /* extract desired float from the const_reg, and smear */
340 const_reg = stride(const_reg, 0, 1, 0);
341 const_reg.subnr = component * 4;
342
343 if (src->NegateBase)
344 const_reg = negate(const_reg);
345 if (src->Abs)
346 const_reg = brw_abs(const_reg);
347
348 #if 0
349 printf(" form const[%d] for arg %d, comp %d, reg %d\n",
350 c->current_const[srcRegIndex].index,
351 srcRegIndex,
352 component,
353 const_reg.nr);
354 #endif
355
356 return const_reg;
357 }
358
359
360 /**
361 * Convert Mesa src register to brw register.
362 */
363 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
364 const struct prog_instruction *inst,
365 GLuint srcRegIndex, GLuint channel)
366 {
367 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
368 const GLuint nr = 1;
369 const GLuint component = GET_SWZ(src->Swizzle, channel);
370
371 if (c->use_const_buffer &&
372 (src->File == PROGRAM_STATE_VAR ||
373 src->File == PROGRAM_CONSTANT ||
374 src->File == PROGRAM_UNIFORM)) {
375 return get_src_reg_const(c, inst, srcRegIndex, component);
376 }
377 else {
378 /* other type of source register */
379 return get_reg(c, src->File, src->Index, component, nr,
380 src->NegateBase, src->Abs);
381 }
382 }
383
384
385 /**
386 * Same as \sa get_src_reg() but if the register is a literal, emit
387 * a brw_reg encoding the literal.
388 * Note that a brw instruction only allows one src operand to be a literal.
389 * For instructions with more than one operand, only the second can be a
390 * literal. This means that we treat some literals as constants/uniforms
391 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
392 *
393 */
394 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
395 const struct prog_instruction *inst,
396 GLuint srcRegIndex, GLuint channel)
397 {
398 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
399 if (src->File == PROGRAM_CONSTANT) {
400 /* a literal */
401 const int component = GET_SWZ(src->Swizzle, channel);
402 const GLfloat *param =
403 c->fp->program.Base.Parameters->ParameterValues[src->Index];
404 GLfloat value = param[component];
405 if (src->NegateBase)
406 value = -value;
407 if (src->Abs)
408 value = FABSF(value);
409 #if 0
410 printf(" form imm reg %f\n", value);
411 #endif
412 return brw_imm_f(value);
413 }
414 else {
415 return get_src_reg(c, inst, srcRegIndex, channel);
416 }
417 }
418
419
420 /**
421 * Subroutines are minimal support for resusable instruction sequences.
422 * They are implemented as simply as possible to minimise overhead: there
423 * is no explicit support for communication between the caller and callee
424 * other than saving the return address in a temporary register, nor is
425 * there any automatic local storage. This implies that great care is
426 * required before attempting reentrancy or any kind of nested
427 * subroutine invocations.
428 */
429 static void invoke_subroutine( struct brw_wm_compile *c,
430 enum _subroutine subroutine,
431 void (*emit)( struct brw_wm_compile * ) )
432 {
433 struct brw_compile *p = &c->func;
434
435 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
436
437 if( c->subroutines[ subroutine ] ) {
438 /* subroutine previously emitted: reuse existing instructions */
439
440 int mark = mark_tmps( c );
441 struct brw_reg return_address = retype( alloc_tmp( c ),
442 BRW_REGISTER_TYPE_UD );
443 int here = p->nr_insn;
444
445 brw_push_insn_state(p);
446 brw_set_mask_control(p, BRW_MASK_DISABLE);
447 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
448
449 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
450 brw_imm_d( ( c->subroutines[ subroutine ] -
451 here - 1 ) << 4 ) );
452 brw_pop_insn_state(p);
453
454 release_tmps( c, mark );
455 } else {
456 /* previously unused subroutine: emit, and mark for later reuse */
457
458 int mark = mark_tmps( c );
459 struct brw_reg return_address = retype( alloc_tmp( c ),
460 BRW_REGISTER_TYPE_UD );
461 struct brw_instruction *calc;
462 int base = p->nr_insn;
463
464 brw_push_insn_state(p);
465 brw_set_mask_control(p, BRW_MASK_DISABLE);
466 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
467 brw_pop_insn_state(p);
468
469 c->subroutines[ subroutine ] = p->nr_insn;
470
471 emit( c );
472
473 brw_push_insn_state(p);
474 brw_set_mask_control(p, BRW_MASK_DISABLE);
475 brw_MOV( p, brw_ip_reg(), return_address );
476 brw_pop_insn_state(p);
477
478 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
479
480 release_tmps( c, mark );
481 }
482 }
483
484 static void emit_abs( struct brw_wm_compile *c,
485 struct prog_instruction *inst)
486 {
487 int i;
488 struct brw_compile *p = &c->func;
489 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
490 for (i = 0; i < 4; i++) {
491 if (inst->DstReg.WriteMask & (1<<i)) {
492 struct brw_reg src, dst;
493 dst = get_dst_reg(c, inst, i);
494 src = get_src_reg(c, inst, 0, i);
495 brw_MOV(p, dst, brw_abs(src));
496 }
497 }
498 brw_set_saturate(p, 0);
499 }
500
501 static void emit_trunc( struct brw_wm_compile *c,
502 struct prog_instruction *inst)
503 {
504 int i;
505 struct brw_compile *p = &c->func;
506 GLuint mask = inst->DstReg.WriteMask;
507 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
508 for (i = 0; i < 4; i++) {
509 if (mask & (1<<i)) {
510 struct brw_reg src, dst;
511 dst = get_dst_reg(c, inst, i);
512 src = get_src_reg(c, inst, 0, i);
513 brw_RNDZ(p, dst, src);
514 }
515 }
516 brw_set_saturate(p, 0);
517 }
518
519 static void emit_mov( struct brw_wm_compile *c,
520 struct prog_instruction *inst)
521 {
522 int i;
523 struct brw_compile *p = &c->func;
524 GLuint mask = inst->DstReg.WriteMask;
525 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
526 for (i = 0; i < 4; i++) {
527 if (mask & (1<<i)) {
528 struct brw_reg src, dst;
529 dst = get_dst_reg(c, inst, i);
530 src = get_src_reg_imm(c, inst, 0, i);
531 brw_MOV(p, dst, src);
532 }
533 }
534 brw_set_saturate(p, 0);
535 }
536
537 static void emit_pixel_xy(struct brw_wm_compile *c,
538 struct prog_instruction *inst)
539 {
540 struct brw_reg r1 = brw_vec1_grf(1, 0);
541 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
542
543 struct brw_reg dst0, dst1;
544 struct brw_compile *p = &c->func;
545 GLuint mask = inst->DstReg.WriteMask;
546
547 dst0 = get_dst_reg(c, inst, 0);
548 dst1 = get_dst_reg(c, inst, 1);
549 /* Calculate pixel centers by adding 1 or 0 to each of the
550 * micro-tile coordinates passed in r1.
551 */
552 if (mask & WRITEMASK_X) {
553 brw_ADD(p,
554 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
555 stride(suboffset(r1_uw, 4), 2, 4, 0),
556 brw_imm_v(0x10101010));
557 }
558
559 if (mask & WRITEMASK_Y) {
560 brw_ADD(p,
561 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
562 stride(suboffset(r1_uw, 5), 2, 4, 0),
563 brw_imm_v(0x11001100));
564 }
565 }
566
567 static void emit_delta_xy(struct brw_wm_compile *c,
568 struct prog_instruction *inst)
569 {
570 struct brw_reg r1 = brw_vec1_grf(1, 0);
571 struct brw_reg dst0, dst1, src0, src1;
572 struct brw_compile *p = &c->func;
573 GLuint mask = inst->DstReg.WriteMask;
574
575 dst0 = get_dst_reg(c, inst, 0);
576 dst1 = get_dst_reg(c, inst, 1);
577 src0 = get_src_reg(c, inst, 0, 0);
578 src1 = get_src_reg(c, inst, 0, 1);
579 /* Calc delta X,Y by subtracting origin in r1 from the pixel
580 * centers.
581 */
582 if (mask & WRITEMASK_X) {
583 brw_ADD(p,
584 dst0,
585 retype(src0, BRW_REGISTER_TYPE_UW),
586 negate(r1));
587 }
588
589 if (mask & WRITEMASK_Y) {
590 brw_ADD(p,
591 dst1,
592 retype(src1, BRW_REGISTER_TYPE_UW),
593 negate(suboffset(r1,1)));
594
595 }
596 }
597
598 static void fire_fb_write( struct brw_wm_compile *c,
599 GLuint base_reg,
600 GLuint nr,
601 GLuint target,
602 GLuint eot)
603 {
604 struct brw_compile *p = &c->func;
605 /* Pass through control information:
606 */
607 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
608 {
609 brw_push_insn_state(p);
610 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
611 brw_MOV(p,
612 brw_message_reg(base_reg + 1),
613 brw_vec8_grf(1, 0));
614 brw_pop_insn_state(p);
615 }
616 /* Send framebuffer write message: */
617 brw_fb_WRITE(p,
618 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
619 base_reg,
620 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
621 target,
622 nr,
623 0,
624 eot);
625 }
626
627 static void emit_fb_write(struct brw_wm_compile *c,
628 struct prog_instruction *inst)
629 {
630 struct brw_compile *p = &c->func;
631 int nr = 2;
632 int channel;
633 GLuint target, eot;
634 struct brw_reg src0;
635
636 /* Reserve a space for AA - may not be needed:
637 */
638 if (c->key.aa_dest_stencil_reg)
639 nr += 1;
640
641 brw_push_insn_state(p);
642 for (channel = 0; channel < 4; channel++) {
643 src0 = get_src_reg(c, inst, 0, channel);
644 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
645 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
646 brw_MOV(p, brw_message_reg(nr + channel), src0);
647 }
648 /* skip over the regs populated above: */
649 nr += 8;
650 brw_pop_insn_state(p);
651
652 if (c->key.source_depth_to_render_target) {
653 if (c->key.computes_depth) {
654 src0 = get_src_reg(c, inst, 2, 2);
655 brw_MOV(p, brw_message_reg(nr), src0);
656 }
657 else {
658 src0 = get_src_reg(c, inst, 1, 1);
659 brw_MOV(p, brw_message_reg(nr), src0);
660 }
661
662 nr += 2;
663 }
664
665 if (c->key.dest_depth_reg) {
666 GLuint comp = c->key.dest_depth_reg / 2;
667 GLuint off = c->key.dest_depth_reg % 2;
668
669 assert(comp == 1);
670 assert(off == 0);
671 #if 0
672 /* XXX do we need this code? comp always 1, off always 0, it seems */
673 if (off != 0) {
674 brw_push_insn_state(p);
675 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
676
677 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
678 /* 2nd half? */
679 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
680 brw_pop_insn_state(p);
681 }
682 else
683 #endif
684 {
685 struct brw_reg src = get_src_reg(c, inst, 1, 1);
686 brw_MOV(p, brw_message_reg(nr), src);
687 }
688 nr += 2;
689 }
690
691 target = inst->Aux >> 1;
692 eot = inst->Aux & 1;
693 fire_fb_write(c, 0, nr, target, eot);
694 }
695
696 static void emit_pixel_w( struct brw_wm_compile *c,
697 struct prog_instruction *inst)
698 {
699 struct brw_compile *p = &c->func;
700 GLuint mask = inst->DstReg.WriteMask;
701 if (mask & WRITEMASK_W) {
702 struct brw_reg dst, src0, delta0, delta1;
703 struct brw_reg interp3;
704
705 dst = get_dst_reg(c, inst, 3);
706 src0 = get_src_reg(c, inst, 0, 0);
707 delta0 = get_src_reg(c, inst, 1, 0);
708 delta1 = get_src_reg(c, inst, 1, 1);
709
710 interp3 = brw_vec1_grf(src0.nr+1, 4);
711 /* Calc 1/w - just linterp wpos[3] optimized by putting the
712 * result straight into a message reg.
713 */
714 brw_LINE(p, brw_null_reg(), interp3, delta0);
715 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
716
717 /* Calc w */
718 brw_math_16( p, dst,
719 BRW_MATH_FUNCTION_INV,
720 BRW_MATH_SATURATE_NONE,
721 2, brw_null_reg(),
722 BRW_MATH_PRECISION_FULL);
723 }
724 }
725
726 static void emit_linterp(struct brw_wm_compile *c,
727 struct prog_instruction *inst)
728 {
729 struct brw_compile *p = &c->func;
730 GLuint mask = inst->DstReg.WriteMask;
731 struct brw_reg interp[4];
732 struct brw_reg dst, delta0, delta1;
733 struct brw_reg src0;
734 GLuint nr, i;
735
736 src0 = get_src_reg(c, inst, 0, 0);
737 delta0 = get_src_reg(c, inst, 1, 0);
738 delta1 = get_src_reg(c, inst, 1, 1);
739 nr = src0.nr;
740
741 interp[0] = brw_vec1_grf(nr, 0);
742 interp[1] = brw_vec1_grf(nr, 4);
743 interp[2] = brw_vec1_grf(nr+1, 0);
744 interp[3] = brw_vec1_grf(nr+1, 4);
745
746 for(i = 0; i < 4; i++ ) {
747 if (mask & (1<<i)) {
748 dst = get_dst_reg(c, inst, i);
749 brw_LINE(p, brw_null_reg(), interp[i], delta0);
750 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
751 }
752 }
753 }
754
755 static void emit_cinterp(struct brw_wm_compile *c,
756 struct prog_instruction *inst)
757 {
758 struct brw_compile *p = &c->func;
759 GLuint mask = inst->DstReg.WriteMask;
760
761 struct brw_reg interp[4];
762 struct brw_reg dst, src0;
763 GLuint nr, i;
764
765 src0 = get_src_reg(c, inst, 0, 0);
766 nr = src0.nr;
767
768 interp[0] = brw_vec1_grf(nr, 0);
769 interp[1] = brw_vec1_grf(nr, 4);
770 interp[2] = brw_vec1_grf(nr+1, 0);
771 interp[3] = brw_vec1_grf(nr+1, 4);
772
773 for(i = 0; i < 4; i++ ) {
774 if (mask & (1<<i)) {
775 dst = get_dst_reg(c, inst, i);
776 brw_MOV(p, dst, suboffset(interp[i],3));
777 }
778 }
779 }
780
781 static void emit_pinterp(struct brw_wm_compile *c,
782 struct prog_instruction *inst)
783 {
784 struct brw_compile *p = &c->func;
785 GLuint mask = inst->DstReg.WriteMask;
786
787 struct brw_reg interp[4];
788 struct brw_reg dst, delta0, delta1;
789 struct brw_reg src0, w;
790 GLuint nr, i;
791
792 src0 = get_src_reg(c, inst, 0, 0);
793 delta0 = get_src_reg(c, inst, 1, 0);
794 delta1 = get_src_reg(c, inst, 1, 1);
795 w = get_src_reg(c, inst, 2, 3);
796 nr = src0.nr;
797
798 interp[0] = brw_vec1_grf(nr, 0);
799 interp[1] = brw_vec1_grf(nr, 4);
800 interp[2] = brw_vec1_grf(nr+1, 0);
801 interp[3] = brw_vec1_grf(nr+1, 4);
802
803 for(i = 0; i < 4; i++ ) {
804 if (mask & (1<<i)) {
805 dst = get_dst_reg(c, inst, i);
806 brw_LINE(p, brw_null_reg(), interp[i], delta0);
807 brw_MAC(p, dst, suboffset(interp[i],1),
808 delta1);
809 brw_MUL(p, dst, dst, w);
810 }
811 }
812 }
813
814 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
815 static void emit_frontfacing(struct brw_wm_compile *c,
816 struct prog_instruction *inst)
817 {
818 struct brw_compile *p = &c->func;
819 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
820 struct brw_reg dst;
821 GLuint mask = inst->DstReg.WriteMask;
822 int i;
823
824 for (i = 0; i < 4; i++) {
825 if (mask & (1<<i)) {
826 dst = get_dst_reg(c, inst, i);
827 brw_MOV(p, dst, brw_imm_f(0.0));
828 }
829 }
830
831 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
832 * us front face
833 */
834 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
835 for (i = 0; i < 4; i++) {
836 if (mask & (1<<i)) {
837 dst = get_dst_reg(c, inst, i);
838 brw_MOV(p, dst, brw_imm_f(1.0));
839 }
840 }
841 brw_set_predicate_control_flag_value(p, 0xff);
842 }
843
844 static void emit_xpd(struct brw_wm_compile *c,
845 struct prog_instruction *inst)
846 {
847 int i;
848 struct brw_compile *p = &c->func;
849 GLuint mask = inst->DstReg.WriteMask;
850 for (i = 0; i < 4; i++) {
851 GLuint i2 = (i+2)%3;
852 GLuint i1 = (i+1)%3;
853 if (mask & (1<<i)) {
854 struct brw_reg src0, src1, dst;
855 dst = get_dst_reg(c, inst, i);
856 src0 = negate(get_src_reg(c, inst, 0, i2));
857 src1 = get_src_reg_imm(c, inst, 1, i1);
858 brw_MUL(p, brw_null_reg(), src0, src1);
859 src0 = get_src_reg(c, inst, 0, i1);
860 src1 = get_src_reg_imm(c, inst, 1, i2);
861 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
862 brw_MAC(p, dst, src0, src1);
863 brw_set_saturate(p, 0);
864 }
865 }
866 brw_set_saturate(p, 0);
867 }
868
869 static void emit_dp3(struct brw_wm_compile *c,
870 struct prog_instruction *inst)
871 {
872 struct brw_reg src0[3], src1[3], dst;
873 int i;
874 struct brw_compile *p = &c->func;
875 for (i = 0; i < 3; i++) {
876 src0[i] = get_src_reg(c, inst, 0, i);
877 src1[i] = get_src_reg_imm(c, inst, 1, i);
878 }
879
880 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
881 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
882 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
883 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
884 brw_MAC(p, dst, src0[2], src1[2]);
885 brw_set_saturate(p, 0);
886 }
887
888 static void emit_dp4(struct brw_wm_compile *c,
889 struct prog_instruction *inst)
890 {
891 struct brw_reg src0[4], src1[4], dst;
892 int i;
893 struct brw_compile *p = &c->func;
894 for (i = 0; i < 4; i++) {
895 src0[i] = get_src_reg(c, inst, 0, i);
896 src1[i] = get_src_reg_imm(c, inst, 1, i);
897 }
898 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
899 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
900 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
901 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
902 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
903 brw_MAC(p, dst, src0[3], src1[3]);
904 brw_set_saturate(p, 0);
905 }
906
907 static void emit_dph(struct brw_wm_compile *c,
908 struct prog_instruction *inst)
909 {
910 struct brw_reg src0[4], src1[4], dst;
911 int i;
912 struct brw_compile *p = &c->func;
913 for (i = 0; i < 4; i++) {
914 src0[i] = get_src_reg(c, inst, 0, i);
915 src1[i] = get_src_reg_imm(c, inst, 1, i);
916 }
917 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
918 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
919 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
920 brw_MAC(p, dst, src0[2], src1[2]);
921 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
922 brw_ADD(p, dst, dst, src1[3]);
923 brw_set_saturate(p, 0);
924 }
925
926 /**
927 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
928 * Note that the result of the function is smeared across the dest
929 * register's X, Y, Z and W channels (subject to writemasking of course).
930 */
931 static void emit_math1(struct brw_wm_compile *c,
932 struct prog_instruction *inst, GLuint func)
933 {
934 struct brw_compile *p = &c->func;
935 struct brw_reg src0, dst, tmp;
936 const int mark = mark_tmps( c );
937 int i;
938
939 tmp = alloc_tmp(c);
940
941 /* Get first component of source register */
942 src0 = get_src_reg(c, inst, 0, 0);
943
944 /* tmp = func(src0) */
945 brw_MOV(p, brw_message_reg(2), src0);
946 brw_math(p,
947 tmp,
948 func,
949 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
950 2,
951 brw_null_reg(),
952 BRW_MATH_DATA_VECTOR,
953 BRW_MATH_PRECISION_FULL);
954
955 /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
956
957 /* replicate tmp value across enabled dest channels */
958 for (i = 0; i < 4; i++) {
959 if (inst->DstReg.WriteMask & (1 << i)) {
960 dst = get_dst_reg(c, inst, i);
961 brw_MOV(p, dst, tmp);
962 }
963 }
964
965 release_tmps(c, mark);
966 }
967
968 static void emit_rcp(struct brw_wm_compile *c,
969 struct prog_instruction *inst)
970 {
971 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
972 }
973
974 static void emit_rsq(struct brw_wm_compile *c,
975 struct prog_instruction *inst)
976 {
977 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
978 }
979
980 static void emit_sin(struct brw_wm_compile *c,
981 struct prog_instruction *inst)
982 {
983 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
984 }
985
986 static void emit_cos(struct brw_wm_compile *c,
987 struct prog_instruction *inst)
988 {
989 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
990 }
991
992 static void emit_ex2(struct brw_wm_compile *c,
993 struct prog_instruction *inst)
994 {
995 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
996 }
997
998 static void emit_lg2(struct brw_wm_compile *c,
999 struct prog_instruction *inst)
1000 {
1001 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1002 }
1003
1004 static void emit_add(struct brw_wm_compile *c,
1005 struct prog_instruction *inst)
1006 {
1007 struct brw_compile *p = &c->func;
1008 struct brw_reg src0, src1, dst;
1009 GLuint mask = inst->DstReg.WriteMask;
1010 int i;
1011 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1012 for (i = 0 ; i < 4; i++) {
1013 if (mask & (1<<i)) {
1014 dst = get_dst_reg(c, inst, i);
1015 src0 = get_src_reg(c, inst, 0, i);
1016 src1 = get_src_reg_imm(c, inst, 1, i);
1017 brw_ADD(p, dst, src0, src1);
1018 }
1019 }
1020 brw_set_saturate(p, 0);
1021 }
1022
1023 static void emit_arl(struct brw_wm_compile *c,
1024 struct prog_instruction *inst)
1025 {
1026 struct brw_compile *p = &c->func;
1027 struct brw_reg src0, addr_reg;
1028 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1029 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1030 BRW_ARF_ADDRESS, 0);
1031 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1032 brw_MOV(p, addr_reg, src0);
1033 brw_set_saturate(p, 0);
1034 }
1035
1036 static void emit_sub(struct brw_wm_compile *c,
1037 struct prog_instruction *inst)
1038 {
1039 struct brw_compile *p = &c->func;
1040 struct brw_reg src0, src1, dst;
1041 GLuint mask = inst->DstReg.WriteMask;
1042 int i;
1043 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1044 for (i = 0 ; i < 4; i++) {
1045 if (mask & (1<<i)) {
1046 dst = get_dst_reg(c, inst, i);
1047 src0 = get_src_reg(c, inst, 0, i);
1048 src1 = get_src_reg_imm(c, inst, 1, i);
1049 brw_ADD(p, dst, src0, negate(src1));
1050 }
1051 }
1052 brw_set_saturate(p, 0);
1053 }
1054
1055 static void emit_mul(struct brw_wm_compile *c,
1056 struct prog_instruction *inst)
1057 {
1058 struct brw_compile *p = &c->func;
1059 struct brw_reg src0, src1, dst;
1060 GLuint mask = inst->DstReg.WriteMask;
1061 int i;
1062 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1063 for (i = 0 ; i < 4; i++) {
1064 if (mask & (1<<i)) {
1065 dst = get_dst_reg(c, inst, i);
1066 src0 = get_src_reg(c, inst, 0, i);
1067 src1 = get_src_reg_imm(c, inst, 1, i);
1068 brw_MUL(p, dst, src0, src1);
1069 }
1070 }
1071 brw_set_saturate(p, 0);
1072 }
1073
1074 static void emit_frc(struct brw_wm_compile *c,
1075 struct prog_instruction *inst)
1076 {
1077 struct brw_compile *p = &c->func;
1078 struct brw_reg src0, dst;
1079 GLuint mask = inst->DstReg.WriteMask;
1080 int i;
1081 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1082 for (i = 0 ; i < 4; i++) {
1083 if (mask & (1<<i)) {
1084 dst = get_dst_reg(c, inst, i);
1085 src0 = get_src_reg_imm(c, inst, 0, i);
1086 brw_FRC(p, dst, src0);
1087 }
1088 }
1089 if (inst->SaturateMode != SATURATE_OFF)
1090 brw_set_saturate(p, 0);
1091 }
1092
1093 static void emit_flr(struct brw_wm_compile *c,
1094 struct prog_instruction *inst)
1095 {
1096 struct brw_compile *p = &c->func;
1097 struct brw_reg src0, dst;
1098 GLuint mask = inst->DstReg.WriteMask;
1099 int i;
1100 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1101 for (i = 0 ; i < 4; i++) {
1102 if (mask & (1<<i)) {
1103 dst = get_dst_reg(c, inst, i);
1104 src0 = get_src_reg_imm(c, inst, 0, i);
1105 brw_RNDD(p, dst, src0);
1106 }
1107 }
1108 brw_set_saturate(p, 0);
1109 }
1110
1111
1112 static void emit_min_max(struct brw_wm_compile *c,
1113 const struct prog_instruction *inst)
1114 {
1115 struct brw_compile *p = &c->func;
1116 const GLuint mask = inst->DstReg.WriteMask;
1117 const int mark = mark_tmps(c);
1118 int i;
1119 brw_push_insn_state(p);
1120 for (i = 0; i < 4; i++) {
1121 if (mask & (1<<i)) {
1122 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1123 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1124 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1125 struct brw_reg dst;
1126 /* if dst==src0 or dst==src1 we need to use a temp reg */
1127 GLboolean use_temp = brw_same_reg(dst, src0) ||
1128 brw_same_reg(dst, src1);
1129 if (use_temp)
1130 dst = alloc_tmp(c);
1131 else
1132 dst = real_dst;
1133
1134 /*
1135 printf(" Min/max: dst %d src0 %d src1 %d\n",
1136 dst.nr, src0.nr, src1.nr);
1137 */
1138 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1139 brw_MOV(p, dst, src0);
1140 brw_set_saturate(p, 0);
1141
1142 if (inst->Opcode == OPCODE_MIN)
1143 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1144 else
1145 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1146
1147 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1148 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1149 brw_MOV(p, dst, src1);
1150 brw_set_saturate(p, 0);
1151 brw_set_predicate_control_flag_value(p, 0xff);
1152 if (use_temp)
1153 brw_MOV(p, real_dst, dst);
1154 }
1155 }
1156 brw_pop_insn_state(p);
1157 release_tmps(c, mark);
1158 }
1159
1160 static void emit_pow(struct brw_wm_compile *c,
1161 struct prog_instruction *inst)
1162 {
1163 struct brw_compile *p = &c->func;
1164 struct brw_reg dst, src0, src1;
1165 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1166 src0 = get_src_reg_imm(c, inst, 0, 0);
1167 src1 = get_src_reg_imm(c, inst, 1, 0);
1168
1169 brw_MOV(p, brw_message_reg(2), src0);
1170 brw_MOV(p, brw_message_reg(3), src1);
1171
1172 brw_math(p,
1173 dst,
1174 BRW_MATH_FUNCTION_POW,
1175 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1176 2,
1177 brw_null_reg(),
1178 BRW_MATH_DATA_VECTOR,
1179 BRW_MATH_PRECISION_FULL);
1180 }
1181
1182 static void emit_lrp(struct brw_wm_compile *c,
1183 struct prog_instruction *inst)
1184 {
1185 struct brw_compile *p = &c->func;
1186 GLuint mask = inst->DstReg.WriteMask;
1187 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1188 int i;
1189 int mark = mark_tmps(c);
1190 for (i = 0; i < 4; i++) {
1191 if (mask & (1<<i)) {
1192 dst = get_dst_reg(c, inst, i);
1193 src0 = get_src_reg(c, inst, 0, i);
1194
1195 src1 = get_src_reg_imm(c, inst, 1, i);
1196
1197 if (src1.nr == dst.nr) {
1198 tmp1 = alloc_tmp(c);
1199 brw_MOV(p, tmp1, src1);
1200 } else
1201 tmp1 = src1;
1202
1203 src2 = get_src_reg(c, inst, 2, i);
1204 if (src2.nr == dst.nr) {
1205 tmp2 = alloc_tmp(c);
1206 brw_MOV(p, tmp2, src2);
1207 } else
1208 tmp2 = src2;
1209
1210 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1211 brw_MUL(p, brw_null_reg(), dst, tmp2);
1212 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1213 brw_MAC(p, dst, src0, tmp1);
1214 brw_set_saturate(p, 0);
1215 }
1216 release_tmps(c, mark);
1217 }
1218 }
1219
1220 /**
1221 * For GLSL shaders, this KIL will be unconditional.
1222 * It may be contained inside an IF/ENDIF structure of course.
1223 */
1224 static void emit_kil(struct brw_wm_compile *c)
1225 {
1226 struct brw_compile *p = &c->func;
1227 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1228 brw_push_insn_state(p);
1229 brw_set_mask_control(p, BRW_MASK_DISABLE);
1230 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1231 brw_AND(p, depth, c->emit_mask_reg, depth);
1232 brw_pop_insn_state(p);
1233 }
1234
1235 static void emit_mad(struct brw_wm_compile *c,
1236 struct prog_instruction *inst)
1237 {
1238 struct brw_compile *p = &c->func;
1239 GLuint mask = inst->DstReg.WriteMask;
1240 struct brw_reg dst, src0, src1, src2;
1241 int i;
1242
1243 for (i = 0; i < 4; i++) {
1244 if (mask & (1<<i)) {
1245 dst = get_dst_reg(c, inst, i);
1246 src0 = get_src_reg(c, inst, 0, i);
1247 src1 = get_src_reg_imm(c, inst, 1, i);
1248 src2 = get_src_reg_imm(c, inst, 2, i);
1249 brw_MUL(p, dst, src0, src1);
1250
1251 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1252 brw_ADD(p, dst, dst, src2);
1253 brw_set_saturate(p, 0);
1254 }
1255 }
1256 }
1257
1258 static void emit_sop(struct brw_wm_compile *c,
1259 struct prog_instruction *inst, GLuint cond)
1260 {
1261 struct brw_compile *p = &c->func;
1262 GLuint mask = inst->DstReg.WriteMask;
1263 struct brw_reg dst, src0, src1;
1264 int i;
1265
1266 for (i = 0; i < 4; i++) {
1267 if (mask & (1<<i)) {
1268 dst = get_dst_reg(c, inst, i);
1269 src0 = get_src_reg(c, inst, 0, i);
1270 src1 = get_src_reg_imm(c, inst, 1, i);
1271 brw_push_insn_state(p);
1272 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1273 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1274 brw_MOV(p, dst, brw_imm_f(0.0));
1275 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1276 brw_MOV(p, dst, brw_imm_f(1.0));
1277 brw_pop_insn_state(p);
1278 }
1279 }
1280 }
1281
1282 static void emit_slt(struct brw_wm_compile *c,
1283 struct prog_instruction *inst)
1284 {
1285 emit_sop(c, inst, BRW_CONDITIONAL_L);
1286 }
1287
1288 static void emit_sle(struct brw_wm_compile *c,
1289 struct prog_instruction *inst)
1290 {
1291 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1292 }
1293
1294 static void emit_sgt(struct brw_wm_compile *c,
1295 struct prog_instruction *inst)
1296 {
1297 emit_sop(c, inst, BRW_CONDITIONAL_G);
1298 }
1299
1300 static void emit_sge(struct brw_wm_compile *c,
1301 struct prog_instruction *inst)
1302 {
1303 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1304 }
1305
1306 static void emit_seq(struct brw_wm_compile *c,
1307 struct prog_instruction *inst)
1308 {
1309 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1310 }
1311
1312 static void emit_sne(struct brw_wm_compile *c,
1313 struct prog_instruction *inst)
1314 {
1315 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1316 }
1317
1318 static void emit_ddx(struct brw_wm_compile *c,
1319 struct prog_instruction *inst)
1320 {
1321 struct brw_compile *p = &c->func;
1322 GLuint mask = inst->DstReg.WriteMask;
1323 struct brw_reg interp[4];
1324 struct brw_reg dst;
1325 struct brw_reg src0, w;
1326 GLuint nr, i;
1327 src0 = get_src_reg(c, inst, 0, 0);
1328 w = get_src_reg(c, inst, 1, 3);
1329 nr = src0.nr;
1330 interp[0] = brw_vec1_grf(nr, 0);
1331 interp[1] = brw_vec1_grf(nr, 4);
1332 interp[2] = brw_vec1_grf(nr+1, 0);
1333 interp[3] = brw_vec1_grf(nr+1, 4);
1334 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1335 for(i = 0; i < 4; i++ ) {
1336 if (mask & (1<<i)) {
1337 dst = get_dst_reg(c, inst, i);
1338 brw_MOV(p, dst, interp[i]);
1339 brw_MUL(p, dst, dst, w);
1340 }
1341 }
1342 brw_set_saturate(p, 0);
1343 }
1344
1345 static void emit_ddy(struct brw_wm_compile *c,
1346 struct prog_instruction *inst)
1347 {
1348 struct brw_compile *p = &c->func;
1349 GLuint mask = inst->DstReg.WriteMask;
1350 struct brw_reg interp[4];
1351 struct brw_reg dst;
1352 struct brw_reg src0, w;
1353 GLuint nr, i;
1354
1355 src0 = get_src_reg(c, inst, 0, 0);
1356 nr = src0.nr;
1357 w = get_src_reg(c, inst, 1, 3);
1358 interp[0] = brw_vec1_grf(nr, 0);
1359 interp[1] = brw_vec1_grf(nr, 4);
1360 interp[2] = brw_vec1_grf(nr+1, 0);
1361 interp[3] = brw_vec1_grf(nr+1, 4);
1362 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1363 for(i = 0; i < 4; i++ ) {
1364 if (mask & (1<<i)) {
1365 dst = get_dst_reg(c, inst, i);
1366 brw_MOV(p, dst, suboffset(interp[i], 1));
1367 brw_MUL(p, dst, dst, w);
1368 }
1369 }
1370 brw_set_saturate(p, 0);
1371 }
1372
1373 static INLINE struct brw_reg high_words( struct brw_reg reg )
1374 {
1375 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1376 0, 8, 2 );
1377 }
1378
1379 static INLINE struct brw_reg low_words( struct brw_reg reg )
1380 {
1381 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1382 }
1383
1384 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1385 {
1386 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1387 }
1388
1389 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1390 {
1391 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1392 0, 16, 2 );
1393 }
1394
1395 /* One-, two- and three-dimensional Perlin noise, similar to the description
1396 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1397 static void noise1_sub( struct brw_wm_compile *c ) {
1398
1399 struct brw_compile *p = &c->func;
1400 struct brw_reg param,
1401 x0, x1, /* gradients at each end */
1402 t, tmp[ 2 ], /* float temporaries */
1403 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1404 int i;
1405 int mark = mark_tmps( c );
1406
1407 x0 = alloc_tmp( c );
1408 x1 = alloc_tmp( c );
1409 t = alloc_tmp( c );
1410 tmp[ 0 ] = alloc_tmp( c );
1411 tmp[ 1 ] = alloc_tmp( c );
1412 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1413 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1414 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1415 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1416 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1417
1418 param = lookup_tmp( c, mark - 2 );
1419
1420 brw_set_access_mode( p, BRW_ALIGN_1 );
1421
1422 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1423
1424 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1425 be hashed. Also compute the remainder (offset within the unit
1426 length), interleaved to reduce register dependency penalties. */
1427 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1428 brw_FRC( p, param, param );
1429 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1430 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1431 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1432
1433 /* We're now ready to perform the hashing. The two hashes are
1434 interleaved for performance. The hash function used is
1435 designed to rapidly achieve avalanche and require only 32x16
1436 bit multiplication, and 16-bit swizzles (which we get for
1437 free). We can't use immediate operands in the multiplies,
1438 because immediates are permitted only in src1 and the 16-bit
1439 factor is permitted only in src0. */
1440 for( i = 0; i < 2; i++ )
1441 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1442 for( i = 0; i < 2; i++ )
1443 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1444 high_words( itmp[ i ] ) );
1445 for( i = 0; i < 2; i++ )
1446 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1447 for( i = 0; i < 2; i++ )
1448 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1449 high_words( itmp[ i ] ) );
1450 for( i = 0; i < 2; i++ )
1451 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1452 for( i = 0; i < 2; i++ )
1453 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1454 high_words( itmp[ i ] ) );
1455
1456 /* Now we want to initialise the two gradients based on the
1457 hashes. Format conversion from signed integer to float leaves
1458 everything scaled too high by a factor of pow( 2, 31 ), but
1459 we correct for that right at the end. */
1460 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1461 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1462 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1463
1464 brw_MUL( p, x0, x0, param );
1465 brw_MUL( p, x1, x1, t );
1466
1467 /* We interpolate between the gradients using the polynomial
1468 6t^5 - 15t^4 + 10t^3 (Perlin). */
1469 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1470 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1471 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1472 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1473 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1474 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1475 pipeline */
1476 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1477 brw_MUL( p, param, tmp[ 0 ], param );
1478 brw_MUL( p, x1, x1, param );
1479 brw_ADD( p, x0, x0, x1 );
1480 /* scale by pow( 2, -30 ), to compensate for the format conversion
1481 above and an extra factor of 2 so that a single gradient covers
1482 the [-1,1] range */
1483 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1484
1485 release_tmps( c, mark );
1486 }
1487
1488 static void emit_noise1( struct brw_wm_compile *c,
1489 struct prog_instruction *inst )
1490 {
1491 struct brw_compile *p = &c->func;
1492 struct brw_reg src, param, dst;
1493 GLuint mask = inst->DstReg.WriteMask;
1494 int i;
1495 int mark = mark_tmps( c );
1496
1497 assert( mark == 0 );
1498
1499 src = get_src_reg( c, inst, 0, 0 );
1500
1501 param = alloc_tmp( c );
1502
1503 brw_MOV( p, param, src );
1504
1505 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1506
1507 /* Fill in the result: */
1508 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1509 for (i = 0 ; i < 4; i++) {
1510 if (mask & (1<<i)) {
1511 dst = get_dst_reg(c, inst, i);
1512 brw_MOV( p, dst, param );
1513 }
1514 }
1515 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1516 brw_set_saturate( p, 0 );
1517
1518 release_tmps( c, mark );
1519 }
1520
1521 static void noise2_sub( struct brw_wm_compile *c ) {
1522
1523 struct brw_compile *p = &c->func;
1524 struct brw_reg param0, param1,
1525 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1526 t, tmp[ 4 ], /* float temporaries */
1527 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1528 int i;
1529 int mark = mark_tmps( c );
1530
1531 x0y0 = alloc_tmp( c );
1532 x0y1 = alloc_tmp( c );
1533 x1y0 = alloc_tmp( c );
1534 x1y1 = alloc_tmp( c );
1535 t = alloc_tmp( c );
1536 for( i = 0; i < 4; i++ ) {
1537 tmp[ i ] = alloc_tmp( c );
1538 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1539 }
1540 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1541 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1542 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1543
1544 param0 = lookup_tmp( c, mark - 3 );
1545 param1 = lookup_tmp( c, mark - 2 );
1546
1547 brw_set_access_mode( p, BRW_ALIGN_1 );
1548
1549 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1550 be hashed. Also compute the remainders (offsets within the unit
1551 square), interleaved to reduce register dependency penalties. */
1552 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1553 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1554 brw_FRC( p, param0, param0 );
1555 brw_FRC( p, param1, param1 );
1556 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1557 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1558 low_words( itmp[ 1 ] ) );
1559 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1560 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1561 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1562 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1563 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1564
1565 /* We're now ready to perform the hashing. The four hashes are
1566 interleaved for performance. The hash function used is
1567 designed to rapidly achieve avalanche and require only 32x16
1568 bit multiplication, and 16-bit swizzles (which we get for
1569 free). We can't use immediate operands in the multiplies,
1570 because immediates are permitted only in src1 and the 16-bit
1571 factor is permitted only in src0. */
1572 for( i = 0; i < 4; i++ )
1573 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1574 for( i = 0; i < 4; i++ )
1575 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1576 high_words( itmp[ i ] ) );
1577 for( i = 0; i < 4; i++ )
1578 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1579 for( i = 0; i < 4; i++ )
1580 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1581 high_words( itmp[ i ] ) );
1582 for( i = 0; i < 4; i++ )
1583 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1584 for( i = 0; i < 4; i++ )
1585 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1586 high_words( itmp[ i ] ) );
1587
1588 /* Now we want to initialise the four gradients based on the
1589 hashes. Format conversion from signed integer to float leaves
1590 everything scaled too high by a factor of pow( 2, 15 ), but
1591 we correct for that right at the end. */
1592 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1593 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1594 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1595 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1596 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1597
1598 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1599 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1600 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1601 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1602
1603 brw_MUL( p, x1y0, x1y0, t );
1604 brw_MUL( p, x1y1, x1y1, t );
1605 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1606 brw_MUL( p, x0y0, x0y0, param0 );
1607 brw_MUL( p, x0y1, x0y1, param0 );
1608
1609 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1610 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1611 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1612 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1613
1614 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1615 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1616 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1617 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1618
1619 /* We interpolate between the gradients using the polynomial
1620 6t^5 - 15t^4 + 10t^3 (Perlin). */
1621 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1622 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1623 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1624 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1625 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1626 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1627 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1628 pipeline */
1629 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1630 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1631 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1632 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1633 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1634 pipeline */
1635 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1636 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1637 brw_MUL( p, param0, tmp[ 0 ], param0 );
1638 brw_MUL( p, param1, tmp[ 1 ], param1 );
1639
1640 /* Here we interpolate in the y dimension... */
1641 brw_MUL( p, x0y1, x0y1, param1 );
1642 brw_MUL( p, x1y1, x1y1, param1 );
1643 brw_ADD( p, x0y0, x0y0, x0y1 );
1644 brw_ADD( p, x1y0, x1y0, x1y1 );
1645
1646 /* And now in x. There are horrible register dependencies here,
1647 but we have nothing else to do. */
1648 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1649 brw_MUL( p, x1y0, x1y0, param0 );
1650 brw_ADD( p, x0y0, x0y0, x1y0 );
1651
1652 /* scale by pow( 2, -15 ), as described above */
1653 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1654
1655 release_tmps( c, mark );
1656 }
1657
1658 static void emit_noise2( struct brw_wm_compile *c,
1659 struct prog_instruction *inst )
1660 {
1661 struct brw_compile *p = &c->func;
1662 struct brw_reg src0, src1, param0, param1, dst;
1663 GLuint mask = inst->DstReg.WriteMask;
1664 int i;
1665 int mark = mark_tmps( c );
1666
1667 assert( mark == 0 );
1668
1669 src0 = get_src_reg( c, inst, 0, 0 );
1670 src1 = get_src_reg( c, inst, 0, 1 );
1671
1672 param0 = alloc_tmp( c );
1673 param1 = alloc_tmp( c );
1674
1675 brw_MOV( p, param0, src0 );
1676 brw_MOV( p, param1, src1 );
1677
1678 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1679
1680 /* Fill in the result: */
1681 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1682 for (i = 0 ; i < 4; i++) {
1683 if (mask & (1<<i)) {
1684 dst = get_dst_reg(c, inst, i);
1685 brw_MOV( p, dst, param0 );
1686 }
1687 }
1688 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1689 brw_set_saturate( p, 0 );
1690
1691 release_tmps( c, mark );
1692 }
1693
1694 /**
1695 * The three-dimensional case is much like the one- and two- versions above,
1696 * but since the number of corners is rapidly growing we now pack 16 16-bit
1697 * hashes into each register to extract more parallelism from the EUs.
1698 */
1699 static void noise3_sub( struct brw_wm_compile *c ) {
1700
1701 struct brw_compile *p = &c->func;
1702 struct brw_reg param0, param1, param2,
1703 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1704 xi, yi, zi, /* interpolation coefficients */
1705 t, tmp[ 8 ], /* float temporaries */
1706 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1707 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1708 int i;
1709 int mark = mark_tmps( c );
1710
1711 x0y0 = alloc_tmp( c );
1712 x0y1 = alloc_tmp( c );
1713 x1y0 = alloc_tmp( c );
1714 x1y1 = alloc_tmp( c );
1715 xi = alloc_tmp( c );
1716 yi = alloc_tmp( c );
1717 zi = alloc_tmp( c );
1718 t = alloc_tmp( c );
1719 for( i = 0; i < 8; i++ ) {
1720 tmp[ i ] = alloc_tmp( c );
1721 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1722 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1723 }
1724
1725 param0 = lookup_tmp( c, mark - 4 );
1726 param1 = lookup_tmp( c, mark - 3 );
1727 param2 = lookup_tmp( c, mark - 2 );
1728
1729 brw_set_access_mode( p, BRW_ALIGN_1 );
1730
1731 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1732 be hashed. Also compute the remainders (offsets within the unit
1733 cube), interleaved to reduce register dependency penalties. */
1734 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1735 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1736 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1737 brw_FRC( p, param0, param0 );
1738 brw_FRC( p, param1, param1 );
1739 brw_FRC( p, param2, param2 );
1740 /* Since we now have only 16 bits of precision in the hash, we must
1741 be more careful about thorough mixing to maintain entropy as we
1742 squash the input vector into a small scalar. */
1743 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1744 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1745 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1746 brw_imm_uw( 0x9B93 ) );
1747 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1748 brw_imm_uw( 0xBC8F ) );
1749
1750 /* Temporarily disable the execution mask while we work with ExecSize=16
1751 channels (the mask is set for ExecSize=8 and is probably incorrect).
1752 Although this might cause execution of unwanted channels, the code
1753 writes only to temporary registers and has no side effects, so
1754 disabling the mask is harmless. */
1755 brw_push_insn_state( p );
1756 brw_set_mask_control( p, BRW_MASK_DISABLE );
1757 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1758 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1759 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1760
1761 /* We're now ready to perform the hashing. The eight hashes are
1762 interleaved for performance. The hash function used is
1763 designed to rapidly achieve avalanche and require only 16x16
1764 bit multiplication, and 8-bit swizzles (which we get for
1765 free). */
1766 for( i = 0; i < 4; i++ )
1767 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1768 for( i = 0; i < 4; i++ )
1769 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1770 odd_bytes( wtmp[ i ] ) );
1771 for( i = 0; i < 4; i++ )
1772 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1773 for( i = 0; i < 4; i++ )
1774 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1775 odd_bytes( wtmp[ i ] ) );
1776 brw_pop_insn_state( p );
1777
1778 /* Now we want to initialise the four rear gradients based on the
1779 hashes. Format conversion from signed integer to float leaves
1780 everything scaled too high by a factor of pow( 2, 15 ), but
1781 we correct for that right at the end. */
1782 /* x component */
1783 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1784 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1785 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1786 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1787 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1788
1789 brw_push_insn_state( p );
1790 brw_set_mask_control( p, BRW_MASK_DISABLE );
1791 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1792 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1793 brw_pop_insn_state( p );
1794
1795 brw_MUL( p, x1y0, x1y0, t );
1796 brw_MUL( p, x1y1, x1y1, t );
1797 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1798 brw_MUL( p, x0y0, x0y0, param0 );
1799 brw_MUL( p, x0y1, x0y1, param0 );
1800
1801 /* y component */
1802 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1803 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1804 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1805 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1806
1807 brw_push_insn_state( p );
1808 brw_set_mask_control( p, BRW_MASK_DISABLE );
1809 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1810 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1811 brw_pop_insn_state( p );
1812
1813 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1814 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1815 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1816 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1817 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1818
1819 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1820 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1821 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1822 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1823
1824 /* z component */
1825 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1826 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1827 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1828 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1829
1830 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1831 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1832 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1833 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1834
1835 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1836 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1837 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1838 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1839
1840 /* We interpolate between the gradients using the polynomial
1841 6t^5 - 15t^4 + 10t^3 (Perlin). */
1842 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1843 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1844 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1845 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1846 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1847 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1848 brw_MUL( p, xi, xi, param0 );
1849 brw_MUL( p, yi, yi, param1 );
1850 brw_MUL( p, zi, zi, param2 );
1851 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1852 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1853 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1854 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1855 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1856 brw_MUL( p, xi, xi, param0 );
1857 brw_MUL( p, yi, yi, param1 );
1858 brw_MUL( p, zi, zi, param2 );
1859 brw_MUL( p, xi, xi, param0 );
1860 brw_MUL( p, yi, yi, param1 );
1861 brw_MUL( p, zi, zi, param2 );
1862 brw_MUL( p, xi, xi, param0 );
1863 brw_MUL( p, yi, yi, param1 );
1864 brw_MUL( p, zi, zi, param2 );
1865
1866 /* Here we interpolate in the y dimension... */
1867 brw_MUL( p, x0y1, x0y1, yi );
1868 brw_MUL( p, x1y1, x1y1, yi );
1869 brw_ADD( p, x0y0, x0y0, x0y1 );
1870 brw_ADD( p, x1y0, x1y0, x1y1 );
1871
1872 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1873 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1874 brw_MUL( p, x1y0, x1y0, xi );
1875 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1876
1877 /* Now do the same thing for the front four gradients... */
1878 /* x component */
1879 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1880 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1881 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1882 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1883
1884 brw_push_insn_state( p );
1885 brw_set_mask_control( p, BRW_MASK_DISABLE );
1886 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1887 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1888 brw_pop_insn_state( p );
1889
1890 brw_MUL( p, x1y0, x1y0, t );
1891 brw_MUL( p, x1y1, x1y1, t );
1892 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1893 brw_MUL( p, x0y0, x0y0, param0 );
1894 brw_MUL( p, x0y1, x0y1, param0 );
1895
1896 /* y component */
1897 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1898 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1899 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1900 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1901
1902 brw_push_insn_state( p );
1903 brw_set_mask_control( p, BRW_MASK_DISABLE );
1904 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1905 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1906 brw_pop_insn_state( p );
1907
1908 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1909 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1910 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1911 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1912 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1913
1914 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1915 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1916 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1917 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1918
1919 /* z component */
1920 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1921 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1922 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1923 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1924
1925 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1926 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1927 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1928 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1929
1930 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1931 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1932 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1933 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1934
1935 /* The interpolation coefficients are still around from last time, so
1936 again interpolate in the y dimension... */
1937 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1938 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1939 brw_MUL( p, x0y1, x0y1, yi );
1940 brw_MUL( p, x1y1, x1y1, yi );
1941 brw_ADD( p, x0y0, x0y0, x0y1 );
1942 brw_ADD( p, x1y0, x1y0, x1y1 );
1943
1944 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1945 time put the front face in tmp[ 1 ] and we're nearly there... */
1946 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1947 brw_MUL( p, x1y0, x1y0, xi );
1948 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1949
1950 /* The final interpolation, in the z dimension: */
1951 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1952 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1953 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1954
1955 /* scale by pow( 2, -15 ), as described above */
1956 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1957
1958 release_tmps( c, mark );
1959 }
1960
1961 static void emit_noise3( struct brw_wm_compile *c,
1962 struct prog_instruction *inst )
1963 {
1964 struct brw_compile *p = &c->func;
1965 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1966 GLuint mask = inst->DstReg.WriteMask;
1967 int i;
1968 int mark = mark_tmps( c );
1969
1970 assert( mark == 0 );
1971
1972 src0 = get_src_reg( c, inst, 0, 0 );
1973 src1 = get_src_reg( c, inst, 0, 1 );
1974 src2 = get_src_reg( c, inst, 0, 2 );
1975
1976 param0 = alloc_tmp( c );
1977 param1 = alloc_tmp( c );
1978 param2 = alloc_tmp( c );
1979
1980 brw_MOV( p, param0, src0 );
1981 brw_MOV( p, param1, src1 );
1982 brw_MOV( p, param2, src2 );
1983
1984 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1985
1986 /* Fill in the result: */
1987 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1988 for (i = 0 ; i < 4; i++) {
1989 if (mask & (1<<i)) {
1990 dst = get_dst_reg(c, inst, i);
1991 brw_MOV( p, dst, param0 );
1992 }
1993 }
1994 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1995 brw_set_saturate( p, 0 );
1996
1997 release_tmps( c, mark );
1998 }
1999
/**
 * Emit the body of the shared four-dimensional noise subroutine.
 *
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes. Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 *
 * The four input coordinates are read from temporaries that already exist
 * when this function is called (see the lookup_tmp() calls below), and the
 * final noise value is written back over the first of them (param[ 0 ]).
 * All other temporaries allocated here are released before returning.
 *
 * The emitted loop is hand-built: the flag register selects which of the
 * two passes is running, and the forward exit branch is emitted with a
 * zero offset and patched once the loop end is known.
 */
static void noise4_sub( struct brw_wm_compile *c )
{
    struct brw_compile *p = &c->func;
    struct brw_reg param[ 4 ],
	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
	w0, /* noise for the w=0 cube */
	floors[ 2 ], /* integer coordinates of base corner of hypercube */
	interp[ 4 ], /* interpolation coefficients */
	t, tmp[ 8 ], /* float temporaries */
	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i, j;
    int mark = mark_tmps( c );
    GLuint loop, origin;

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    w0 = alloc_tmp( c );
    floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
    floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

    /* The arguments live in temporaries allocated before our mark.
       NOTE(review): presumably these are the four registers filled in by
       emit_noise4(), with one further slot taken by invoke_subroutine()
       (hence the offset of 5 rather than 4) -- confirm. */
    for( i = 0; i < 4; i++ ) {
	param[ i ] = lookup_tmp( c, mark - 5 + i );
	interp[ i ] = alloc_tmp( c );
    }

    /* itmp[ i ] and wtmp[ i ] are alternative views of the register behind
       tmp[ i ]: the former retyped to UD, the latter a 16-wide UW vector
       built from the same register number. */
    for( i = 0; i < 8; i++ ) {
	tmp[ i ] = alloc_tmp( c );
	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* We only want 16 bits of precision from the integral part of each
       co-ordinate, but unfortunately the RNDD semantics would saturate
       at 16 bits if we performed the operation directly to a 16-bit
       destination.  Therefore, we round to 32-bit temporaries where
       appropriate, and then store only the lower 16 bits. */
    /* After the two MOVs below, floors[ 0 ] holds the rounded param[ 0 ]
       and param[ 1 ] in its low and high words respectively, and
       floors[ 1 ] likewise holds param[ 2 ] and param[ 3 ]. */
    brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
    brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
    brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
    brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

    /* Modify the flag register here, because the side effect is useful
       later (see below).  We know for certain that all flags will be
       cleared, since the FRC instruction cannot possibly generate
       negative results.  Even for exceptional inputs (infinities, denormals,
       NaNs), the architecture guarantees that the L conditional is false. */
    brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
    brw_FRC( p, param[ 0 ], param[ 0 ] );
    /* NOTE(review): presumably emitting an instruction with a conditional
       modifier changes the default predicate state, which is why it is
       reset explicitly here -- confirm against the brw_eu emit code. */
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
    for( i = 1; i < 4; i++ )
	brw_FRC( p, param[ i ], param[ i ] );

    /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
       of all. */
    /* Evaluated as ( ( 6t - 15 ) * t + 10 ) * t * t * t, one step at a time
       across all four coordinates. */
    for( i = 0; i < 4; i++ )
	brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
    for( i = 0; i < 4; i++ )
	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
    for( i = 0; i < 4; i++ )
	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
    for( j = 0; j < 3; j++ )
	for( i = 0; i < 4; i++ )
	    brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

    /* Mark the current address, as it will be a jump destination.  The
       following code will be executed twice: first, with the flag
       register clear indicating the w=0 case, and second with flags
       set for w=1. */
    loop = p->nr_insn;

    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Since we have only 16 bits of precision in the hash, we
       must be careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	     brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	     brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	     brw_imm_uw( 0x9B93 ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	     brw_imm_uw( 0xA359 ) );
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	     brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing.  The eight hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    for( i = 0; i < 4; i++ )
	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
		 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
		 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift the hash words left by four bits so that the next component
       is derived from different hash bits. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
    brw_MUL( p, x0y1, x0y1, param[ 0 ] );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* prepare t for the w component (used below): w the first time through
       the loop; w - 1 the second time) */
    /* The ADD is predicated on the flags and the MOV on their inverse, so
       exactly one of the two writes t in each channel. */
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
    p->current->header.predicate_inverse = 1;
    brw_MOV( p, t, param[ 3 ] );
    p->current->header.predicate_inverse = 0;
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* w component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t is reloaded with param[ 0 ] - 1 ready for the x component of the
       front four gradients below. */
    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* Here we interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
    brw_MUL( p, x0y1, x0y1, param[ 0 ] );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* prepare t for the w component (used below): w the first time through
       the loop; w - 1 the second time) */
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
    p->current->header.predicate_inverse = 1;
    brw_MOV( p, t, param[ 3 ] );
    p->current->header.predicate_inverse = 0;
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* w component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* Interpolate in the y dimension: */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* Another interpolation, in the z dimension: */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* Exit the loop if we've computed both cubes... */
    /* This predicated jump is emitted with a zero offset as a placeholder;
       its index is remembered in origin so that the real offset can be
       filled in once the end of the loop is known (see below). */
    origin = p->nr_insn;
    brw_push_insn_state( p );
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
    brw_pop_insn_state( p );

    /* Save the result for the w=0 case, and increment the w coordinate: */
    brw_MOV( p, w0, tmp[ 0 ] );
    brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	     brw_imm_uw( 1 ) );

    /* Loop around for the other cube.  Explicitly set the flag register
       (unfortunately we must spend an extra instruction to do this: we
       can't rely on a side effect of the previous MOV or ADD because
       conditional modifiers which are normally true might be false in
       exceptional circumstances, e.g. given a NaN input; the add to
       brw_ip_reg() is not suitable because the IP is not an 8-vector). */
    /* The backward offset is the instruction-count difference between the
       loop head and this jump, shifted left by four.
       NOTE(review): the shift presumably converts an instruction count to
       a byte offset at 16 bytes per instruction -- confirm. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
    brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	     brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
    brw_pop_insn_state( p );

    /* Patch the previous conditional branch now that we know the
       destination address. */
    brw_set_src1( p->store + origin,
		  brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

    /* The very last interpolation. */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

    /* scale by pow( 2, -15 ), as described above */
    /* The result overwrites the first argument register. */
    brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
2383
2384 static void emit_noise4( struct brw_wm_compile *c,
2385 struct prog_instruction *inst )
2386 {
2387 struct brw_compile *p = &c->func;
2388 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2389 GLuint mask = inst->DstReg.WriteMask;
2390 int i;
2391 int mark = mark_tmps( c );
2392
2393 assert( mark == 0 );
2394
2395 src0 = get_src_reg( c, inst, 0, 0 );
2396 src1 = get_src_reg( c, inst, 0, 1 );
2397 src2 = get_src_reg( c, inst, 0, 2 );
2398 src3 = get_src_reg( c, inst, 0, 3 );
2399
2400 param0 = alloc_tmp( c );
2401 param1 = alloc_tmp( c );
2402 param2 = alloc_tmp( c );
2403 param3 = alloc_tmp( c );
2404
2405 brw_MOV( p, param0, src0 );
2406 brw_MOV( p, param1, src1 );
2407 brw_MOV( p, param2, src2 );
2408 brw_MOV( p, param3, src3 );
2409
2410 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2411
2412 /* Fill in the result: */
2413 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2414 for (i = 0 ; i < 4; i++) {
2415 if (mask & (1<<i)) {
2416 dst = get_dst_reg(c, inst, i);
2417 brw_MOV( p, dst, param0 );
2418 }
2419 }
2420 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2421 brw_set_saturate( p, 0 );
2422
2423 release_tmps( c, mark );
2424 }
2425
2426 static void emit_wpos_xy(struct brw_wm_compile *c,
2427 struct prog_instruction *inst)
2428 {
2429 struct brw_compile *p = &c->func;
2430 GLuint mask = inst->DstReg.WriteMask;
2431 struct brw_reg src0[2], dst[2];
2432
2433 dst[0] = get_dst_reg(c, inst, 0);
2434 dst[1] = get_dst_reg(c, inst, 1);
2435
2436 src0[0] = get_src_reg(c, inst, 0, 0);
2437 src0[1] = get_src_reg(c, inst, 0, 1);
2438
2439 /* Calculate the pixel offset from window bottom left into destination
2440 * X and Y channels.
2441 */
2442 if (mask & WRITEMASK_X) {
2443 /* X' = X - origin_x */
2444 brw_ADD(p,
2445 dst[0],
2446 retype(src0[0], BRW_REGISTER_TYPE_W),
2447 brw_imm_d(0 - c->key.origin_x));
2448 }
2449
2450 if (mask & WRITEMASK_Y) {
2451 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2452 brw_ADD(p,
2453 dst[1],
2454 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2455 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2456 }
2457 }
2458
2459 /* TODO
2460 BIAS on SIMD8 not working yet...
2461 */
2462 static void emit_txb(struct brw_wm_compile *c,
2463 struct prog_instruction *inst)
2464 {
2465 struct brw_compile *p = &c->func;
2466 struct brw_reg dst[4], src[4], payload_reg;
2467 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2468 GLuint i;
2469
2470 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2471
2472 for (i = 0; i < 4; i++)
2473 dst[i] = get_dst_reg(c, inst, i);
2474 for (i = 0; i < 4; i++)
2475 src[i] = get_src_reg(c, inst, 0, i);
2476
2477 switch (inst->TexSrcTarget) {
2478 case TEXTURE_1D_INDEX:
2479 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2480 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2481 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2482 break;
2483 case TEXTURE_2D_INDEX:
2484 case TEXTURE_RECT_INDEX:
2485 brw_MOV(p, brw_message_reg(2), src[0]);
2486 brw_MOV(p, brw_message_reg(3), src[1]);
2487 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2488 break;
2489 default:
2490 brw_MOV(p, brw_message_reg(2), src[0]);
2491 brw_MOV(p, brw_message_reg(3), src[1]);
2492 brw_MOV(p, brw_message_reg(4), src[2]);
2493 break;
2494 }
2495 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2496 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2497 brw_SAMPLE(p,
2498 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2499 1, /* msg_reg_nr */
2500 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2501 SURF_INDEX_TEXTURE(unit),
2502 unit, /* sampler */
2503 inst->DstReg.WriteMask, /* writemask */
2504 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS, /* msg_type */
2505 4, /* response_length */
2506 4, /* msg_length */
2507 0); /* eot */
2508 }
2509
2510
2511 static void emit_tex(struct brw_wm_compile *c,
2512 struct prog_instruction *inst)
2513 {
2514 struct brw_compile *p = &c->func;
2515 struct brw_reg dst[4], src[4], payload_reg;
2516 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2517 GLuint msg_len;
2518 GLuint i, nr;
2519 GLuint emit;
2520 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2521
2522 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2523
2524 for (i = 0; i < 4; i++)
2525 dst[i] = get_dst_reg(c, inst, i);
2526 for (i = 0; i < 4; i++)
2527 src[i] = get_src_reg(c, inst, 0, i);
2528
2529 switch (inst->TexSrcTarget) {
2530 case TEXTURE_1D_INDEX:
2531 emit = WRITEMASK_X;
2532 nr = 1;
2533 break;
2534 case TEXTURE_2D_INDEX:
2535 case TEXTURE_RECT_INDEX:
2536 emit = WRITEMASK_XY;
2537 nr = 2;
2538 break;
2539 default:
2540 emit = WRITEMASK_XYZ;
2541 nr = 3;
2542 break;
2543 }
2544 msg_len = 1;
2545
2546 /* move/load S, T, R coords */
2547 for (i = 0; i < nr; i++) {
2548 static const GLuint swz[4] = {0,1,2,2};
2549 if (emit & (1<<i))
2550 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2551 else
2552 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2553 msg_len += 1;
2554 }
2555
2556 if (shadow) {
2557 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2558 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2559 }
2560
2561 brw_SAMPLE(p,
2562 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2563 1, /* msg_reg_nr */
2564 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2565 SURF_INDEX_TEXTURE(unit),
2566 unit, /* sampler */
2567 inst->DstReg.WriteMask, /* writemask */
2568 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE, /* msg_type */
2569 4, /* response_length */
2570 shadow ? 6 : 4, /* msg_length */
2571 0); /* eot */
2572
2573 if (shadow)
2574 brw_MOV(p, dst[3], brw_imm_f(1.0));
2575 }
2576
2577
2578 /**
2579 * Resolve subroutine calls after code emit is done.
2580 */
2581 static void post_wm_emit( struct brw_wm_compile *c )
2582 {
2583 brw_resolve_cals(&c->func);
2584 }
2585
2586 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2587 {
2588 #define MAX_IFSN 32
2589 #define MAX_LOOP_DEPTH 32
2590 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2591 struct brw_instruction *inst0, *inst1;
2592 int i, if_insn = 0, loop_insn = 0;
2593 struct brw_compile *p = &c->func;
2594 struct brw_indirect stack_index = brw_indirect(0, 0);
2595
2596 c->reg_index = 0;
2597 prealloc_reg(c);
2598 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2599 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2600
2601 for (i = 0; i < c->nr_fp_insns; i++) {
2602 struct prog_instruction *inst = &c->prog_instructions[i];
2603
2604 #if 0
2605 _mesa_printf("Inst %d: ", i);
2606 _mesa_print_instruction(inst);
2607 #endif
2608
2609 /* fetch any constants that this instruction needs */
2610 if (c->use_const_buffer)
2611 fetch_constants(c, inst);
2612
2613 if (inst->CondUpdate)
2614 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2615 else
2616 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2617
2618 switch (inst->Opcode) {
2619 case WM_PIXELXY:
2620 emit_pixel_xy(c, inst);
2621 break;
2622 case WM_DELTAXY:
2623 emit_delta_xy(c, inst);
2624 break;
2625 case WM_PIXELW:
2626 emit_pixel_w(c, inst);
2627 break;
2628 case WM_LINTERP:
2629 emit_linterp(c, inst);
2630 break;
2631 case WM_PINTERP:
2632 emit_pinterp(c, inst);
2633 break;
2634 case WM_CINTERP:
2635 emit_cinterp(c, inst);
2636 break;
2637 case WM_WPOSXY:
2638 emit_wpos_xy(c, inst);
2639 break;
2640 case WM_FB_WRITE:
2641 emit_fb_write(c, inst);
2642 break;
2643 case WM_FRONTFACING:
2644 emit_frontfacing(c, inst);
2645 break;
2646 case OPCODE_ABS:
2647 emit_abs(c, inst);
2648 break;
2649 case OPCODE_ADD:
2650 emit_add(c, inst);
2651 break;
2652 case OPCODE_ARL:
2653 emit_arl(c, inst);
2654 break;
2655 case OPCODE_SUB:
2656 emit_sub(c, inst);
2657 break;
2658 case OPCODE_FRC:
2659 emit_frc(c, inst);
2660 break;
2661 case OPCODE_FLR:
2662 emit_flr(c, inst);
2663 break;
2664 case OPCODE_LRP:
2665 emit_lrp(c, inst);
2666 break;
2667 case OPCODE_TRUNC:
2668 emit_trunc(c, inst);
2669 break;
2670 case OPCODE_MOV:
2671 emit_mov(c, inst);
2672 break;
2673 case OPCODE_DP3:
2674 emit_dp3(c, inst);
2675 break;
2676 case OPCODE_DP4:
2677 emit_dp4(c, inst);
2678 break;
2679 case OPCODE_XPD:
2680 emit_xpd(c, inst);
2681 break;
2682 case OPCODE_DPH:
2683 emit_dph(c, inst);
2684 break;
2685 case OPCODE_RCP:
2686 emit_rcp(c, inst);
2687 break;
2688 case OPCODE_RSQ:
2689 emit_rsq(c, inst);
2690 break;
2691 case OPCODE_SIN:
2692 emit_sin(c, inst);
2693 break;
2694 case OPCODE_COS:
2695 emit_cos(c, inst);
2696 break;
2697 case OPCODE_EX2:
2698 emit_ex2(c, inst);
2699 break;
2700 case OPCODE_LG2:
2701 emit_lg2(c, inst);
2702 break;
2703 case OPCODE_MIN:
2704 case OPCODE_MAX:
2705 emit_min_max(c, inst);
2706 break;
2707 case OPCODE_DDX:
2708 emit_ddx(c, inst);
2709 break;
2710 case OPCODE_DDY:
2711 emit_ddy(c, inst);
2712 break;
2713 case OPCODE_SLT:
2714 emit_slt(c, inst);
2715 break;
2716 case OPCODE_SLE:
2717 emit_sle(c, inst);
2718 break;
2719 case OPCODE_SGT:
2720 emit_sgt(c, inst);
2721 break;
2722 case OPCODE_SGE:
2723 emit_sge(c, inst);
2724 break;
2725 case OPCODE_SEQ:
2726 emit_seq(c, inst);
2727 break;
2728 case OPCODE_SNE:
2729 emit_sne(c, inst);
2730 break;
2731 case OPCODE_MUL:
2732 emit_mul(c, inst);
2733 break;
2734 case OPCODE_POW:
2735 emit_pow(c, inst);
2736 break;
2737 case OPCODE_MAD:
2738 emit_mad(c, inst);
2739 break;
2740 case OPCODE_NOISE1:
2741 emit_noise1(c, inst);
2742 break;
2743 case OPCODE_NOISE2:
2744 emit_noise2(c, inst);
2745 break;
2746 case OPCODE_NOISE3:
2747 emit_noise3(c, inst);
2748 break;
2749 case OPCODE_NOISE4:
2750 emit_noise4(c, inst);
2751 break;
2752 case OPCODE_TEX:
2753 emit_tex(c, inst);
2754 break;
2755 case OPCODE_TXB:
2756 emit_txb(c, inst);
2757 break;
2758 case OPCODE_KIL_NV:
2759 emit_kil(c);
2760 break;
2761 case OPCODE_IF:
2762 assert(if_insn < MAX_IFSN);
2763 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2764 break;
2765 case OPCODE_ELSE:
2766 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2767 break;
2768 case OPCODE_ENDIF:
2769 assert(if_insn > 0);
2770 brw_ENDIF(p, if_inst[--if_insn]);
2771 break;
2772 case OPCODE_BGNSUB:
2773 brw_save_label(p, inst->Comment, p->nr_insn);
2774 break;
2775 case OPCODE_ENDSUB:
2776 /* no-op */
2777 break;
2778 case OPCODE_CAL:
2779 brw_push_insn_state(p);
2780 brw_set_mask_control(p, BRW_MASK_DISABLE);
2781 brw_set_access_mode(p, BRW_ALIGN_1);
2782 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2783 brw_set_access_mode(p, BRW_ALIGN_16);
2784 brw_ADD(p, get_addr_reg(stack_index),
2785 get_addr_reg(stack_index), brw_imm_d(4));
2786 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2787 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2788 brw_pop_insn_state(p);
2789 break;
2790
2791 case OPCODE_RET:
2792 brw_push_insn_state(p);
2793 brw_set_mask_control(p, BRW_MASK_DISABLE);
2794 brw_ADD(p, get_addr_reg(stack_index),
2795 get_addr_reg(stack_index), brw_imm_d(-4));
2796 brw_set_access_mode(p, BRW_ALIGN_1);
2797 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2798 brw_set_access_mode(p, BRW_ALIGN_16);
2799 brw_pop_insn_state(p);
2800
2801 break;
2802 case OPCODE_BGNLOOP:
2803 /* XXX may need to invalidate the current_constant regs */
2804 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2805 break;
2806 case OPCODE_BRK:
2807 brw_BREAK(p);
2808 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2809 break;
2810 case OPCODE_CONT:
2811 brw_CONT(p);
2812 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2813 break;
2814 case OPCODE_ENDLOOP:
2815 loop_insn--;
2816 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2817 /* patch all the BREAK instructions from
2818 last BEGINLOOP */
2819 while (inst0 > loop_inst[loop_insn]) {
2820 inst0--;
2821 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2822 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2823 inst0->bits3.if_else.pop_count = 0;
2824 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2825 inst0->bits3.if_else.jump_count = inst1 - inst0;
2826 inst0->bits3.if_else.pop_count = 0;
2827 }
2828 }
2829 break;
2830 default:
2831 _mesa_printf("unsupported IR in fragment shader %d\n",
2832 inst->Opcode);
2833 }
2834 if (inst->CondUpdate)
2835 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2836 else
2837 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2838 }
2839 post_wm_emit(c);
2840
2841 if (c->reg_index >= BRW_WM_MAX_GRF) {
2842 _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
2843 /* XXX we need to do some proper error recovery here */
2844 }
2845 }
2846
2847
2848 /**
2849 * Do GPU code generation for shaders that use GLSL features such as
2850 * flow control. Other shaders will be compiled with the
2851 */
2852 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2853 {
2854 if (INTEL_DEBUG & DEBUG_WM) {
2855 _mesa_printf("brw_wm_glsl_emit:\n");
2856 }
2857
2858 /* initial instruction translation/simplification */
2859 brw_wm_pass_fp(c);
2860
2861 /* actual code generation */
2862 brw_wm_emit_glsl(brw, c);
2863
2864 if (INTEL_DEBUG & DEBUG_WM) {
2865 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2866 }
2867
2868 c->prog_data.total_grf = c->reg_index;
2869 c->prog_data.total_scratch = 0;
2870 }