i965: change args to get_src_reg() to prep for new constant buffer support
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "brw_context.h"
4 #include "brw_eu.h"
5 #include "brw_wm.h"
6
/**
 * Identifiers for the noise subroutines.  A value of this type is what
 * invoke_subroutine() uses to index c->subroutines[].
 */
enum _subroutine {
    SUB_NOISE1,
    SUB_NOISE2,
    SUB_NOISE3,
    SUB_NOISE4
};
10
11
12 /**
13 * Determine if the given fragment program uses GLSL features such
14 * as flow conditionals, loops, subroutines.
15 * Some GLSL shaders may use these features, others might not.
16 */
17 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
18 {
19 int i;
20 for (i = 0; i < fp->Base.NumInstructions; i++) {
21 const struct prog_instruction *inst = &fp->Base.Instructions[i];
22 switch (inst->Opcode) {
23 case OPCODE_IF:
24 case OPCODE_TRUNC:
25 case OPCODE_ENDIF:
26 case OPCODE_CAL:
27 case OPCODE_BRK:
28 case OPCODE_RET:
29 case OPCODE_DDX:
30 case OPCODE_DDY:
31 case OPCODE_NOISE1:
32 case OPCODE_NOISE2:
33 case OPCODE_NOISE3:
34 case OPCODE_NOISE4:
35 case OPCODE_BGNLOOP:
36 return GL_TRUE;
37 default:
38 break;
39 }
40 }
41 return GL_FALSE;
42 }
43
44
45 /**
46 * Record the mapping of a Mesa register to a hardware register.
47 */
48 static void set_reg(struct brw_wm_compile *c, int file, int index,
49 int component, struct brw_reg reg)
50 {
51 c->wm_regs[file][index][component].reg = reg;
52 c->wm_regs[file][index][component].inited = GL_TRUE;
53 }
54
55 /**
56 * Examine instruction's write mask to find index of first component
57 * enabled for writing.
58 */
59 static int get_scalar_dst_index(struct prog_instruction *inst)
60 {
61 int i;
62 for (i = 0; i < 4; i++)
63 if (inst->DstReg.WriteMask & (1<<i))
64 break;
65 return i;
66 }
67
68 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
69 {
70 struct brw_reg reg;
71 if(c->tmp_index == c->tmp_max)
72 c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
73
74 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
75 return reg;
76 }
77
78 /**
79 * Save current temp register info.
80 * There must be a matching call to release_tmps().
81 */
82 static int mark_tmps(struct brw_wm_compile *c)
83 {
84 return c->tmp_index;
85 }
86
87 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
88 {
89 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
90 }
91
92 static void release_tmps(struct brw_wm_compile *c, int mark)
93 {
94 c->tmp_index = mark;
95 }
96
97 /**
98 * Convert Mesa src register to brw register.
99 *
100 * Since we're running in SOA mode each Mesa register corresponds to four
101 * hardware registers. We allocate the hardware registers as needed here.
102 *
103 * \param file register file, one of PROGRAM_x
104 * \param index register number
105 * \param component src component (X=0, Y=1, Z=2, W=3)
106 * \param nr not used?!?
107 * \param neg negate value?
108 * \param abs take absolute value?
109 */
110 static struct brw_reg
111 get_reg(struct brw_wm_compile *c, int file, int index, int component,
112 int nr, GLuint neg, GLuint abs)
113 {
114 struct brw_reg reg;
115 switch (file) {
116 case PROGRAM_STATE_VAR:
117 case PROGRAM_CONSTANT:
118 case PROGRAM_UNIFORM:
119 file = PROGRAM_STATE_VAR;
120 break;
121 case PROGRAM_UNDEFINED:
122 return brw_null_reg();
123 case PROGRAM_TEMPORARY:
124 case PROGRAM_INPUT:
125 case PROGRAM_OUTPUT:
126 case PROGRAM_PAYLOAD:
127 break;
128 default:
129 _mesa_problem(NULL, "Unexpected file in get_reg()");
130 return brw_null_reg();
131 }
132
133 /* see if we've already allocated a HW register for this Mesa register */
134 if (c->wm_regs[file][index][component].inited) {
135 /* yes, re-use */
136 reg = c->wm_regs[file][index][component].reg;
137 }
138 else {
139 /* no, allocate new register */
140 reg = brw_vec8_grf(c->reg_index, 0);
141 }
142
143 /* if this is a new register allocation, record it in the table */
144 if (!c->wm_regs[file][index][component].inited) {
145 set_reg(c, file, index, component, reg);
146 c->reg_index++;
147 }
148
149 if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
150 /* ran out of temporary registers! */
151 #if 1
152 /* This is a big hack for now.
153 * Return bad register index, just don't hang the GPU.
154 */
155 _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
156 c->reg_index = BRW_WM_MAX_GRF - 13;
157 #else
158 return brw_null_reg();
159 #endif
160 }
161
162 if (neg & (1 << component)) {
163 reg = negate(reg);
164 }
165 if (abs)
166 reg = brw_abs(reg);
167 return reg;
168 }
169
170
171 /**
172 * Preallocate registers. This sets up the Mesa to hardware register
173 * mapping for certain registers, such as constants (uniforms/state vars)
174 * and shader inputs.
175 */
176 static void prealloc_reg(struct brw_wm_compile *c)
177 {
178 int i, j;
179 struct brw_reg reg;
180 int nr_interp_regs = 0;
181 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
182
183 for (i = 0; i < 4; i++) {
184 if (i < c->key.nr_depth_regs)
185 reg = brw_vec8_grf(i * 2, 0);
186 else
187 reg = brw_vec8_grf(0, 0);
188 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
189 }
190 c->reg_index += 2 * c->key.nr_depth_regs;
191
192 /* constants */
193 {
194 const int nr_params = c->fp->program.Base.Parameters->NumParameters;
195
196 if (1 /* XXX threshold: nr_params <= 8 */) {
197 const struct gl_program_parameter_list *plist =
198 c->fp->program.Base.Parameters;
199 int index = 0;
200
201 /* number of float constants in CURBE */
202 c->prog_data.nr_params = 4 * nr_params;
203
204 /* loop over program constants (float[4]) */
205 for (i = 0; i < nr_params; i++) {
206 /* loop over XYZW channels */
207 for (j = 0; j < 4; j++, index++) {
208 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
209 /* Save pointer to parameter/constant value.
210 * Constants will be copied in prepare_constant_buffer()
211 */
212 c->prog_data.param[index] = &plist->ParameterValues[i][j];
213 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
214 }
215 }
216 /* number of constant regs used (each reg is float[8]) */
217 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
218 c->reg_index += c->nr_creg;
219 }
220 else {
221 /* number of float constants in CURBE */
222 c->prog_data.nr_params = 0;
223
224 /* When there's a lot of FP constanst we'll store them in a
225 * texture-like buffer instead of using the CURBE buffer.
226 * This means we won't use GRF registers for constants and we'll
227 * have to fetch constants with a dataport read.
228 */
229 }
230 }
231
232 /* fragment shader inputs */
233 for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
234 if (inputs & (1<<i)) {
235 nr_interp_regs++;
236 reg = brw_vec8_grf(c->reg_index, 0);
237 for (j = 0; j < 4; j++)
238 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
239 c->reg_index += 2;
240 }
241 }
242
243 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
244 c->prog_data.urb_read_length = nr_interp_regs * 2;
245 c->prog_data.curb_read_length = c->nr_creg;
246 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
247 c->reg_index++;
248 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
249 c->reg_index += 2;
250 }
251
252
253 /**
254 * Convert Mesa dst register to brw register.
255 */
256 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
257 const struct prog_instruction *inst,
258 GLuint component)
259 {
260 const int nr = 1;
261 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
262 0, 0);
263 }
264
265
266 /**
267 * Convert Mesa src register to brw register.
268 */
269 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
270 const struct prog_instruction *inst,
271 GLuint srcRegIndex, GLuint channel)
272 {
273 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
274 const GLuint nr = 1;
275 const GLuint component = GET_SWZ(src->Swizzle, channel);
276
277 return get_reg(c, src->File, src->Index, component, nr,
278 src->NegateBase, src->Abs);
279 }
280
281
282 /**
283 * Same as \sa get_src_reg() but if the register is a literal, emit
284 * a brw_reg encoding the literal.
285 * Note that a brw instruction only allows one src operand to be a literal.
286 * For instructions with more than one operand, only the second can be a literal.
287 */
288 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
289 const struct prog_instruction *inst,
290 GLuint srcRegIndex, GLuint channel)
291 {
292 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
293 if (src->File == PROGRAM_CONSTANT) {
294 /* a literal */
295 const int component = GET_SWZ(src->Swizzle, channel);
296 const GLfloat *param =
297 c->fp->program.Base.Parameters->ParameterValues[src->Index];
298 GLfloat value = param[component];
299 if (src->NegateBase)
300 value = -value;
301 if (src->Abs)
302 value = FABSF(value);
303 return brw_imm_f(value);
304 }
305 else {
306 return get_src_reg(c, inst, srcRegIndex, channel);
307 }
308 }
309
310
311 /**
312 * Subroutines are minimal support for resusable instruction sequences.
313 * They are implemented as simply as possible to minimise overhead: there
314 * is no explicit support for communication between the caller and callee
315 * other than saving the return address in a temporary register, nor is
316 * there any automatic local storage. This implies that great care is
317 * required before attempting reentrancy or any kind of nested
318 * subroutine invocations.
319 */
320 static void invoke_subroutine( struct brw_wm_compile *c,
321 enum _subroutine subroutine,
322 void (*emit)( struct brw_wm_compile * ) )
323 {
324 struct brw_compile *p = &c->func;
325
326 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
327
328 if( c->subroutines[ subroutine ] ) {
329 /* subroutine previously emitted: reuse existing instructions */
330
331 int mark = mark_tmps( c );
332 struct brw_reg return_address = retype( alloc_tmp( c ),
333 BRW_REGISTER_TYPE_UD );
334 int here = p->nr_insn;
335
336 brw_push_insn_state(p);
337 brw_set_mask_control(p, BRW_MASK_DISABLE);
338 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
339
340 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
341 brw_imm_d( ( c->subroutines[ subroutine ] -
342 here - 1 ) << 4 ) );
343 brw_pop_insn_state(p);
344
345 release_tmps( c, mark );
346 } else {
347 /* previously unused subroutine: emit, and mark for later reuse */
348
349 int mark = mark_tmps( c );
350 struct brw_reg return_address = retype( alloc_tmp( c ),
351 BRW_REGISTER_TYPE_UD );
352 struct brw_instruction *calc;
353 int base = p->nr_insn;
354
355 brw_push_insn_state(p);
356 brw_set_mask_control(p, BRW_MASK_DISABLE);
357 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
358 brw_pop_insn_state(p);
359
360 c->subroutines[ subroutine ] = p->nr_insn;
361
362 emit( c );
363
364 brw_push_insn_state(p);
365 brw_set_mask_control(p, BRW_MASK_DISABLE);
366 brw_MOV( p, brw_ip_reg(), return_address );
367 brw_pop_insn_state(p);
368
369 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
370
371 release_tmps( c, mark );
372 }
373 }
374
375 static void emit_abs( struct brw_wm_compile *c,
376 struct prog_instruction *inst)
377 {
378 int i;
379 struct brw_compile *p = &c->func;
380 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
381 for (i = 0; i < 4; i++) {
382 if (inst->DstReg.WriteMask & (1<<i)) {
383 struct brw_reg src, dst;
384 dst = get_dst_reg(c, inst, i);
385 src = get_src_reg(c, inst, 0, i);
386 brw_MOV(p, dst, brw_abs(src));
387 }
388 }
389 brw_set_saturate(p, 0);
390 }
391
392 static void emit_trunc( struct brw_wm_compile *c,
393 struct prog_instruction *inst)
394 {
395 int i;
396 struct brw_compile *p = &c->func;
397 GLuint mask = inst->DstReg.WriteMask;
398 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
399 for (i = 0; i < 4; i++) {
400 if (mask & (1<<i)) {
401 struct brw_reg src, dst;
402 dst = get_dst_reg(c, inst, i);
403 src = get_src_reg(c, inst, 0, i);
404 brw_RNDZ(p, dst, src);
405 }
406 }
407 brw_set_saturate(p, 0);
408 }
409
410 static void emit_mov( struct brw_wm_compile *c,
411 struct prog_instruction *inst)
412 {
413 int i;
414 struct brw_compile *p = &c->func;
415 GLuint mask = inst->DstReg.WriteMask;
416 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
417 for (i = 0; i < 4; i++) {
418 if (mask & (1<<i)) {
419 struct brw_reg src, dst;
420 dst = get_dst_reg(c, inst, i);
421 src = get_src_reg_imm(c, inst, 0, i);
422 brw_MOV(p, dst, src);
423 }
424 }
425 brw_set_saturate(p, 0);
426 }
427
428 static void emit_pixel_xy(struct brw_wm_compile *c,
429 struct prog_instruction *inst)
430 {
431 struct brw_reg r1 = brw_vec1_grf(1, 0);
432 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
433
434 struct brw_reg dst0, dst1;
435 struct brw_compile *p = &c->func;
436 GLuint mask = inst->DstReg.WriteMask;
437
438 dst0 = get_dst_reg(c, inst, 0);
439 dst1 = get_dst_reg(c, inst, 1);
440 /* Calculate pixel centers by adding 1 or 0 to each of the
441 * micro-tile coordinates passed in r1.
442 */
443 if (mask & WRITEMASK_X) {
444 brw_ADD(p,
445 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
446 stride(suboffset(r1_uw, 4), 2, 4, 0),
447 brw_imm_v(0x10101010));
448 }
449
450 if (mask & WRITEMASK_Y) {
451 brw_ADD(p,
452 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
453 stride(suboffset(r1_uw, 5), 2, 4, 0),
454 brw_imm_v(0x11001100));
455 }
456 }
457
458 static void emit_delta_xy(struct brw_wm_compile *c,
459 struct prog_instruction *inst)
460 {
461 struct brw_reg r1 = brw_vec1_grf(1, 0);
462 struct brw_reg dst0, dst1, src0, src1;
463 struct brw_compile *p = &c->func;
464 GLuint mask = inst->DstReg.WriteMask;
465
466 dst0 = get_dst_reg(c, inst, 0);
467 dst1 = get_dst_reg(c, inst, 1);
468 src0 = get_src_reg(c, inst, 0, 0);
469 src1 = get_src_reg(c, inst, 0, 1);
470 /* Calc delta X,Y by subtracting origin in r1 from the pixel
471 * centers.
472 */
473 if (mask & WRITEMASK_X) {
474 brw_ADD(p,
475 dst0,
476 retype(src0, BRW_REGISTER_TYPE_UW),
477 negate(r1));
478 }
479
480 if (mask & WRITEMASK_Y) {
481 brw_ADD(p,
482 dst1,
483 retype(src1, BRW_REGISTER_TYPE_UW),
484 negate(suboffset(r1,1)));
485
486 }
487 }
488
489 static void fire_fb_write( struct brw_wm_compile *c,
490 GLuint base_reg,
491 GLuint nr,
492 GLuint target,
493 GLuint eot)
494 {
495 struct brw_compile *p = &c->func;
496 /* Pass through control information:
497 */
498 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
499 {
500 brw_push_insn_state(p);
501 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
502 brw_MOV(p,
503 brw_message_reg(base_reg + 1),
504 brw_vec8_grf(1, 0));
505 brw_pop_insn_state(p);
506 }
507 /* Send framebuffer write message: */
508 brw_fb_WRITE(p,
509 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
510 base_reg,
511 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
512 target,
513 nr,
514 0,
515 eot);
516 }
517
518 static void emit_fb_write(struct brw_wm_compile *c,
519 struct prog_instruction *inst)
520 {
521 struct brw_compile *p = &c->func;
522 int nr = 2;
523 int channel;
524 GLuint target, eot;
525 struct brw_reg src0;
526
527 /* Reserve a space for AA - may not be needed:
528 */
529 if (c->key.aa_dest_stencil_reg)
530 nr += 1;
531
532 brw_push_insn_state(p);
533 for (channel = 0; channel < 4; channel++) {
534 src0 = get_src_reg(c, inst, 0, channel);
535 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
536 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
537 brw_MOV(p, brw_message_reg(nr + channel), src0);
538 }
539 /* skip over the regs populated above: */
540 nr += 8;
541 brw_pop_insn_state(p);
542
543 if (c->key.source_depth_to_render_target) {
544 if (c->key.computes_depth) {
545 src0 = get_src_reg(c, inst, 2, 2);
546 brw_MOV(p, brw_message_reg(nr), src0);
547 }
548 else {
549 src0 = get_src_reg(c, inst, 1, 1);
550 brw_MOV(p, brw_message_reg(nr), src0);
551 }
552
553 nr += 2;
554 }
555
556 if (c->key.dest_depth_reg) {
557 GLuint comp = c->key.dest_depth_reg / 2;
558 GLuint off = c->key.dest_depth_reg % 2;
559
560 assert(comp == 1);
561 assert(off == 0);
562 #if 0
563 /* XXX do we need this code? comp always 1, off always 0, it seems */
564 if (off != 0) {
565 brw_push_insn_state(p);
566 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
567
568 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
569 /* 2nd half? */
570 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
571 brw_pop_insn_state(p);
572 }
573 else
574 #endif
575 {
576 struct brw_reg src = get_src_reg(c, inst, 1, 1);
577 brw_MOV(p, brw_message_reg(nr), src);
578 }
579 nr += 2;
580 }
581
582 target = inst->Aux >> 1;
583 eot = inst->Aux & 1;
584 fire_fb_write(c, 0, nr, target, eot);
585 }
586
587 static void emit_pixel_w( struct brw_wm_compile *c,
588 struct prog_instruction *inst)
589 {
590 struct brw_compile *p = &c->func;
591 GLuint mask = inst->DstReg.WriteMask;
592 if (mask & WRITEMASK_W) {
593 struct brw_reg dst, src0, delta0, delta1;
594 struct brw_reg interp3;
595
596 dst = get_dst_reg(c, inst, 3);
597 src0 = get_src_reg(c, inst, 0, 0);
598 delta0 = get_src_reg(c, inst, 1, 0);
599 delta1 = get_src_reg(c, inst, 1, 1);
600
601 interp3 = brw_vec1_grf(src0.nr+1, 4);
602 /* Calc 1/w - just linterp wpos[3] optimized by putting the
603 * result straight into a message reg.
604 */
605 brw_LINE(p, brw_null_reg(), interp3, delta0);
606 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
607
608 /* Calc w */
609 brw_math_16( p, dst,
610 BRW_MATH_FUNCTION_INV,
611 BRW_MATH_SATURATE_NONE,
612 2, brw_null_reg(),
613 BRW_MATH_PRECISION_FULL);
614 }
615 }
616
617 static void emit_linterp(struct brw_wm_compile *c,
618 struct prog_instruction *inst)
619 {
620 struct brw_compile *p = &c->func;
621 GLuint mask = inst->DstReg.WriteMask;
622 struct brw_reg interp[4];
623 struct brw_reg dst, delta0, delta1;
624 struct brw_reg src0;
625 GLuint nr, i;
626
627 src0 = get_src_reg(c, inst, 0, 0);
628 delta0 = get_src_reg(c, inst, 1, 0);
629 delta1 = get_src_reg(c, inst, 1, 1);
630 nr = src0.nr;
631
632 interp[0] = brw_vec1_grf(nr, 0);
633 interp[1] = brw_vec1_grf(nr, 4);
634 interp[2] = brw_vec1_grf(nr+1, 0);
635 interp[3] = brw_vec1_grf(nr+1, 4);
636
637 for(i = 0; i < 4; i++ ) {
638 if (mask & (1<<i)) {
639 dst = get_dst_reg(c, inst, i);
640 brw_LINE(p, brw_null_reg(), interp[i], delta0);
641 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
642 }
643 }
644 }
645
646 static void emit_cinterp(struct brw_wm_compile *c,
647 struct prog_instruction *inst)
648 {
649 struct brw_compile *p = &c->func;
650 GLuint mask = inst->DstReg.WriteMask;
651
652 struct brw_reg interp[4];
653 struct brw_reg dst, src0;
654 GLuint nr, i;
655
656 src0 = get_src_reg(c, inst, 0, 0);
657 nr = src0.nr;
658
659 interp[0] = brw_vec1_grf(nr, 0);
660 interp[1] = brw_vec1_grf(nr, 4);
661 interp[2] = brw_vec1_grf(nr+1, 0);
662 interp[3] = brw_vec1_grf(nr+1, 4);
663
664 for(i = 0; i < 4; i++ ) {
665 if (mask & (1<<i)) {
666 dst = get_dst_reg(c, inst, i);
667 brw_MOV(p, dst, suboffset(interp[i],3));
668 }
669 }
670 }
671
672 static void emit_pinterp(struct brw_wm_compile *c,
673 struct prog_instruction *inst)
674 {
675 struct brw_compile *p = &c->func;
676 GLuint mask = inst->DstReg.WriteMask;
677
678 struct brw_reg interp[4];
679 struct brw_reg dst, delta0, delta1;
680 struct brw_reg src0, w;
681 GLuint nr, i;
682
683 src0 = get_src_reg(c, inst, 0, 0);
684 delta0 = get_src_reg(c, inst, 1, 0);
685 delta1 = get_src_reg(c, inst, 1, 1);
686 w = get_src_reg(c, inst, 2, 3);
687 nr = src0.nr;
688
689 interp[0] = brw_vec1_grf(nr, 0);
690 interp[1] = brw_vec1_grf(nr, 4);
691 interp[2] = brw_vec1_grf(nr+1, 0);
692 interp[3] = brw_vec1_grf(nr+1, 4);
693
694 for(i = 0; i < 4; i++ ) {
695 if (mask & (1<<i)) {
696 dst = get_dst_reg(c, inst, i);
697 brw_LINE(p, brw_null_reg(), interp[i], delta0);
698 brw_MAC(p, dst, suboffset(interp[i],1),
699 delta1);
700 brw_MUL(p, dst, dst, w);
701 }
702 }
703 }
704
705 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
706 static void emit_frontfacing(struct brw_wm_compile *c,
707 struct prog_instruction *inst)
708 {
709 struct brw_compile *p = &c->func;
710 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
711 struct brw_reg dst;
712 GLuint mask = inst->DstReg.WriteMask;
713 int i;
714
715 for (i = 0; i < 4; i++) {
716 if (mask & (1<<i)) {
717 dst = get_dst_reg(c, inst, i);
718 brw_MOV(p, dst, brw_imm_f(0.0));
719 }
720 }
721
722 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
723 * us front face
724 */
725 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
726 for (i = 0; i < 4; i++) {
727 if (mask & (1<<i)) {
728 dst = get_dst_reg(c, inst, i);
729 brw_MOV(p, dst, brw_imm_f(1.0));
730 }
731 }
732 brw_set_predicate_control_flag_value(p, 0xff);
733 }
734
735 static void emit_xpd(struct brw_wm_compile *c,
736 struct prog_instruction *inst)
737 {
738 int i;
739 struct brw_compile *p = &c->func;
740 GLuint mask = inst->DstReg.WriteMask;
741 for (i = 0; i < 4; i++) {
742 GLuint i2 = (i+2)%3;
743 GLuint i1 = (i+1)%3;
744 if (mask & (1<<i)) {
745 struct brw_reg src0, src1, dst;
746 dst = get_dst_reg(c, inst, i);
747 src0 = negate(get_src_reg(c, inst, 0, i2));
748 src1 = get_src_reg_imm(c, inst, 1, i1);
749 brw_MUL(p, brw_null_reg(), src0, src1);
750 src0 = get_src_reg(c, inst, 0, i1);
751 src1 = get_src_reg_imm(c, inst, 1, i2);
752 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
753 brw_MAC(p, dst, src0, src1);
754 brw_set_saturate(p, 0);
755 }
756 }
757 brw_set_saturate(p, 0);
758 }
759
760 static void emit_dp3(struct brw_wm_compile *c,
761 struct prog_instruction *inst)
762 {
763 struct brw_reg src0[3], src1[3], dst;
764 int i;
765 struct brw_compile *p = &c->func;
766 for (i = 0; i < 3; i++) {
767 src0[i] = get_src_reg(c, inst, 0, i);
768 src1[i] = get_src_reg_imm(c, inst, 1, i);
769 }
770
771 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
772 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
773 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
774 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
775 brw_MAC(p, dst, src0[2], src1[2]);
776 brw_set_saturate(p, 0);
777 }
778
779 static void emit_dp4(struct brw_wm_compile *c,
780 struct prog_instruction *inst)
781 {
782 struct brw_reg src0[4], src1[4], dst;
783 int i;
784 struct brw_compile *p = &c->func;
785 for (i = 0; i < 4; i++) {
786 src0[i] = get_src_reg(c, inst, 0, i);
787 src1[i] = get_src_reg_imm(c, inst, 1, i);
788 }
789 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
790 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
791 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
792 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
793 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
794 brw_MAC(p, dst, src0[3], src1[3]);
795 brw_set_saturate(p, 0);
796 }
797
798 static void emit_dph(struct brw_wm_compile *c,
799 struct prog_instruction *inst)
800 {
801 struct brw_reg src0[4], src1[4], dst;
802 int i;
803 struct brw_compile *p = &c->func;
804 for (i = 0; i < 4; i++) {
805 src0[i] = get_src_reg(c, inst, 0, i);
806 src1[i] = get_src_reg_imm(c, inst, 1, i);
807 }
808 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
809 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
810 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
811 brw_MAC(p, dst, src0[2], src1[2]);
812 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
813 brw_ADD(p, dst, dst, src1[3]);
814 brw_set_saturate(p, 0);
815 }
816
817 /**
818 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
819 * Note that the result of the function is smeared across the dest
820 * register's X, Y, Z and W channels (subject to writemasking of course).
821 */
822 static void emit_math1(struct brw_wm_compile *c,
823 struct prog_instruction *inst, GLuint func)
824 {
825 struct brw_compile *p = &c->func;
826 struct brw_reg src0, dst, tmp;
827 const int mark = mark_tmps( c );
828 int i;
829
830 tmp = alloc_tmp(c);
831
832 /* Get first component of source register */
833 src0 = get_src_reg(c, inst, 0, 0);
834
835 /* tmp = func(src0) */
836 brw_MOV(p, brw_message_reg(2), src0);
837 brw_math(p,
838 tmp,
839 func,
840 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
841 2,
842 brw_null_reg(),
843 BRW_MATH_DATA_VECTOR,
844 BRW_MATH_PRECISION_FULL);
845
846 /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
847
848 /* replicate tmp value across enabled dest channels */
849 for (i = 0; i < 4; i++) {
850 if (inst->DstReg.WriteMask & (1 << i)) {
851 dst = get_dst_reg(c, inst, i);
852 brw_MOV(p, dst, tmp);
853 }
854 }
855
856 release_tmps(c, mark);
857 }
858
859 static void emit_rcp(struct brw_wm_compile *c,
860 struct prog_instruction *inst)
861 {
862 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
863 }
864
865 static void emit_rsq(struct brw_wm_compile *c,
866 struct prog_instruction *inst)
867 {
868 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
869 }
870
871 static void emit_sin(struct brw_wm_compile *c,
872 struct prog_instruction *inst)
873 {
874 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
875 }
876
877 static void emit_cos(struct brw_wm_compile *c,
878 struct prog_instruction *inst)
879 {
880 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
881 }
882
883 static void emit_ex2(struct brw_wm_compile *c,
884 struct prog_instruction *inst)
885 {
886 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
887 }
888
889 static void emit_lg2(struct brw_wm_compile *c,
890 struct prog_instruction *inst)
891 {
892 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
893 }
894
895 static void emit_add(struct brw_wm_compile *c,
896 struct prog_instruction *inst)
897 {
898 struct brw_compile *p = &c->func;
899 struct brw_reg src0, src1, dst;
900 GLuint mask = inst->DstReg.WriteMask;
901 int i;
902 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
903 for (i = 0 ; i < 4; i++) {
904 if (mask & (1<<i)) {
905 dst = get_dst_reg(c, inst, i);
906 src0 = get_src_reg(c, inst, 0, i);
907 src1 = get_src_reg_imm(c, inst, 1, i);
908 brw_ADD(p, dst, src0, src1);
909 }
910 }
911 brw_set_saturate(p, 0);
912 }
913
914 static void emit_arl(struct brw_wm_compile *c,
915 struct prog_instruction *inst)
916 {
917 struct brw_compile *p = &c->func;
918 struct brw_reg src0, addr_reg;
919 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
920 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
921 BRW_ARF_ADDRESS, 0);
922 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
923 brw_MOV(p, addr_reg, src0);
924 brw_set_saturate(p, 0);
925 }
926
927 static void emit_sub(struct brw_wm_compile *c,
928 struct prog_instruction *inst)
929 {
930 struct brw_compile *p = &c->func;
931 struct brw_reg src0, src1, dst;
932 GLuint mask = inst->DstReg.WriteMask;
933 int i;
934 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
935 for (i = 0 ; i < 4; i++) {
936 if (mask & (1<<i)) {
937 dst = get_dst_reg(c, inst, i);
938 src0 = get_src_reg(c, inst, 0, i);
939 src1 = get_src_reg_imm(c, inst, 1, i);
940 brw_ADD(p, dst, src0, negate(src1));
941 }
942 }
943 brw_set_saturate(p, 0);
944 }
945
946 static void emit_mul(struct brw_wm_compile *c,
947 struct prog_instruction *inst)
948 {
949 struct brw_compile *p = &c->func;
950 struct brw_reg src0, src1, dst;
951 GLuint mask = inst->DstReg.WriteMask;
952 int i;
953 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
954 for (i = 0 ; i < 4; i++) {
955 if (mask & (1<<i)) {
956 dst = get_dst_reg(c, inst, i);
957 src0 = get_src_reg(c, inst, 0, i);
958 src1 = get_src_reg_imm(c, inst, 1, i);
959 brw_MUL(p, dst, src0, src1);
960 }
961 }
962 brw_set_saturate(p, 0);
963 }
964
965 static void emit_frc(struct brw_wm_compile *c,
966 struct prog_instruction *inst)
967 {
968 struct brw_compile *p = &c->func;
969 struct brw_reg src0, dst;
970 GLuint mask = inst->DstReg.WriteMask;
971 int i;
972 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
973 for (i = 0 ; i < 4; i++) {
974 if (mask & (1<<i)) {
975 dst = get_dst_reg(c, inst, i);
976 src0 = get_src_reg_imm(c, inst, 0, i);
977 brw_FRC(p, dst, src0);
978 }
979 }
980 if (inst->SaturateMode != SATURATE_OFF)
981 brw_set_saturate(p, 0);
982 }
983
984 static void emit_flr(struct brw_wm_compile *c,
985 struct prog_instruction *inst)
986 {
987 struct brw_compile *p = &c->func;
988 struct brw_reg src0, dst;
989 GLuint mask = inst->DstReg.WriteMask;
990 int i;
991 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
992 for (i = 0 ; i < 4; i++) {
993 if (mask & (1<<i)) {
994 dst = get_dst_reg(c, inst, i);
995 src0 = get_src_reg_imm(c, inst, 0, i);
996 brw_RNDD(p, dst, src0);
997 }
998 }
999 brw_set_saturate(p, 0);
1000 }
1001
1002 static void emit_max(struct brw_wm_compile *c,
1003 struct prog_instruction *inst)
1004 {
1005 struct brw_compile *p = &c->func;
1006 GLuint mask = inst->DstReg.WriteMask;
1007 struct brw_reg src0, src1, dst;
1008 int i;
1009 brw_push_insn_state(p);
1010 for (i = 0; i < 4; i++) {
1011 if (mask & (1<<i)) {
1012 dst = get_dst_reg(c, inst, i);
1013 src0 = get_src_reg(c, inst, 0, i);
1014 src1 = get_src_reg_imm(c, inst, 1, i);
1015 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1016 brw_MOV(p, dst, src0);
1017 brw_set_saturate(p, 0);
1018
1019 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
1020 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1021 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1022 brw_MOV(p, dst, src1);
1023 brw_set_saturate(p, 0);
1024 brw_set_predicate_control_flag_value(p, 0xff);
1025 }
1026 }
1027 brw_pop_insn_state(p);
1028 }
1029
1030 static void emit_min(struct brw_wm_compile *c,
1031 struct prog_instruction *inst)
1032 {
1033 struct brw_compile *p = &c->func;
1034 GLuint mask = inst->DstReg.WriteMask;
1035 struct brw_reg src0, src1, dst;
1036 int i;
1037 brw_push_insn_state(p);
1038 for (i = 0; i < 4; i++) {
1039 if (mask & (1<<i)) {
1040 dst = get_dst_reg(c, inst, i);
1041 src0 = get_src_reg_imm(c, inst, 0, i);
1042 src1 = get_src_reg(c, inst, 1, i);
1043 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1044 brw_MOV(p, dst, src0);
1045 brw_set_saturate(p, 0);
1046
1047 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1048 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1049 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1050 brw_MOV(p, dst, src1);
1051 brw_set_saturate(p, 0);
1052 brw_set_predicate_control_flag_value(p, 0xff);
1053 }
1054 }
1055 brw_pop_insn_state(p);
1056 }
1057
1058 static void emit_pow(struct brw_wm_compile *c,
1059 struct prog_instruction *inst)
1060 {
1061 struct brw_compile *p = &c->func;
1062 struct brw_reg dst, src0, src1;
1063 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1064 src0 = get_src_reg_imm(c, inst, 0, 0);
1065 src1 = get_src_reg_imm(c, inst, 1, 0);
1066
1067 brw_MOV(p, brw_message_reg(2), src0);
1068 brw_MOV(p, brw_message_reg(3), src1);
1069
1070 brw_math(p,
1071 dst,
1072 BRW_MATH_FUNCTION_POW,
1073 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1074 2,
1075 brw_null_reg(),
1076 BRW_MATH_DATA_VECTOR,
1077 BRW_MATH_PRECISION_FULL);
1078 }
1079
1080 static void emit_lrp(struct brw_wm_compile *c,
1081 struct prog_instruction *inst)
1082 {
1083 struct brw_compile *p = &c->func;
1084 GLuint mask = inst->DstReg.WriteMask;
1085 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1086 int i;
1087 int mark = mark_tmps(c);
1088 for (i = 0; i < 4; i++) {
1089 if (mask & (1<<i)) {
1090 dst = get_dst_reg(c, inst, i);
1091 src0 = get_src_reg(c, inst, 0, i);
1092
1093 src1 = get_src_reg_imm(c, inst, 1, i);
1094
1095 if (src1.nr == dst.nr) {
1096 tmp1 = alloc_tmp(c);
1097 brw_MOV(p, tmp1, src1);
1098 } else
1099 tmp1 = src1;
1100
1101 src2 = get_src_reg(c, inst, 2, i);
1102 if (src2.nr == dst.nr) {
1103 tmp2 = alloc_tmp(c);
1104 brw_MOV(p, tmp2, src2);
1105 } else
1106 tmp2 = src2;
1107
1108 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1109 brw_MUL(p, brw_null_reg(), dst, tmp2);
1110 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1111 brw_MAC(p, dst, src0, tmp1);
1112 brw_set_saturate(p, 0);
1113 }
1114 release_tmps(c, mark);
1115 }
1116 }
1117
1118 /**
1119 * For GLSL shaders, this KIL will be unconditional.
1120 * It may be contained inside an IF/ENDIF structure of course.
1121 */
1122 static void emit_kil(struct brw_wm_compile *c)
1123 {
1124 struct brw_compile *p = &c->func;
1125 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1126 brw_push_insn_state(p);
1127 brw_set_mask_control(p, BRW_MASK_DISABLE);
1128 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1129 brw_AND(p, depth, c->emit_mask_reg, depth);
1130 brw_pop_insn_state(p);
1131 }
1132
1133 static void emit_mad(struct brw_wm_compile *c,
1134 struct prog_instruction *inst)
1135 {
1136 struct brw_compile *p = &c->func;
1137 GLuint mask = inst->DstReg.WriteMask;
1138 struct brw_reg dst, src0, src1, src2;
1139 int i;
1140
1141 for (i = 0; i < 4; i++) {
1142 if (mask & (1<<i)) {
1143 dst = get_dst_reg(c, inst, i);
1144 src0 = get_src_reg(c, inst, 0, i);
1145 src1 = get_src_reg_imm(c, inst, 1, i);
1146 src2 = get_src_reg_imm(c, inst, 2, i);
1147 brw_MUL(p, dst, src0, src1);
1148
1149 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1150 brw_ADD(p, dst, dst, src2);
1151 brw_set_saturate(p, 0);
1152 }
1153 }
1154 }
1155
1156 static void emit_sop(struct brw_wm_compile *c,
1157 struct prog_instruction *inst, GLuint cond)
1158 {
1159 struct brw_compile *p = &c->func;
1160 GLuint mask = inst->DstReg.WriteMask;
1161 struct brw_reg dst, src0, src1;
1162 int i;
1163
1164 for (i = 0; i < 4; i++) {
1165 if (mask & (1<<i)) {
1166 dst = get_dst_reg(c, inst, i);
1167 src0 = get_src_reg(c, inst, 0, i);
1168 src1 = get_src_reg_imm(c, inst, 1, i);
1169 brw_push_insn_state(p);
1170 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1171 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1172 brw_MOV(p, dst, brw_imm_f(0.0));
1173 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1174 brw_MOV(p, dst, brw_imm_f(1.0));
1175 brw_pop_insn_state(p);
1176 }
1177 }
1178 }
1179
1180 static void emit_slt(struct brw_wm_compile *c,
1181 struct prog_instruction *inst)
1182 {
1183 emit_sop(c, inst, BRW_CONDITIONAL_L);
1184 }
1185
1186 static void emit_sle(struct brw_wm_compile *c,
1187 struct prog_instruction *inst)
1188 {
1189 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1190 }
1191
1192 static void emit_sgt(struct brw_wm_compile *c,
1193 struct prog_instruction *inst)
1194 {
1195 emit_sop(c, inst, BRW_CONDITIONAL_G);
1196 }
1197
1198 static void emit_sge(struct brw_wm_compile *c,
1199 struct prog_instruction *inst)
1200 {
1201 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1202 }
1203
1204 static void emit_seq(struct brw_wm_compile *c,
1205 struct prog_instruction *inst)
1206 {
1207 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1208 }
1209
1210 static void emit_sne(struct brw_wm_compile *c,
1211 struct prog_instruction *inst)
1212 {
1213 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1214 }
1215
1216 static void emit_ddx(struct brw_wm_compile *c,
1217 struct prog_instruction *inst)
1218 {
1219 struct brw_compile *p = &c->func;
1220 GLuint mask = inst->DstReg.WriteMask;
1221 struct brw_reg interp[4];
1222 struct brw_reg dst;
1223 struct brw_reg src0, w;
1224 GLuint nr, i;
1225 src0 = get_src_reg(c, inst, 0, 0);
1226 w = get_src_reg(c, inst, 1, 3);
1227 nr = src0.nr;
1228 interp[0] = brw_vec1_grf(nr, 0);
1229 interp[1] = brw_vec1_grf(nr, 4);
1230 interp[2] = brw_vec1_grf(nr+1, 0);
1231 interp[3] = brw_vec1_grf(nr+1, 4);
1232 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1233 for(i = 0; i < 4; i++ ) {
1234 if (mask & (1<<i)) {
1235 dst = get_dst_reg(c, inst, i);
1236 brw_MOV(p, dst, interp[i]);
1237 brw_MUL(p, dst, dst, w);
1238 }
1239 }
1240 brw_set_saturate(p, 0);
1241 }
1242
1243 static void emit_ddy(struct brw_wm_compile *c,
1244 struct prog_instruction *inst)
1245 {
1246 struct brw_compile *p = &c->func;
1247 GLuint mask = inst->DstReg.WriteMask;
1248 struct brw_reg interp[4];
1249 struct brw_reg dst;
1250 struct brw_reg src0, w;
1251 GLuint nr, i;
1252
1253 src0 = get_src_reg(c, inst, 0, 0);
1254 nr = src0.nr;
1255 w = get_src_reg(c, inst, 1, 3);
1256 interp[0] = brw_vec1_grf(nr, 0);
1257 interp[1] = brw_vec1_grf(nr, 4);
1258 interp[2] = brw_vec1_grf(nr+1, 0);
1259 interp[3] = brw_vec1_grf(nr+1, 4);
1260 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1261 for(i = 0; i < 4; i++ ) {
1262 if (mask & (1<<i)) {
1263 dst = get_dst_reg(c, inst, i);
1264 brw_MOV(p, dst, suboffset(interp[i], 1));
1265 brw_MUL(p, dst, dst, w);
1266 }
1267 }
1268 brw_set_saturate(p, 0);
1269 }
1270
1271 static INLINE struct brw_reg high_words( struct brw_reg reg )
1272 {
1273 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1274 0, 8, 2 );
1275 }
1276
1277 static INLINE struct brw_reg low_words( struct brw_reg reg )
1278 {
1279 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1280 }
1281
1282 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1283 {
1284 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1285 }
1286
1287 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1288 {
1289 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1290 0, 16, 2 );
1291 }
1292
1293 /* One-, two- and three-dimensional Perlin noise, similar to the description
1294 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1295 static void noise1_sub( struct brw_wm_compile *c ) {
1296
1297 struct brw_compile *p = &c->func;
1298 struct brw_reg param,
1299 x0, x1, /* gradients at each end */
1300 t, tmp[ 2 ], /* float temporaries */
1301 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1302 int i;
1303 int mark = mark_tmps( c );
1304
1305 x0 = alloc_tmp( c );
1306 x1 = alloc_tmp( c );
1307 t = alloc_tmp( c );
1308 tmp[ 0 ] = alloc_tmp( c );
1309 tmp[ 1 ] = alloc_tmp( c );
1310 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1311 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1312 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1313 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1314 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1315
1316 param = lookup_tmp( c, mark - 2 );
1317
1318 brw_set_access_mode( p, BRW_ALIGN_1 );
1319
1320 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1321
1322 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1323 be hashed. Also compute the remainder (offset within the unit
1324 length), interleaved to reduce register dependency penalties. */
1325 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1326 brw_FRC( p, param, param );
1327 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1328 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1329 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1330
1331 /* We're now ready to perform the hashing. The two hashes are
1332 interleaved for performance. The hash function used is
1333 designed to rapidly achieve avalanche and require only 32x16
1334 bit multiplication, and 16-bit swizzles (which we get for
1335 free). We can't use immediate operands in the multiplies,
1336 because immediates are permitted only in src1 and the 16-bit
1337 factor is permitted only in src0. */
1338 for( i = 0; i < 2; i++ )
1339 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1340 for( i = 0; i < 2; i++ )
1341 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1342 high_words( itmp[ i ] ) );
1343 for( i = 0; i < 2; i++ )
1344 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1345 for( i = 0; i < 2; i++ )
1346 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1347 high_words( itmp[ i ] ) );
1348 for( i = 0; i < 2; i++ )
1349 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1350 for( i = 0; i < 2; i++ )
1351 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1352 high_words( itmp[ i ] ) );
1353
1354 /* Now we want to initialise the two gradients based on the
1355 hashes. Format conversion from signed integer to float leaves
1356 everything scaled too high by a factor of pow( 2, 31 ), but
1357 we correct for that right at the end. */
1358 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1359 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1360 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1361
1362 brw_MUL( p, x0, x0, param );
1363 brw_MUL( p, x1, x1, t );
1364
1365 /* We interpolate between the gradients using the polynomial
1366 6t^5 - 15t^4 + 10t^3 (Perlin). */
1367 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1368 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1369 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1370 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1371 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1372 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1373 pipeline */
1374 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1375 brw_MUL( p, param, tmp[ 0 ], param );
1376 brw_MUL( p, x1, x1, param );
1377 brw_ADD( p, x0, x0, x1 );
1378 /* scale by pow( 2, -30 ), to compensate for the format conversion
1379 above and an extra factor of 2 so that a single gradient covers
1380 the [-1,1] range */
1381 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1382
1383 release_tmps( c, mark );
1384 }
1385
1386 static void emit_noise1( struct brw_wm_compile *c,
1387 struct prog_instruction *inst )
1388 {
1389 struct brw_compile *p = &c->func;
1390 struct brw_reg src, param, dst;
1391 GLuint mask = inst->DstReg.WriteMask;
1392 int i;
1393 int mark = mark_tmps( c );
1394
1395 assert( mark == 0 );
1396
1397 src = get_src_reg( c, inst, 0, 0 );
1398
1399 param = alloc_tmp( c );
1400
1401 brw_MOV( p, param, src );
1402
1403 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1404
1405 /* Fill in the result: */
1406 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1407 for (i = 0 ; i < 4; i++) {
1408 if (mask & (1<<i)) {
1409 dst = get_dst_reg(c, inst, i);
1410 brw_MOV( p, dst, param );
1411 }
1412 }
1413 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1414 brw_set_saturate( p, 0 );
1415
1416 release_tmps( c, mark );
1417 }
1418
1419 static void noise2_sub( struct brw_wm_compile *c ) {
1420
1421 struct brw_compile *p = &c->func;
1422 struct brw_reg param0, param1,
1423 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1424 t, tmp[ 4 ], /* float temporaries */
1425 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1426 int i;
1427 int mark = mark_tmps( c );
1428
1429 x0y0 = alloc_tmp( c );
1430 x0y1 = alloc_tmp( c );
1431 x1y0 = alloc_tmp( c );
1432 x1y1 = alloc_tmp( c );
1433 t = alloc_tmp( c );
1434 for( i = 0; i < 4; i++ ) {
1435 tmp[ i ] = alloc_tmp( c );
1436 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1437 }
1438 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1439 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1440 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1441
1442 param0 = lookup_tmp( c, mark - 3 );
1443 param1 = lookup_tmp( c, mark - 2 );
1444
1445 brw_set_access_mode( p, BRW_ALIGN_1 );
1446
1447 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1448 be hashed. Also compute the remainders (offsets within the unit
1449 square), interleaved to reduce register dependency penalties. */
1450 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1451 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1452 brw_FRC( p, param0, param0 );
1453 brw_FRC( p, param1, param1 );
1454 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1455 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1456 low_words( itmp[ 1 ] ) );
1457 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1458 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1459 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1460 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1461 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1462
1463 /* We're now ready to perform the hashing. The four hashes are
1464 interleaved for performance. The hash function used is
1465 designed to rapidly achieve avalanche and require only 32x16
1466 bit multiplication, and 16-bit swizzles (which we get for
1467 free). We can't use immediate operands in the multiplies,
1468 because immediates are permitted only in src1 and the 16-bit
1469 factor is permitted only in src0. */
1470 for( i = 0; i < 4; i++ )
1471 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1472 for( i = 0; i < 4; i++ )
1473 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1474 high_words( itmp[ i ] ) );
1475 for( i = 0; i < 4; i++ )
1476 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1477 for( i = 0; i < 4; i++ )
1478 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1479 high_words( itmp[ i ] ) );
1480 for( i = 0; i < 4; i++ )
1481 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1482 for( i = 0; i < 4; i++ )
1483 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1484 high_words( itmp[ i ] ) );
1485
1486 /* Now we want to initialise the four gradients based on the
1487 hashes. Format conversion from signed integer to float leaves
1488 everything scaled too high by a factor of pow( 2, 15 ), but
1489 we correct for that right at the end. */
1490 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1491 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1492 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1493 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1494 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1495
1496 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1497 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1498 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1499 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1500
1501 brw_MUL( p, x1y0, x1y0, t );
1502 brw_MUL( p, x1y1, x1y1, t );
1503 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1504 brw_MUL( p, x0y0, x0y0, param0 );
1505 brw_MUL( p, x0y1, x0y1, param0 );
1506
1507 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1508 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1509 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1510 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1511
1512 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1513 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1514 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1515 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1516
1517 /* We interpolate between the gradients using the polynomial
1518 6t^5 - 15t^4 + 10t^3 (Perlin). */
1519 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1520 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1521 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1522 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1523 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1524 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1525 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1526 pipeline */
1527 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1528 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1529 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1530 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1531 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1532 pipeline */
1533 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1534 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1535 brw_MUL( p, param0, tmp[ 0 ], param0 );
1536 brw_MUL( p, param1, tmp[ 1 ], param1 );
1537
1538 /* Here we interpolate in the y dimension... */
1539 brw_MUL( p, x0y1, x0y1, param1 );
1540 brw_MUL( p, x1y1, x1y1, param1 );
1541 brw_ADD( p, x0y0, x0y0, x0y1 );
1542 brw_ADD( p, x1y0, x1y0, x1y1 );
1543
1544 /* And now in x. There are horrible register dependencies here,
1545 but we have nothing else to do. */
1546 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1547 brw_MUL( p, x1y0, x1y0, param0 );
1548 brw_ADD( p, x0y0, x0y0, x1y0 );
1549
1550 /* scale by pow( 2, -15 ), as described above */
1551 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1552
1553 release_tmps( c, mark );
1554 }
1555
1556 static void emit_noise2( struct brw_wm_compile *c,
1557 struct prog_instruction *inst )
1558 {
1559 struct brw_compile *p = &c->func;
1560 struct brw_reg src0, src1, param0, param1, dst;
1561 GLuint mask = inst->DstReg.WriteMask;
1562 int i;
1563 int mark = mark_tmps( c );
1564
1565 assert( mark == 0 );
1566
1567 src0 = get_src_reg( c, inst, 0, 0 );
1568 src1 = get_src_reg( c, inst, 0, 1 );
1569
1570 param0 = alloc_tmp( c );
1571 param1 = alloc_tmp( c );
1572
1573 brw_MOV( p, param0, src0 );
1574 brw_MOV( p, param1, src1 );
1575
1576 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1577
1578 /* Fill in the result: */
1579 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1580 for (i = 0 ; i < 4; i++) {
1581 if (mask & (1<<i)) {
1582 dst = get_dst_reg(c, inst, i);
1583 brw_MOV( p, dst, param0 );
1584 }
1585 }
1586 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1587 brw_set_saturate( p, 0 );
1588
1589 release_tmps( c, mark );
1590 }
1591
1592 /**
1593 * The three-dimensional case is much like the one- and two- versions above,
1594 * but since the number of corners is rapidly growing we now pack 16 16-bit
1595 * hashes into each register to extract more parallelism from the EUs.
1596 */
1597 static void noise3_sub( struct brw_wm_compile *c ) {
1598
1599 struct brw_compile *p = &c->func;
1600 struct brw_reg param0, param1, param2,
1601 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1602 xi, yi, zi, /* interpolation coefficients */
1603 t, tmp[ 8 ], /* float temporaries */
1604 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1605 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1606 int i;
1607 int mark = mark_tmps( c );
1608
1609 x0y0 = alloc_tmp( c );
1610 x0y1 = alloc_tmp( c );
1611 x1y0 = alloc_tmp( c );
1612 x1y1 = alloc_tmp( c );
1613 xi = alloc_tmp( c );
1614 yi = alloc_tmp( c );
1615 zi = alloc_tmp( c );
1616 t = alloc_tmp( c );
1617 for( i = 0; i < 8; i++ ) {
1618 tmp[ i ] = alloc_tmp( c );
1619 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1620 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1621 }
1622
1623 param0 = lookup_tmp( c, mark - 4 );
1624 param1 = lookup_tmp( c, mark - 3 );
1625 param2 = lookup_tmp( c, mark - 2 );
1626
1627 brw_set_access_mode( p, BRW_ALIGN_1 );
1628
1629 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1630 be hashed. Also compute the remainders (offsets within the unit
1631 cube), interleaved to reduce register dependency penalties. */
1632 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1633 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1634 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1635 brw_FRC( p, param0, param0 );
1636 brw_FRC( p, param1, param1 );
1637 brw_FRC( p, param2, param2 );
1638 /* Since we now have only 16 bits of precision in the hash, we must
1639 be more careful about thorough mixing to maintain entropy as we
1640 squash the input vector into a small scalar. */
1641 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1642 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1643 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1644 brw_imm_uw( 0x9B93 ) );
1645 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1646 brw_imm_uw( 0xBC8F ) );
1647
1648 /* Temporarily disable the execution mask while we work with ExecSize=16
1649 channels (the mask is set for ExecSize=8 and is probably incorrect).
1650 Although this might cause execution of unwanted channels, the code
1651 writes only to temporary registers and has no side effects, so
1652 disabling the mask is harmless. */
1653 brw_push_insn_state( p );
1654 brw_set_mask_control( p, BRW_MASK_DISABLE );
1655 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1656 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1657 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1658
1659 /* We're now ready to perform the hashing. The eight hashes are
1660 interleaved for performance. The hash function used is
1661 designed to rapidly achieve avalanche and require only 16x16
1662 bit multiplication, and 8-bit swizzles (which we get for
1663 free). */
1664 for( i = 0; i < 4; i++ )
1665 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1666 for( i = 0; i < 4; i++ )
1667 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1668 odd_bytes( wtmp[ i ] ) );
1669 for( i = 0; i < 4; i++ )
1670 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1671 for( i = 0; i < 4; i++ )
1672 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1673 odd_bytes( wtmp[ i ] ) );
1674 brw_pop_insn_state( p );
1675
1676 /* Now we want to initialise the four rear gradients based on the
1677 hashes. Format conversion from signed integer to float leaves
1678 everything scaled too high by a factor of pow( 2, 15 ), but
1679 we correct for that right at the end. */
1680 /* x component */
1681 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1682 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1683 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1684 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1685 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1686
1687 brw_push_insn_state( p );
1688 brw_set_mask_control( p, BRW_MASK_DISABLE );
1689 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1690 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1691 brw_pop_insn_state( p );
1692
1693 brw_MUL( p, x1y0, x1y0, t );
1694 brw_MUL( p, x1y1, x1y1, t );
1695 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1696 brw_MUL( p, x0y0, x0y0, param0 );
1697 brw_MUL( p, x0y1, x0y1, param0 );
1698
1699 /* y component */
1700 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1701 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1702 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1703 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1704
1705 brw_push_insn_state( p );
1706 brw_set_mask_control( p, BRW_MASK_DISABLE );
1707 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1708 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1709 brw_pop_insn_state( p );
1710
1711 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1712 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1713 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1714 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1715 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1716
1717 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1718 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1719 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1720 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1721
1722 /* z component */
1723 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1724 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1725 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1726 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1727
1728 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1729 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1730 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1731 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1732
1733 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1734 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1735 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1736 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1737
1738 /* We interpolate between the gradients using the polynomial
1739 6t^5 - 15t^4 + 10t^3 (Perlin). */
1740 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1741 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1742 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1743 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1744 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1745 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1746 brw_MUL( p, xi, xi, param0 );
1747 brw_MUL( p, yi, yi, param1 );
1748 brw_MUL( p, zi, zi, param2 );
1749 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1750 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1751 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1752 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1753 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1754 brw_MUL( p, xi, xi, param0 );
1755 brw_MUL( p, yi, yi, param1 );
1756 brw_MUL( p, zi, zi, param2 );
1757 brw_MUL( p, xi, xi, param0 );
1758 brw_MUL( p, yi, yi, param1 );
1759 brw_MUL( p, zi, zi, param2 );
1760 brw_MUL( p, xi, xi, param0 );
1761 brw_MUL( p, yi, yi, param1 );
1762 brw_MUL( p, zi, zi, param2 );
1763
1764 /* Here we interpolate in the y dimension... */
1765 brw_MUL( p, x0y1, x0y1, yi );
1766 brw_MUL( p, x1y1, x1y1, yi );
1767 brw_ADD( p, x0y0, x0y0, x0y1 );
1768 brw_ADD( p, x1y0, x1y0, x1y1 );
1769
1770 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1771 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1772 brw_MUL( p, x1y0, x1y0, xi );
1773 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1774
1775 /* Now do the same thing for the front four gradients... */
1776 /* x component */
1777 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1778 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1779 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1780 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1781
1782 brw_push_insn_state( p );
1783 brw_set_mask_control( p, BRW_MASK_DISABLE );
1784 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1785 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1786 brw_pop_insn_state( p );
1787
1788 brw_MUL( p, x1y0, x1y0, t );
1789 brw_MUL( p, x1y1, x1y1, t );
1790 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1791 brw_MUL( p, x0y0, x0y0, param0 );
1792 brw_MUL( p, x0y1, x0y1, param0 );
1793
1794 /* y component */
1795 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1796 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1797 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1798 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1799
1800 brw_push_insn_state( p );
1801 brw_set_mask_control( p, BRW_MASK_DISABLE );
1802 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1803 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1804 brw_pop_insn_state( p );
1805
1806 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1807 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1808 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1809 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1810 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1811
1812 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1813 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1814 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1815 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1816
1817 /* z component */
1818 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1819 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1820 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1821 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1822
1823 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1824 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1825 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1826 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1827
1828 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1829 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1830 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1831 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1832
1833 /* The interpolation coefficients are still around from last time, so
1834 again interpolate in the y dimension... */
1835 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1836 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1837 brw_MUL( p, x0y1, x0y1, yi );
1838 brw_MUL( p, x1y1, x1y1, yi );
1839 brw_ADD( p, x0y0, x0y0, x0y1 );
1840 brw_ADD( p, x1y0, x1y0, x1y1 );
1841
1842 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1843 time put the front face in tmp[ 1 ] and we're nearly there... */
1844 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1845 brw_MUL( p, x1y0, x1y0, xi );
1846 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1847
1848 /* The final interpolation, in the z dimension: */
1849 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1850 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1851 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1852
1853 /* scale by pow( 2, -15 ), as described above */
1854 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1855
1856 release_tmps( c, mark );
1857 }
1858
1859 static void emit_noise3( struct brw_wm_compile *c,
1860 struct prog_instruction *inst )
1861 {
1862 struct brw_compile *p = &c->func;
1863 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1864 GLuint mask = inst->DstReg.WriteMask;
1865 int i;
1866 int mark = mark_tmps( c );
1867
1868 assert( mark == 0 );
1869
1870 src0 = get_src_reg( c, inst, 0, 0 );
1871 src1 = get_src_reg( c, inst, 0, 1 );
1872 src2 = get_src_reg( c, inst, 0, 2 );
1873
1874 param0 = alloc_tmp( c );
1875 param1 = alloc_tmp( c );
1876 param2 = alloc_tmp( c );
1877
1878 brw_MOV( p, param0, src0 );
1879 brw_MOV( p, param1, src1 );
1880 brw_MOV( p, param2, src2 );
1881
1882 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1883
1884 /* Fill in the result: */
1885 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1886 for (i = 0 ; i < 4; i++) {
1887 if (mask & (1<<i)) {
1888 dst = get_dst_reg(c, inst, i);
1889 brw_MOV( p, dst, param0 );
1890 }
1891 }
1892 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1893 brw_set_saturate( p, 0 );
1894
1895 release_tmps( c, mark );
1896 }
1897
1898 /**
1899 * For the four-dimensional case, the little micro-optimisation benefits
1900 * we obtain by unrolling all the loops aren't worth the massive bloat it
1901 * now causes. Instead, we loop twice around performing a similar operation
1902 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1903 * code to glue it all together.
1904 */
1905 static void noise4_sub( struct brw_wm_compile *c )
1906 {
1907 struct brw_compile *p = &c->func;
1908 struct brw_reg param[ 4 ],
1909 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1910 w0, /* noise for the w=0 cube */
1911 floors[ 2 ], /* integer coordinates of base corner of hypercube */
1912 interp[ 4 ], /* interpolation coefficients */
1913 t, tmp[ 8 ], /* float temporaries */
1914 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1915 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1916 int i, j;
1917 int mark = mark_tmps( c );
1918 GLuint loop, origin;
1919
1920 x0y0 = alloc_tmp( c );
1921 x0y1 = alloc_tmp( c );
1922 x1y0 = alloc_tmp( c );
1923 x1y1 = alloc_tmp( c );
1924 t = alloc_tmp( c );
1925 w0 = alloc_tmp( c );
1926 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1927 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1928
1929 for( i = 0; i < 4; i++ ) {
1930 param[ i ] = lookup_tmp( c, mark - 5 + i );
1931 interp[ i ] = alloc_tmp( c );
1932 }
1933
1934 for( i = 0; i < 8; i++ ) {
1935 tmp[ i ] = alloc_tmp( c );
1936 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1937 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1938 }
1939
1940 brw_set_access_mode( p, BRW_ALIGN_1 );
1941
1942 /* We only want 16 bits of precision from the integral part of each
1943 co-ordinate, but unfortunately the RNDD semantics would saturate
1944 at 16 bits if we performed the operation directly to a 16-bit
1945 destination. Therefore, we round to 32-bit temporaries where
1946 appropriate, and then store only the lower 16 bits. */
1947 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1948 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1949 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1950 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1951 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1952 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
1953
1954 /* Modify the flag register here, because the side effect is useful
1955 later (see below). We know for certain that all flags will be
1956 cleared, since the FRC instruction cannot possibly generate
1957 negative results. Even for exceptional inputs (infinities, denormals,
1958 NaNs), the architecture guarantees that the L conditional is false. */
1959 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1960 brw_FRC( p, param[ 0 ], param[ 0 ] );
1961 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1962 for( i = 1; i < 4; i++ )
1963 brw_FRC( p, param[ i ], param[ i ] );
1964
1965 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1966 of all. */
1967 for( i = 0; i < 4; i++ )
1968 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1969 for( i = 0; i < 4; i++ )
1970 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1971 for( i = 0; i < 4; i++ )
1972 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1973 for( i = 0; i < 4; i++ )
1974 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1975 for( j = 0; j < 3; j++ )
1976 for( i = 0; i < 4; i++ )
1977 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1978
1979 /* Mark the current address, as it will be a jump destination. The
1980 following code will be executed twice: first, with the flag
1981 register clear indicating the w=0 case, and second with flags
1982 set for w=1. */
1983 loop = p->nr_insn;
1984
1985 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1986 be hashed. Since we have only 16 bits of precision in the hash, we
1987 must be careful about thorough mixing to maintain entropy as we
1988 squash the input vector into a small scalar. */
1989 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1990 brw_imm_uw( 0xBC8F ) );
1991 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1992 brw_imm_uw( 0xD0BD ) );
1993 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1994 brw_imm_uw( 0x9B93 ) );
1995 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1996 brw_imm_uw( 0xA359 ) );
1997 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1998 brw_imm_uw( 0xBC8F ) );
1999
2000 /* Temporarily disable the execution mask while we work with ExecSize=16
2001 channels (the mask is set for ExecSize=8 and is probably incorrect).
2002 Although this might cause execution of unwanted channels, the code
2003 writes only to temporary registers and has no side effects, so
2004 disabling the mask is harmless. */
2005 brw_push_insn_state( p );
2006 brw_set_mask_control( p, BRW_MASK_DISABLE );
2007 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2008 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2009 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2010
2011 /* We're now ready to perform the hashing. The eight hashes are
2012 interleaved for performance. The hash function used is
2013 designed to rapidly achieve avalanche and require only 16x16
2014 bit multiplication, and 8-bit swizzles (which we get for
2015 free). */
2016 for( i = 0; i < 4; i++ )
2017 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2018 for( i = 0; i < 4; i++ )
2019 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2020 odd_bytes( wtmp[ i ] ) );
2021 for( i = 0; i < 4; i++ )
2022 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2023 for( i = 0; i < 4; i++ )
2024 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2025 odd_bytes( wtmp[ i ] ) );
2026 brw_pop_insn_state( p );
2027
2028 /* Now we want to initialise the four rear gradients based on the
2029 hashes. Format conversion from signed integer to float leaves
2030 everything scaled too high by a factor of pow( 2, 15 ), but
2031 we correct for that right at the end. */
2032 /* x component */
2033 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2034 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2035 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2036 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2037 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2038
2039 brw_push_insn_state( p );
2040 brw_set_mask_control( p, BRW_MASK_DISABLE );
2041 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2042 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2043 brw_pop_insn_state( p );
2044
2045 brw_MUL( p, x1y0, x1y0, t );
2046 brw_MUL( p, x1y1, x1y1, t );
2047 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2048 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2049 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2050
2051 /* y component */
2052 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2053 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2054 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2055 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2056
2057 brw_push_insn_state( p );
2058 brw_set_mask_control( p, BRW_MASK_DISABLE );
2059 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2060 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2061 brw_pop_insn_state( p );
2062
2063 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2064 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2065 /* prepare t for the w component (used below): w the first time through
2066 the loop; w - 1 the second time) */
2067 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2068 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2069 p->current->header.predicate_inverse = 1;
2070 brw_MOV( p, t, param[ 3 ] );
2071 p->current->header.predicate_inverse = 0;
2072 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2073 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2074 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2075
2076 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2077 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2078 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2079 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2080
2081 /* z component */
2082 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2083 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2084 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2085 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2086
2087 brw_push_insn_state( p );
2088 brw_set_mask_control( p, BRW_MASK_DISABLE );
2089 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2090 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2091 brw_pop_insn_state( p );
2092
2093 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2094 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2095 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2096 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2097
2098 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2099 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2100 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2101 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2102
2103 /* w component */
2104 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2105 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2106 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2107 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2108
2109 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2110 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2111 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2112 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2113 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2114
2115 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2116 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2117 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2118 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2119
2120 /* Here we interpolate in the y dimension... */
2121 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2122 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2123 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2124 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2125 brw_ADD( p, x0y0, x0y0, x0y1 );
2126 brw_ADD( p, x1y0, x1y0, x1y1 );
2127
2128 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2129 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2130 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2131 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2132
2133 /* Now do the same thing for the front four gradients... */
2134 /* x component */
2135 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2136 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2137 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2138 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2139
2140 brw_push_insn_state( p );
2141 brw_set_mask_control( p, BRW_MASK_DISABLE );
2142 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2143 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2144 brw_pop_insn_state( p );
2145
2146 brw_MUL( p, x1y0, x1y0, t );
2147 brw_MUL( p, x1y1, x1y1, t );
2148 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2149 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2150 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2151
2152 /* y component */
2153 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2154 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2155 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2156 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2157
2158 brw_push_insn_state( p );
2159 brw_set_mask_control( p, BRW_MASK_DISABLE );
2160 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2161 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2162 brw_pop_insn_state( p );
2163
2164 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2165 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2166 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2167 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2168 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2169
2170 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2171 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2172 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2173 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2174
2175 /* z component */
2176 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2177 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2178 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2179 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2180
2181 brw_push_insn_state( p );
2182 brw_set_mask_control( p, BRW_MASK_DISABLE );
2183 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2184 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2185 brw_pop_insn_state( p );
2186
2187 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2188 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2189 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2190 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2191 /* prepare t for the w component (used below): w the first time through
2192 the loop; w - 1 the second time) */
2193 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2194 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2195 p->current->header.predicate_inverse = 1;
2196 brw_MOV( p, t, param[ 3 ] );
2197 p->current->header.predicate_inverse = 0;
2198 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2199
2200 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2201 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2202 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2203 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2204
2205 /* w component */
2206 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2207 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2208 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2209 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2210
2211 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2212 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2213 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2214 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2215
2216 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2217 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2218 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2219 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2220
2221 /* Interpolate in the y dimension: */
2222 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2223 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2224 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2225 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2226 brw_ADD( p, x0y0, x0y0, x0y1 );
2227 brw_ADD( p, x1y0, x1y0, x1y1 );
2228
2229 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2230 time put the front face in tmp[ 1 ] and we're nearly there... */
2231 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2232 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2233 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2234
2235 /* Another interpolation, in the z dimension: */
2236 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2237 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2238 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2239
2240 /* Exit the loop if we've computed both cubes... */
2241 origin = p->nr_insn;
2242 brw_push_insn_state( p );
2243 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2244 brw_set_mask_control( p, BRW_MASK_DISABLE );
2245 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2246 brw_pop_insn_state( p );
2247
2248 /* Save the result for the w=0 case, and increment the w coordinate: */
2249 brw_MOV( p, w0, tmp[ 0 ] );
2250 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2251 brw_imm_uw( 1 ) );
2252
2253 /* Loop around for the other cube. Explicitly set the flag register
2254 (unfortunately we must spend an extra instruction to do this: we
2255 can't rely on a side effect of the previous MOV or ADD because
2256 conditional modifiers which are normally true might be false in
2257 exceptional circumstances, e.g. given a NaN input; the add to
2258 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2259 brw_push_insn_state( p );
2260 brw_set_mask_control( p, BRW_MASK_DISABLE );
2261 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2262 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2263 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2264 brw_pop_insn_state( p );
2265
2266 /* Patch the previous conditional branch now that we know the
2267 destination address. */
2268 brw_set_src1( p->store + origin,
2269 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2270
2271 /* The very last interpolation. */
2272 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2273 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2274 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2275
2276 /* scale by pow( 2, -15 ), as described above */
2277 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2278
2279 release_tmps( c, mark );
2280 }
2281
2282 static void emit_noise4( struct brw_wm_compile *c,
2283 struct prog_instruction *inst )
2284 {
2285 struct brw_compile *p = &c->func;
2286 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2287 GLuint mask = inst->DstReg.WriteMask;
2288 int i;
2289 int mark = mark_tmps( c );
2290
2291 assert( mark == 0 );
2292
2293 src0 = get_src_reg( c, inst, 0, 0 );
2294 src1 = get_src_reg( c, inst, 0, 1 );
2295 src2 = get_src_reg( c, inst, 0, 2 );
2296 src3 = get_src_reg( c, inst, 0, 3 );
2297
2298 param0 = alloc_tmp( c );
2299 param1 = alloc_tmp( c );
2300 param2 = alloc_tmp( c );
2301 param3 = alloc_tmp( c );
2302
2303 brw_MOV( p, param0, src0 );
2304 brw_MOV( p, param1, src1 );
2305 brw_MOV( p, param2, src2 );
2306 brw_MOV( p, param3, src3 );
2307
2308 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2309
2310 /* Fill in the result: */
2311 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2312 for (i = 0 ; i < 4; i++) {
2313 if (mask & (1<<i)) {
2314 dst = get_dst_reg(c, inst, i);
2315 brw_MOV( p, dst, param0 );
2316 }
2317 }
2318 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2319 brw_set_saturate( p, 0 );
2320
2321 release_tmps( c, mark );
2322 }
2323
2324 static void emit_wpos_xy(struct brw_wm_compile *c,
2325 struct prog_instruction *inst)
2326 {
2327 struct brw_compile *p = &c->func;
2328 GLuint mask = inst->DstReg.WriteMask;
2329 struct brw_reg src0[2], dst[2];
2330
2331 dst[0] = get_dst_reg(c, inst, 0);
2332 dst[1] = get_dst_reg(c, inst, 1);
2333
2334 src0[0] = get_src_reg(c, inst, 0, 0);
2335 src0[1] = get_src_reg(c, inst, 0, 1);
2336
2337 /* Calculate the pixel offset from window bottom left into destination
2338 * X and Y channels.
2339 */
2340 if (mask & WRITEMASK_X) {
2341 /* X' = X - origin_x */
2342 brw_ADD(p,
2343 dst[0],
2344 retype(src0[0], BRW_REGISTER_TYPE_W),
2345 brw_imm_d(0 - c->key.origin_x));
2346 }
2347
2348 if (mask & WRITEMASK_Y) {
2349 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2350 brw_ADD(p,
2351 dst[1],
2352 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2353 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2354 }
2355 }
2356
2357 /* TODO
2358 BIAS on SIMD8 not working yet...
2359 */
2360 static void emit_txb(struct brw_wm_compile *c,
2361 struct prog_instruction *inst)
2362 {
2363 struct brw_compile *p = &c->func;
2364 struct brw_reg dst[4], src[4], payload_reg;
2365 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2366 GLuint i;
2367
2368 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2369
2370 for (i = 0; i < 4; i++)
2371 dst[i] = get_dst_reg(c, inst, i);
2372 for (i = 0; i < 4; i++)
2373 src[i] = get_src_reg(c, inst, 0, i);
2374
2375 switch (inst->TexSrcTarget) {
2376 case TEXTURE_1D_INDEX:
2377 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2378 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2379 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2380 break;
2381 case TEXTURE_2D_INDEX:
2382 case TEXTURE_RECT_INDEX:
2383 brw_MOV(p, brw_message_reg(2), src[0]);
2384 brw_MOV(p, brw_message_reg(3), src[1]);
2385 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2386 break;
2387 default:
2388 brw_MOV(p, brw_message_reg(2), src[0]);
2389 brw_MOV(p, brw_message_reg(3), src[1]);
2390 brw_MOV(p, brw_message_reg(4), src[2]);
2391 break;
2392 }
2393 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2394 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2395 brw_SAMPLE(p,
2396 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2397 1, /* msg_reg_nr */
2398 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2399 unit + MAX_DRAW_BUFFERS, /* surface */
2400 unit, /* sampler */
2401 inst->DstReg.WriteMask, /* writemask */
2402 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS, /* msg_type */
2403 4, /* response_length */
2404 4, /* msg_length */
2405 0); /* eot */
2406 }
2407
2408
2409 static void emit_tex(struct brw_wm_compile *c,
2410 struct prog_instruction *inst)
2411 {
2412 struct brw_compile *p = &c->func;
2413 struct brw_reg dst[4], src[4], payload_reg;
2414 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2415 GLuint msg_len;
2416 GLuint i, nr;
2417 GLuint emit;
2418 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2419
2420 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2421
2422 for (i = 0; i < 4; i++)
2423 dst[i] = get_dst_reg(c, inst, i);
2424 for (i = 0; i < 4; i++)
2425 src[i] = get_src_reg(c, inst, 0, i);
2426
2427 switch (inst->TexSrcTarget) {
2428 case TEXTURE_1D_INDEX:
2429 emit = WRITEMASK_X;
2430 nr = 1;
2431 break;
2432 case TEXTURE_2D_INDEX:
2433 case TEXTURE_RECT_INDEX:
2434 emit = WRITEMASK_XY;
2435 nr = 2;
2436 break;
2437 default:
2438 emit = WRITEMASK_XYZ;
2439 nr = 3;
2440 break;
2441 }
2442 msg_len = 1;
2443
2444 /* move/load S, T, R coords */
2445 for (i = 0; i < nr; i++) {
2446 static const GLuint swz[4] = {0,1,2,2};
2447 if (emit & (1<<i))
2448 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2449 else
2450 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2451 msg_len += 1;
2452 }
2453
2454 if (shadow) {
2455 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2456 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2457 }
2458
2459 brw_SAMPLE(p,
2460 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2461 1, /* msg_reg_nr */
2462 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2463 unit + MAX_DRAW_BUFFERS, /* surface */
2464 unit, /* sampler */
2465 inst->DstReg.WriteMask, /* writemask */
2466 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE, /* msg_type */
2467 4, /* response_length */
2468 shadow ? 6 : 4, /* msg_length */
2469 0); /* eot */
2470
2471 if (shadow)
2472 brw_MOV(p, dst[3], brw_imm_f(1.0));
2473 }
2474
2475
2476 static void emit_get_constant(struct brw_context *brw,
2477 struct brw_wm_compile *c,
2478 struct prog_instruction *inst,
2479 GLuint constIndex)
2480 {
2481 struct brw_compile *p = &c->func;
2482 struct brw_reg dst[4];
2483 GLuint i;
2484 const int mark = mark_tmps( c );
2485 struct brw_reg writeback_reg[4];
2486
2487 /* XXX only need 1 temp reg??? */
2488 for (i = 0; i < 4; i++) {
2489 writeback_reg[i] = alloc_tmp(c);
2490 }
2491
2492 for (i = 0; i < 4; i++) {
2493 dst[i] = get_dst_reg(c, inst, i);
2494 }
2495
2496 /* Get float[4] vector from constant buffer */
2497 brw_dp_READ_4(p,
2498 writeback_reg[0], /* first writeback dest */
2499 1, /* msg_reg */
2500 GL_FALSE, /* rel addr? */
2501 16 * constIndex, /* byte offset */
2502 BRW_WM_MAX_SURF - 1 /* surface, binding table index */
2503 );
2504
2505 /* Extract the four channel values, smear across dest registers */
2506 for (i = 0; i < 4; i++) {
2507 /* extract 1 float from the writeback reg */
2508 struct brw_reg new_src = stride(writeback_reg[0], 0, 1, 0);
2509 new_src.subnr = i * 4;
2510 /* and smear it into the dest register */
2511 brw_MOV(p, dst[i], new_src);
2512 }
2513
2514 release_tmps( c, mark );
2515 }
2516
2517
2518 /**
2519 * Resolve subroutine calls after code emit is done.
2520 */
2521 static void post_wm_emit( struct brw_wm_compile *c )
2522 {
2523 brw_resolve_cals(&c->func);
2524 }
2525
2526 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2527 {
2528 #define MAX_IFSN 32
2529 #define MAX_LOOP_DEPTH 32
2530 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2531 struct brw_instruction *inst0, *inst1;
2532 int i, if_insn = 0, loop_insn = 0;
2533 struct brw_compile *p = &c->func;
2534 struct brw_indirect stack_index = brw_indirect(0, 0);
2535
2536 c->reg_index = 0;
2537 prealloc_reg(c);
2538 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2539 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2540
2541 for (i = 0; i < c->nr_fp_insns; i++) {
2542 struct prog_instruction *inst = &c->prog_instructions[i];
2543
2544 if (inst->CondUpdate)
2545 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2546 else
2547 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2548
2549 switch (inst->Opcode) {
2550 case WM_PIXELXY:
2551 emit_pixel_xy(c, inst);
2552 break;
2553 case WM_DELTAXY:
2554 emit_delta_xy(c, inst);
2555 break;
2556 case WM_PIXELW:
2557 emit_pixel_w(c, inst);
2558 break;
2559 case WM_LINTERP:
2560 emit_linterp(c, inst);
2561 break;
2562 case WM_PINTERP:
2563 emit_pinterp(c, inst);
2564 break;
2565 case WM_CINTERP:
2566 emit_cinterp(c, inst);
2567 break;
2568 case WM_WPOSXY:
2569 emit_wpos_xy(c, inst);
2570 break;
2571 case WM_FB_WRITE:
2572 emit_fb_write(c, inst);
2573 break;
2574 case WM_FRONTFACING:
2575 emit_frontfacing(c, inst);
2576 break;
2577 case OPCODE_ABS:
2578 emit_abs(c, inst);
2579 break;
2580 case OPCODE_ADD:
2581 emit_add(c, inst);
2582 break;
2583 case OPCODE_ARL:
2584 emit_arl(c, inst);
2585 break;
2586 case OPCODE_SUB:
2587 emit_sub(c, inst);
2588 break;
2589 case OPCODE_FRC:
2590 emit_frc(c, inst);
2591 break;
2592 case OPCODE_FLR:
2593 emit_flr(c, inst);
2594 break;
2595 case OPCODE_LRP:
2596 emit_lrp(c, inst);
2597 break;
2598 case OPCODE_TRUNC:
2599 emit_trunc(c, inst);
2600 break;
2601 case OPCODE_MOV:
2602 #if 0
2603 /* test hook for new constant buffer code */
2604 if (inst->SrcReg[0].File == PROGRAM_UNIFORM) {
2605 emit_get_constant(brw, c, inst, inst->SrcReg[0].Index);
2606 }
2607 else {
2608 emit_mov(c, inst);
2609 }
2610 #else
2611 emit_mov(c, inst);
2612 #endif
2613 break;
2614 case OPCODE_DP3:
2615 emit_dp3(c, inst);
2616 break;
2617 case OPCODE_DP4:
2618 emit_dp4(c, inst);
2619 break;
2620 case OPCODE_XPD:
2621 emit_xpd(c, inst);
2622 break;
2623 case OPCODE_DPH:
2624 emit_dph(c, inst);
2625 break;
2626 case OPCODE_RCP:
2627 emit_rcp(c, inst);
2628 break;
2629 case OPCODE_RSQ:
2630 emit_rsq(c, inst);
2631 break;
2632 case OPCODE_SIN:
2633 emit_sin(c, inst);
2634 break;
2635 case OPCODE_COS:
2636 emit_cos(c, inst);
2637 break;
2638 case OPCODE_EX2:
2639 emit_ex2(c, inst);
2640 break;
2641 case OPCODE_LG2:
2642 emit_lg2(c, inst);
2643 break;
2644 case OPCODE_MAX:
2645 emit_max(c, inst);
2646 break;
2647 case OPCODE_MIN:
2648 emit_min(c, inst);
2649 break;
2650 case OPCODE_DDX:
2651 emit_ddx(c, inst);
2652 break;
2653 case OPCODE_DDY:
2654 emit_ddy(c, inst);
2655 break;
2656 case OPCODE_SLT:
2657 emit_slt(c, inst);
2658 break;
2659 case OPCODE_SLE:
2660 emit_sle(c, inst);
2661 break;
2662 case OPCODE_SGT:
2663 emit_sgt(c, inst);
2664 break;
2665 case OPCODE_SGE:
2666 emit_sge(c, inst);
2667 break;
2668 case OPCODE_SEQ:
2669 emit_seq(c, inst);
2670 break;
2671 case OPCODE_SNE:
2672 emit_sne(c, inst);
2673 break;
2674 case OPCODE_MUL:
2675 emit_mul(c, inst);
2676 break;
2677 case OPCODE_POW:
2678 emit_pow(c, inst);
2679 break;
2680 case OPCODE_MAD:
2681 emit_mad(c, inst);
2682 break;
2683 case OPCODE_NOISE1:
2684 emit_noise1(c, inst);
2685 break;
2686 case OPCODE_NOISE2:
2687 emit_noise2(c, inst);
2688 break;
2689 case OPCODE_NOISE3:
2690 emit_noise3(c, inst);
2691 break;
2692 case OPCODE_NOISE4:
2693 emit_noise4(c, inst);
2694 break;
2695 case OPCODE_TEX:
2696 emit_tex(c, inst);
2697 break;
2698 case OPCODE_TXB:
2699 emit_txb(c, inst);
2700 break;
2701 case OPCODE_KIL_NV:
2702 emit_kil(c);
2703 break;
2704 case OPCODE_IF:
2705 assert(if_insn < MAX_IFSN);
2706 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2707 break;
2708 case OPCODE_ELSE:
2709 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2710 break;
2711 case OPCODE_ENDIF:
2712 assert(if_insn > 0);
2713 brw_ENDIF(p, if_inst[--if_insn]);
2714 break;
2715 case OPCODE_BGNSUB:
2716 brw_save_label(p, inst->Comment, p->nr_insn);
2717 break;
2718 case OPCODE_ENDSUB:
2719 /* no-op */
2720 break;
2721 case OPCODE_CAL:
2722 brw_push_insn_state(p);
2723 brw_set_mask_control(p, BRW_MASK_DISABLE);
2724 brw_set_access_mode(p, BRW_ALIGN_1);
2725 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2726 brw_set_access_mode(p, BRW_ALIGN_16);
2727 brw_ADD(p, get_addr_reg(stack_index),
2728 get_addr_reg(stack_index), brw_imm_d(4));
2729 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2730 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2731 brw_pop_insn_state(p);
2732 break;
2733
2734 case OPCODE_RET:
2735 brw_push_insn_state(p);
2736 brw_set_mask_control(p, BRW_MASK_DISABLE);
2737 brw_ADD(p, get_addr_reg(stack_index),
2738 get_addr_reg(stack_index), brw_imm_d(-4));
2739 brw_set_access_mode(p, BRW_ALIGN_1);
2740 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2741 brw_set_access_mode(p, BRW_ALIGN_16);
2742 brw_pop_insn_state(p);
2743
2744 break;
2745 case OPCODE_BGNLOOP:
2746 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2747 break;
2748 case OPCODE_BRK:
2749 brw_BREAK(p);
2750 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2751 break;
2752 case OPCODE_CONT:
2753 brw_CONT(p);
2754 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2755 break;
2756 case OPCODE_ENDLOOP:
2757 loop_insn--;
2758 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2759 /* patch all the BREAK instructions from
2760 last BEGINLOOP */
2761 while (inst0 > loop_inst[loop_insn]) {
2762 inst0--;
2763 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2764 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2765 inst0->bits3.if_else.pop_count = 0;
2766 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2767 inst0->bits3.if_else.jump_count = inst1 - inst0;
2768 inst0->bits3.if_else.pop_count = 0;
2769 }
2770 }
2771 break;
2772 default:
2773 _mesa_printf("unsupported IR in fragment shader %d\n",
2774 inst->Opcode);
2775 }
2776 if (inst->CondUpdate)
2777 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2778 else
2779 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2780 }
2781 post_wm_emit(c);
2782
2783 if (c->reg_index >= BRW_WM_MAX_GRF) {
2784 _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
2785 /* XXX we need to do some proper error recovery here */
2786 }
2787 }
2788
2789
2790 /**
2791 * Do GPU code generation for shaders that use GLSL features such as
2792 * flow control. Other shaders will be compiled with the
2793 */
2794 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2795 {
2796 if (INTEL_DEBUG & DEBUG_WM) {
2797 _mesa_printf("brw_wm_glsl_emit:\n");
2798 }
2799
2800 /* initial instruction translation/simplification */
2801 brw_wm_pass_fp(c);
2802
2803 /* actual code generation */
2804 brw_wm_emit_glsl(brw, c);
2805
2806 if (INTEL_DEBUG & DEBUG_WM) {
2807 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2808 }
2809
2810 c->prog_data.total_grf = c->reg_index;
2811 c->prog_data.total_scratch = 0;
2812 }