4cf092226cf2bda902bcfc845508ec3015662e62
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "brw_context.h"
4 #include "brw_eu.h"
5 #include "brw_wm.h"
6
/* Identifiers of the reusable instruction sequences that can be passed to
 * invoke_subroutine().
 */
enum _subroutine {
    SUB_NOISE1,
    SUB_NOISE2,
    SUB_NOISE3,
    SUB_NOISE4
};
10
11
12 /**
13 * Determine if the given fragment program uses GLSL features such
14 * as flow conditionals, loops, subroutines.
15 * Some GLSL shaders may use these features, others might not.
16 */
17 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
18 {
19 int i;
20 for (i = 0; i < fp->Base.NumInstructions; i++) {
21 const struct prog_instruction *inst = &fp->Base.Instructions[i];
22 switch (inst->Opcode) {
23 case OPCODE_IF:
24 case OPCODE_TRUNC:
25 case OPCODE_ENDIF:
26 case OPCODE_CAL:
27 case OPCODE_BRK:
28 case OPCODE_RET:
29 case OPCODE_DDX:
30 case OPCODE_DDY:
31 case OPCODE_NOISE1:
32 case OPCODE_NOISE2:
33 case OPCODE_NOISE3:
34 case OPCODE_NOISE4:
35 case OPCODE_BGNLOOP:
36 return GL_TRUE;
37 default:
38 break;
39 }
40 }
41 return GL_FALSE;
42 }
43
44
45 /**
46 * Record the mapping of a Mesa register to a hardware register.
47 */
48 static void set_reg(struct brw_wm_compile *c, int file, int index,
49 int component, struct brw_reg reg)
50 {
51 c->wm_regs[file][index][component].reg = reg;
52 c->wm_regs[file][index][component].inited = GL_TRUE;
53 }
54
55 /**
56 * Examine instruction's write mask to find index of first component
57 * enabled for writing.
58 */
59 static int get_scalar_dst_index(struct prog_instruction *inst)
60 {
61 int i;
62 for (i = 0; i < 4; i++)
63 if (inst->DstReg.WriteMask & (1<<i))
64 break;
65 return i;
66 }
67
68 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
69 {
70 struct brw_reg reg;
71 if(c->tmp_index == c->tmp_max)
72 c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
73
74 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
75 return reg;
76 }
77
78 /**
79 * Save current temp register info.
80 * There must be a matching call to release_tmps().
81 */
82 static int mark_tmps(struct brw_wm_compile *c)
83 {
84 return c->tmp_index;
85 }
86
87 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
88 {
89 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
90 }
91
92 static void release_tmps(struct brw_wm_compile *c, int mark)
93 {
94 c->tmp_index = mark;
95 }
96
97 /**
98 * Convert Mesa src register to brw register.
99 *
100 * Since we're running in SOA mode each Mesa register corresponds to four
101 * hardware registers. We allocate the hardware registers as needed here.
102 *
103 * \param file register file, one of PROGRAM_x
104 * \param index register number
105 * \param component src component (X=0, Y=1, Z=2, W=3)
106 * \param nr not used?!?
107 * \param neg negate value?
108 * \param abs take absolute value?
109 */
110 static struct brw_reg
111 get_reg(struct brw_wm_compile *c, int file, int index, int component,
112 int nr, GLuint neg, GLuint abs)
113 {
114 struct brw_reg reg;
115 switch (file) {
116 case PROGRAM_STATE_VAR:
117 case PROGRAM_CONSTANT:
118 case PROGRAM_UNIFORM:
119 file = PROGRAM_STATE_VAR;
120 break;
121 case PROGRAM_UNDEFINED:
122 return brw_null_reg();
123 case PROGRAM_TEMPORARY:
124 case PROGRAM_INPUT:
125 case PROGRAM_OUTPUT:
126 case PROGRAM_PAYLOAD:
127 break;
128 default:
129 _mesa_problem(NULL, "Unexpected file in get_reg()");
130 return brw_null_reg();
131 }
132
133 /* see if we've already allocated a HW register for this Mesa register */
134 if (c->wm_regs[file][index][component].inited) {
135 /* yes, re-use */
136 reg = c->wm_regs[file][index][component].reg;
137 }
138 else {
139 /* no, allocate new register */
140 reg = brw_vec8_grf(c->reg_index, 0);
141 }
142
143 /* if this is a new register allocation, record it in the table */
144 if (!c->wm_regs[file][index][component].inited) {
145 set_reg(c, file, index, component, reg);
146 c->reg_index++;
147 }
148
149 if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
150 /* ran out of temporary registers! */
151 #if 1
152 /* This is a big hack for now.
153 * Return bad register index, just don't hang the GPU.
154 */
155 _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
156 c->reg_index = BRW_WM_MAX_GRF - 13;
157 #else
158 return brw_null_reg();
159 #endif
160 }
161
162 if (neg & (1 << component)) {
163 reg = negate(reg);
164 }
165 if (abs)
166 reg = brw_abs(reg);
167 return reg;
168 }
169
170
171 /**
172 * Preallocate registers. This sets up the Mesa to hardware register
173 * mapping for certain registers, such as constants (uniforms/state vars)
174 * and shader inputs.
175 */
176 static void prealloc_reg(struct brw_wm_compile *c)
177 {
178 int i, j;
179 struct brw_reg reg;
180 int nr_interp_regs = 0;
181 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
182
183 for (i = 0; i < 4; i++) {
184 if (i < c->key.nr_depth_regs)
185 reg = brw_vec8_grf(i * 2, 0);
186 else
187 reg = brw_vec8_grf(0, 0);
188 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
189 }
190 c->reg_index += 2 * c->key.nr_depth_regs;
191
192 /* constants */
193 {
194 const int nr_params = c->fp->program.Base.Parameters->NumParameters;
195 const struct gl_program_parameter_list *plist =
196 c->fp->program.Base.Parameters;
197 int index = 0;
198
199 /* number of float constants */
200 c->prog_data.nr_params = 4 * nr_params;
201
202 /* loop over program constants (float[4]) */
203 for (i = 0; i < nr_params; i++) {
204 /* loop over XYZW channels */
205 for (j = 0; j < 4; j++, index++) {
206 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
207 /* Save pointer to parameter/constant value.
208 * Constants will be copied in prepare_constant_buffer()
209 */
210 c->prog_data.param[index] = &plist->ParameterValues[i][j];
211 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
212 }
213 }
214 /* number of constant regs used (each reg is float[8]) */
215 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
216 c->reg_index += c->nr_creg;
217 }
218
219 /* fragment shader inputs */
220 for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
221 if (inputs & (1<<i)) {
222 nr_interp_regs++;
223 reg = brw_vec8_grf(c->reg_index, 0);
224 for (j = 0; j < 4; j++)
225 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
226 c->reg_index += 2;
227 }
228 }
229
230 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
231 c->prog_data.urb_read_length = nr_interp_regs * 2;
232 c->prog_data.curb_read_length = c->nr_creg;
233 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
234 c->reg_index++;
235 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
236 c->reg_index += 2;
237 }
238
239
240 /**
241 * Convert Mesa dst register to brw register.
242 */
243 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
244 struct prog_instruction *inst, int component, int nr)
245 {
246 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
247 0, 0);
248 }
249
250
251 /**
252 * Convert Mesa src register to brw register.
253 */
254 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
255 struct prog_src_register *src, int index, int nr)
256 {
257 int component = GET_SWZ(src->Swizzle, index);
258 return get_reg(c, src->File, src->Index, component, nr,
259 src->NegateBase, src->Abs);
260 }
261
262 /**
263 * Subroutines are minimal support for resusable instruction sequences.
264 * They are implemented as simply as possible to minimise overhead: there
265 * is no explicit support for communication between the caller and callee
266 * other than saving the return address in a temporary register, nor is
267 * there any automatic local storage. This implies that great care is
268 * required before attempting reentrancy or any kind of nested
269 * subroutine invocations.
270 */
271 static void invoke_subroutine( struct brw_wm_compile *c,
272 enum _subroutine subroutine,
273 void (*emit)( struct brw_wm_compile * ) )
274 {
275 struct brw_compile *p = &c->func;
276
277 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
278
279 if( c->subroutines[ subroutine ] ) {
280 /* subroutine previously emitted: reuse existing instructions */
281
282 int mark = mark_tmps( c );
283 struct brw_reg return_address = retype( alloc_tmp( c ),
284 BRW_REGISTER_TYPE_UD );
285 int here = p->nr_insn;
286
287 brw_push_insn_state(p);
288 brw_set_mask_control(p, BRW_MASK_DISABLE);
289 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
290
291 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
292 brw_imm_d( ( c->subroutines[ subroutine ] -
293 here - 1 ) << 4 ) );
294 brw_pop_insn_state(p);
295
296 release_tmps( c, mark );
297 } else {
298 /* previously unused subroutine: emit, and mark for later reuse */
299
300 int mark = mark_tmps( c );
301 struct brw_reg return_address = retype( alloc_tmp( c ),
302 BRW_REGISTER_TYPE_UD );
303 struct brw_instruction *calc;
304 int base = p->nr_insn;
305
306 brw_push_insn_state(p);
307 brw_set_mask_control(p, BRW_MASK_DISABLE);
308 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
309 brw_pop_insn_state(p);
310
311 c->subroutines[ subroutine ] = p->nr_insn;
312
313 emit( c );
314
315 brw_push_insn_state(p);
316 brw_set_mask_control(p, BRW_MASK_DISABLE);
317 brw_MOV( p, brw_ip_reg(), return_address );
318 brw_pop_insn_state(p);
319
320 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
321
322 release_tmps( c, mark );
323 }
324 }
325
326 static void emit_abs( struct brw_wm_compile *c,
327 struct prog_instruction *inst)
328 {
329 int i;
330 struct brw_compile *p = &c->func;
331 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
332 for (i = 0; i < 4; i++) {
333 if (inst->DstReg.WriteMask & (1<<i)) {
334 struct brw_reg src, dst;
335 dst = get_dst_reg(c, inst, i, 1);
336 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
337 brw_MOV(p, dst, brw_abs(src));
338 }
339 }
340 brw_set_saturate(p, 0);
341 }
342
343 static void emit_trunc( struct brw_wm_compile *c,
344 struct prog_instruction *inst)
345 {
346 int i;
347 struct brw_compile *p = &c->func;
348 GLuint mask = inst->DstReg.WriteMask;
349 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
350 for (i = 0; i < 4; i++) {
351 if (mask & (1<<i)) {
352 struct brw_reg src, dst;
353 dst = get_dst_reg(c, inst, i, 1) ;
354 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
355 brw_RNDZ(p, dst, src);
356 }
357 }
358 brw_set_saturate(p, 0);
359 }
360
361 static void emit_mov( struct brw_wm_compile *c,
362 struct prog_instruction *inst)
363 {
364 int i;
365 struct brw_compile *p = &c->func;
366 GLuint mask = inst->DstReg.WriteMask;
367 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
368 for (i = 0; i < 4; i++) {
369 if (mask & (1<<i)) {
370 struct brw_reg src, dst;
371 dst = get_dst_reg(c, inst, i, 1);
372 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
373 brw_MOV(p, dst, src);
374 }
375 }
376 brw_set_saturate(p, 0);
377 }
378
379 static void emit_pixel_xy(struct brw_wm_compile *c,
380 struct prog_instruction *inst)
381 {
382 struct brw_reg r1 = brw_vec1_grf(1, 0);
383 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
384
385 struct brw_reg dst0, dst1;
386 struct brw_compile *p = &c->func;
387 GLuint mask = inst->DstReg.WriteMask;
388
389 dst0 = get_dst_reg(c, inst, 0, 1);
390 dst1 = get_dst_reg(c, inst, 1, 1);
391 /* Calculate pixel centers by adding 1 or 0 to each of the
392 * micro-tile coordinates passed in r1.
393 */
394 if (mask & WRITEMASK_X) {
395 brw_ADD(p,
396 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
397 stride(suboffset(r1_uw, 4), 2, 4, 0),
398 brw_imm_v(0x10101010));
399 }
400
401 if (mask & WRITEMASK_Y) {
402 brw_ADD(p,
403 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
404 stride(suboffset(r1_uw, 5), 2, 4, 0),
405 brw_imm_v(0x11001100));
406 }
407 }
408
409 static void emit_delta_xy(struct brw_wm_compile *c,
410 struct prog_instruction *inst)
411 {
412 struct brw_reg r1 = brw_vec1_grf(1, 0);
413 struct brw_reg dst0, dst1, src0, src1;
414 struct brw_compile *p = &c->func;
415 GLuint mask = inst->DstReg.WriteMask;
416
417 dst0 = get_dst_reg(c, inst, 0, 1);
418 dst1 = get_dst_reg(c, inst, 1, 1);
419 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
420 src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1);
421 /* Calc delta X,Y by subtracting origin in r1 from the pixel
422 * centers.
423 */
424 if (mask & WRITEMASK_X) {
425 brw_ADD(p,
426 dst0,
427 retype(src0, BRW_REGISTER_TYPE_UW),
428 negate(r1));
429 }
430
431 if (mask & WRITEMASK_Y) {
432 brw_ADD(p,
433 dst1,
434 retype(src1, BRW_REGISTER_TYPE_UW),
435 negate(suboffset(r1,1)));
436
437 }
438 }
439
440 static void fire_fb_write( struct brw_wm_compile *c,
441 GLuint base_reg,
442 GLuint nr,
443 GLuint target,
444 GLuint eot)
445 {
446 struct brw_compile *p = &c->func;
447 /* Pass through control information:
448 */
449 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
450 {
451 brw_push_insn_state(p);
452 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
453 brw_MOV(p,
454 brw_message_reg(base_reg + 1),
455 brw_vec8_grf(1, 0));
456 brw_pop_insn_state(p);
457 }
458 /* Send framebuffer write message: */
459 brw_fb_WRITE(p,
460 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
461 base_reg,
462 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
463 target,
464 nr,
465 0,
466 eot);
467 }
468
469 static void emit_fb_write(struct brw_wm_compile *c,
470 struct prog_instruction *inst)
471 {
472 struct brw_compile *p = &c->func;
473 int nr = 2;
474 int channel;
475 GLuint target, eot;
476 struct brw_reg src0;
477
478 /* Reserve a space for AA - may not be needed:
479 */
480 if (c->key.aa_dest_stencil_reg)
481 nr += 1;
482
483 brw_push_insn_state(p);
484 for (channel = 0; channel < 4; channel++) {
485 src0 = get_src_reg(c, &inst->SrcReg[0], channel, 1);
486 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
487 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
488 brw_MOV(p, brw_message_reg(nr + channel), src0);
489 }
490 /* skip over the regs populated above: */
491 nr += 8;
492 brw_pop_insn_state(p);
493
494 if (c->key.source_depth_to_render_target) {
495 if (c->key.computes_depth) {
496 src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
497 brw_MOV(p, brw_message_reg(nr), src0);
498 }
499 else {
500 src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
501 brw_MOV(p, brw_message_reg(nr), src0);
502 }
503
504 nr += 2;
505 }
506
507 if (c->key.dest_depth_reg) {
508 GLuint comp = c->key.dest_depth_reg / 2;
509 GLuint off = c->key.dest_depth_reg % 2;
510
511 assert(comp == 1);
512 assert(off == 0);
513 #if 0
514 /* XXX do we need this code? comp always 1, off always 0, it seems */
515 if (off != 0) {
516 brw_push_insn_state(p);
517 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
518
519 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
520 /* 2nd half? */
521 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
522 brw_pop_insn_state(p);
523 }
524 else
525 #endif
526 {
527 struct brw_reg src = get_src_reg(c, &inst->SrcReg[1], 1, 1);
528 brw_MOV(p, brw_message_reg(nr), src);
529 }
530 nr += 2;
531 }
532
533 target = inst->Aux >> 1;
534 eot = inst->Aux & 1;
535 fire_fb_write(c, 0, nr, target, eot);
536 }
537
538 static void emit_pixel_w( struct brw_wm_compile *c,
539 struct prog_instruction *inst)
540 {
541 struct brw_compile *p = &c->func;
542 GLuint mask = inst->DstReg.WriteMask;
543 if (mask & WRITEMASK_W) {
544 struct brw_reg dst, src0, delta0, delta1;
545 struct brw_reg interp3;
546
547 dst = get_dst_reg(c, inst, 3, 1);
548 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
549 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
550 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
551
552 interp3 = brw_vec1_grf(src0.nr+1, 4);
553 /* Calc 1/w - just linterp wpos[3] optimized by putting the
554 * result straight into a message reg.
555 */
556 brw_LINE(p, brw_null_reg(), interp3, delta0);
557 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
558
559 /* Calc w */
560 brw_math_16( p, dst,
561 BRW_MATH_FUNCTION_INV,
562 BRW_MATH_SATURATE_NONE,
563 2, brw_null_reg(),
564 BRW_MATH_PRECISION_FULL);
565 }
566 }
567
568 static void emit_linterp(struct brw_wm_compile *c,
569 struct prog_instruction *inst)
570 {
571 struct brw_compile *p = &c->func;
572 GLuint mask = inst->DstReg.WriteMask;
573 struct brw_reg interp[4];
574 struct brw_reg dst, delta0, delta1;
575 struct brw_reg src0;
576 GLuint nr, i;
577
578 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
579 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
580 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
581 nr = src0.nr;
582
583 interp[0] = brw_vec1_grf(nr, 0);
584 interp[1] = brw_vec1_grf(nr, 4);
585 interp[2] = brw_vec1_grf(nr+1, 0);
586 interp[3] = brw_vec1_grf(nr+1, 4);
587
588 for(i = 0; i < 4; i++ ) {
589 if (mask & (1<<i)) {
590 dst = get_dst_reg(c, inst, i, 1);
591 brw_LINE(p, brw_null_reg(), interp[i], delta0);
592 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
593 }
594 }
595 }
596
597 static void emit_cinterp(struct brw_wm_compile *c,
598 struct prog_instruction *inst)
599 {
600 struct brw_compile *p = &c->func;
601 GLuint mask = inst->DstReg.WriteMask;
602
603 struct brw_reg interp[4];
604 struct brw_reg dst, src0;
605 GLuint nr, i;
606
607 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
608 nr = src0.nr;
609
610 interp[0] = brw_vec1_grf(nr, 0);
611 interp[1] = brw_vec1_grf(nr, 4);
612 interp[2] = brw_vec1_grf(nr+1, 0);
613 interp[3] = brw_vec1_grf(nr+1, 4);
614
615 for(i = 0; i < 4; i++ ) {
616 if (mask & (1<<i)) {
617 dst = get_dst_reg(c, inst, i, 1);
618 brw_MOV(p, dst, suboffset(interp[i],3));
619 }
620 }
621 }
622
623 static void emit_pinterp(struct brw_wm_compile *c,
624 struct prog_instruction *inst)
625 {
626 struct brw_compile *p = &c->func;
627 GLuint mask = inst->DstReg.WriteMask;
628
629 struct brw_reg interp[4];
630 struct brw_reg dst, delta0, delta1;
631 struct brw_reg src0, w;
632 GLuint nr, i;
633
634 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
635 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
636 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
637 w = get_src_reg(c, &inst->SrcReg[2], 3, 1);
638 nr = src0.nr;
639
640 interp[0] = brw_vec1_grf(nr, 0);
641 interp[1] = brw_vec1_grf(nr, 4);
642 interp[2] = brw_vec1_grf(nr+1, 0);
643 interp[3] = brw_vec1_grf(nr+1, 4);
644
645 for(i = 0; i < 4; i++ ) {
646 if (mask & (1<<i)) {
647 dst = get_dst_reg(c, inst, i, 1);
648 brw_LINE(p, brw_null_reg(), interp[i], delta0);
649 brw_MAC(p, dst, suboffset(interp[i],1),
650 delta1);
651 brw_MUL(p, dst, dst, w);
652 }
653 }
654 }
655
656 static void emit_xpd(struct brw_wm_compile *c,
657 struct prog_instruction *inst)
658 {
659 int i;
660 struct brw_compile *p = &c->func;
661 GLuint mask = inst->DstReg.WriteMask;
662 for (i = 0; i < 4; i++) {
663 GLuint i2 = (i+2)%3;
664 GLuint i1 = (i+1)%3;
665 if (mask & (1<<i)) {
666 struct brw_reg src0, src1, dst;
667 dst = get_dst_reg(c, inst, i, 1);
668 src0 = negate(get_src_reg(c, &inst->SrcReg[0], i2, 1));
669 src1 = get_src_reg(c, &inst->SrcReg[1], i1, 1);
670 brw_MUL(p, brw_null_reg(), src0, src1);
671 src0 = get_src_reg(c, &inst->SrcReg[0], i1, 1);
672 src1 = get_src_reg(c, &inst->SrcReg[1], i2, 1);
673 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
674 brw_MAC(p, dst, src0, src1);
675 brw_set_saturate(p, 0);
676 }
677 }
678 brw_set_saturate(p, 0);
679 }
680
681 static void emit_dp3(struct brw_wm_compile *c,
682 struct prog_instruction *inst)
683 {
684 struct brw_reg src0[3], src1[3], dst;
685 int i;
686 struct brw_compile *p = &c->func;
687 for (i = 0; i < 3; i++) {
688 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
689 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
690 }
691
692 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
693 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
694 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
695 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
696 brw_MAC(p, dst, src0[2], src1[2]);
697 brw_set_saturate(p, 0);
698 }
699
700 static void emit_dp4(struct brw_wm_compile *c,
701 struct prog_instruction *inst)
702 {
703 struct brw_reg src0[4], src1[4], dst;
704 int i;
705 struct brw_compile *p = &c->func;
706 for (i = 0; i < 4; i++) {
707 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
708 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
709 }
710 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
711 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
712 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
713 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
714 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
715 brw_MAC(p, dst, src0[3], src1[3]);
716 brw_set_saturate(p, 0);
717 }
718
719 static void emit_dph(struct brw_wm_compile *c,
720 struct prog_instruction *inst)
721 {
722 struct brw_reg src0[4], src1[4], dst;
723 int i;
724 struct brw_compile *p = &c->func;
725 for (i = 0; i < 4; i++) {
726 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
727 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
728 }
729 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
730 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
731 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
732 brw_MAC(p, dst, src0[2], src1[2]);
733 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
734 brw_ADD(p, dst, dst, src1[3]);
735 brw_set_saturate(p, 0);
736 }
737
738 /**
739 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
740 * Note that the result of the function is smeared across the dest
741 * register's X, Y, Z and W channels (subject to writemasking of course).
742 */
743 static void emit_math1(struct brw_wm_compile *c,
744 struct prog_instruction *inst, GLuint func)
745 {
746 struct brw_compile *p = &c->func;
747 struct brw_reg src0, dst, tmp;
748 const int mark = mark_tmps( c );
749 int i;
750
751 tmp = alloc_tmp(c);
752
753 /* Get first component of source register */
754 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
755
756 /* tmp = func(src0) */
757 brw_MOV(p, brw_message_reg(2), src0);
758 brw_math(p,
759 tmp,
760 func,
761 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
762 2,
763 brw_null_reg(),
764 BRW_MATH_DATA_VECTOR,
765 BRW_MATH_PRECISION_FULL);
766
767 /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
768
769 /* replicate tmp value across enabled dest channels */
770 for (i = 0; i < 4; i++) {
771 if (inst->DstReg.WriteMask & (1 << i)) {
772 dst = get_dst_reg(c, inst, i, 1);
773 brw_MOV(p, dst, tmp);
774 }
775 }
776
777 release_tmps(c, mark);
778 }
779
780 static void emit_rcp(struct brw_wm_compile *c,
781 struct prog_instruction *inst)
782 {
783 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
784 }
785
786 static void emit_rsq(struct brw_wm_compile *c,
787 struct prog_instruction *inst)
788 {
789 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
790 }
791
792 static void emit_sin(struct brw_wm_compile *c,
793 struct prog_instruction *inst)
794 {
795 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
796 }
797
798 static void emit_cos(struct brw_wm_compile *c,
799 struct prog_instruction *inst)
800 {
801 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
802 }
803
804 static void emit_ex2(struct brw_wm_compile *c,
805 struct prog_instruction *inst)
806 {
807 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
808 }
809
810 static void emit_lg2(struct brw_wm_compile *c,
811 struct prog_instruction *inst)
812 {
813 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
814 }
815
816 static void emit_add(struct brw_wm_compile *c,
817 struct prog_instruction *inst)
818 {
819 struct brw_compile *p = &c->func;
820 struct brw_reg src0, src1, dst;
821 GLuint mask = inst->DstReg.WriteMask;
822 int i;
823 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
824 for (i = 0 ; i < 4; i++) {
825 if (mask & (1<<i)) {
826 dst = get_dst_reg(c, inst, i, 1);
827 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
828 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
829 brw_ADD(p, dst, src0, src1);
830 }
831 }
832 brw_set_saturate(p, 0);
833 }
834
835 static void emit_sub(struct brw_wm_compile *c,
836 struct prog_instruction *inst)
837 {
838 struct brw_compile *p = &c->func;
839 struct brw_reg src0, src1, dst;
840 GLuint mask = inst->DstReg.WriteMask;
841 int i;
842 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
843 for (i = 0 ; i < 4; i++) {
844 if (mask & (1<<i)) {
845 dst = get_dst_reg(c, inst, i, 1);
846 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
847 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
848 brw_ADD(p, dst, src0, negate(src1));
849 }
850 }
851 brw_set_saturate(p, 0);
852 }
853
854 static void emit_mul(struct brw_wm_compile *c,
855 struct prog_instruction *inst)
856 {
857 struct brw_compile *p = &c->func;
858 struct brw_reg src0, src1, dst;
859 GLuint mask = inst->DstReg.WriteMask;
860 int i;
861 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
862 for (i = 0 ; i < 4; i++) {
863 if (mask & (1<<i)) {
864 dst = get_dst_reg(c, inst, i, 1);
865 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
866 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
867 brw_MUL(p, dst, src0, src1);
868 }
869 }
870 brw_set_saturate(p, 0);
871 }
872
873 static void emit_frc(struct brw_wm_compile *c,
874 struct prog_instruction *inst)
875 {
876 struct brw_compile *p = &c->func;
877 struct brw_reg src0, dst;
878 GLuint mask = inst->DstReg.WriteMask;
879 int i;
880 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
881 for (i = 0 ; i < 4; i++) {
882 if (mask & (1<<i)) {
883 dst = get_dst_reg(c, inst, i, 1);
884 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
885 brw_FRC(p, dst, src0);
886 }
887 }
888 if (inst->SaturateMode != SATURATE_OFF)
889 brw_set_saturate(p, 0);
890 }
891
892 static void emit_flr(struct brw_wm_compile *c,
893 struct prog_instruction *inst)
894 {
895 struct brw_compile *p = &c->func;
896 struct brw_reg src0, dst;
897 GLuint mask = inst->DstReg.WriteMask;
898 int i;
899 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
900 for (i = 0 ; i < 4; i++) {
901 if (mask & (1<<i)) {
902 dst = get_dst_reg(c, inst, i, 1);
903 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
904 brw_RNDD(p, dst, src0);
905 }
906 }
907 brw_set_saturate(p, 0);
908 }
909
910 static void emit_max(struct brw_wm_compile *c,
911 struct prog_instruction *inst)
912 {
913 struct brw_compile *p = &c->func;
914 GLuint mask = inst->DstReg.WriteMask;
915 struct brw_reg src0, src1, dst;
916 int i;
917 brw_push_insn_state(p);
918 for (i = 0; i < 4; i++) {
919 if (mask & (1<<i)) {
920 dst = get_dst_reg(c, inst, i, 1);
921 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
922 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
923 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
924 brw_MOV(p, dst, src0);
925 brw_set_saturate(p, 0);
926
927 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
928 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
929 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
930 brw_MOV(p, dst, src1);
931 brw_set_saturate(p, 0);
932 brw_set_predicate_control_flag_value(p, 0xff);
933 }
934 }
935 brw_pop_insn_state(p);
936 }
937
938 static void emit_min(struct brw_wm_compile *c,
939 struct prog_instruction *inst)
940 {
941 struct brw_compile *p = &c->func;
942 GLuint mask = inst->DstReg.WriteMask;
943 struct brw_reg src0, src1, dst;
944 int i;
945 brw_push_insn_state(p);
946 for (i = 0; i < 4; i++) {
947 if (mask & (1<<i)) {
948 dst = get_dst_reg(c, inst, i, 1);
949 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
950 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
951 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
952 brw_MOV(p, dst, src0);
953 brw_set_saturate(p, 0);
954
955 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
956 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
957 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
958 brw_MOV(p, dst, src1);
959 brw_set_saturate(p, 0);
960 brw_set_predicate_control_flag_value(p, 0xff);
961 }
962 }
963 brw_pop_insn_state(p);
964 }
965
966 static void emit_pow(struct brw_wm_compile *c,
967 struct prog_instruction *inst)
968 {
969 struct brw_compile *p = &c->func;
970 struct brw_reg dst, src0, src1;
971 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
972 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
973 src1 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
974
975 brw_MOV(p, brw_message_reg(2), src0);
976 brw_MOV(p, brw_message_reg(3), src1);
977
978 brw_math(p,
979 dst,
980 BRW_MATH_FUNCTION_POW,
981 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
982 2,
983 brw_null_reg(),
984 BRW_MATH_DATA_VECTOR,
985 BRW_MATH_PRECISION_FULL);
986 }
987
988 static void emit_lrp(struct brw_wm_compile *c,
989 struct prog_instruction *inst)
990 {
991 struct brw_compile *p = &c->func;
992 GLuint mask = inst->DstReg.WriteMask;
993 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
994 int i;
995 int mark = mark_tmps(c);
996 for (i = 0; i < 4; i++) {
997 if (mask & (1<<i)) {
998 dst = get_dst_reg(c, inst, i, 1);
999 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
1000
1001 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
1002
1003 if (src1.nr == dst.nr) {
1004 tmp1 = alloc_tmp(c);
1005 brw_MOV(p, tmp1, src1);
1006 } else
1007 tmp1 = src1;
1008
1009 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
1010 if (src2.nr == dst.nr) {
1011 tmp2 = alloc_tmp(c);
1012 brw_MOV(p, tmp2, src2);
1013 } else
1014 tmp2 = src2;
1015
1016 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1017 brw_MUL(p, brw_null_reg(), dst, tmp2);
1018 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1019 brw_MAC(p, dst, src0, tmp1);
1020 brw_set_saturate(p, 0);
1021 }
1022 release_tmps(c, mark);
1023 }
1024 }
1025
1026 /**
1027 * For GLSL shaders, this KIL will be unconditional.
1028 * It may be contained inside an IF/ENDIF structure of course.
1029 */
1030 static void emit_kil(struct brw_wm_compile *c)
1031 {
1032 struct brw_compile *p = &c->func;
1033 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1034 brw_push_insn_state(p);
1035 brw_set_mask_control(p, BRW_MASK_DISABLE);
1036 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1037 brw_AND(p, depth, c->emit_mask_reg, depth);
1038 brw_pop_insn_state(p);
1039 }
1040
1041 static void emit_mad(struct brw_wm_compile *c,
1042 struct prog_instruction *inst)
1043 {
1044 struct brw_compile *p = &c->func;
1045 GLuint mask = inst->DstReg.WriteMask;
1046 struct brw_reg dst, src0, src1, src2;
1047 int i;
1048
1049 for (i = 0; i < 4; i++) {
1050 if (mask & (1<<i)) {
1051 dst = get_dst_reg(c, inst, i, 1);
1052 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
1053 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
1054 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
1055 brw_MUL(p, dst, src0, src1);
1056
1057 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1058 brw_ADD(p, dst, dst, src2);
1059 brw_set_saturate(p, 0);
1060 }
1061 }
1062 }
1063
1064 static void emit_sop(struct brw_wm_compile *c,
1065 struct prog_instruction *inst, GLuint cond)
1066 {
1067 struct brw_compile *p = &c->func;
1068 GLuint mask = inst->DstReg.WriteMask;
1069 struct brw_reg dst, src0, src1;
1070 int i;
1071
1072 for (i = 0; i < 4; i++) {
1073 if (mask & (1<<i)) {
1074 dst = get_dst_reg(c, inst, i, 1);
1075 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
1076 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
1077 brw_push_insn_state(p);
1078 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1079 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1080 brw_MOV(p, dst, brw_imm_f(0.0));
1081 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1082 brw_MOV(p, dst, brw_imm_f(1.0));
1083 brw_pop_insn_state(p);
1084 }
1085 }
1086 }
1087
1088 static void emit_slt(struct brw_wm_compile *c,
1089 struct prog_instruction *inst)
1090 {
1091 emit_sop(c, inst, BRW_CONDITIONAL_L);
1092 }
1093
1094 static void emit_sle(struct brw_wm_compile *c,
1095 struct prog_instruction *inst)
1096 {
1097 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1098 }
1099
1100 static void emit_sgt(struct brw_wm_compile *c,
1101 struct prog_instruction *inst)
1102 {
1103 emit_sop(c, inst, BRW_CONDITIONAL_G);
1104 }
1105
1106 static void emit_sge(struct brw_wm_compile *c,
1107 struct prog_instruction *inst)
1108 {
1109 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1110 }
1111
1112 static void emit_seq(struct brw_wm_compile *c,
1113 struct prog_instruction *inst)
1114 {
1115 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1116 }
1117
1118 static void emit_sne(struct brw_wm_compile *c,
1119 struct prog_instruction *inst)
1120 {
1121 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1122 }
1123
1124 static void emit_ddx(struct brw_wm_compile *c,
1125 struct prog_instruction *inst)
1126 {
1127 struct brw_compile *p = &c->func;
1128 GLuint mask = inst->DstReg.WriteMask;
1129 struct brw_reg interp[4];
1130 struct brw_reg dst;
1131 struct brw_reg src0, w;
1132 GLuint nr, i;
1133 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1134 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1135 nr = src0.nr;
1136 interp[0] = brw_vec1_grf(nr, 0);
1137 interp[1] = brw_vec1_grf(nr, 4);
1138 interp[2] = brw_vec1_grf(nr+1, 0);
1139 interp[3] = brw_vec1_grf(nr+1, 4);
1140 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1141 for(i = 0; i < 4; i++ ) {
1142 if (mask & (1<<i)) {
1143 dst = get_dst_reg(c, inst, i, 1);
1144 brw_MOV(p, dst, interp[i]);
1145 brw_MUL(p, dst, dst, w);
1146 }
1147 }
1148 brw_set_saturate(p, 0);
1149 }
1150
1151 static void emit_ddy(struct brw_wm_compile *c,
1152 struct prog_instruction *inst)
1153 {
1154 struct brw_compile *p = &c->func;
1155 GLuint mask = inst->DstReg.WriteMask;
1156 struct brw_reg interp[4];
1157 struct brw_reg dst;
1158 struct brw_reg src0, w;
1159 GLuint nr, i;
1160
1161 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1162 nr = src0.nr;
1163 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1164 interp[0] = brw_vec1_grf(nr, 0);
1165 interp[1] = brw_vec1_grf(nr, 4);
1166 interp[2] = brw_vec1_grf(nr+1, 0);
1167 interp[3] = brw_vec1_grf(nr+1, 4);
1168 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1169 for(i = 0; i < 4; i++ ) {
1170 if (mask & (1<<i)) {
1171 dst = get_dst_reg(c, inst, i, 1);
1172 brw_MOV(p, dst, suboffset(interp[i], 1));
1173 brw_MUL(p, dst, dst, w);
1174 }
1175 }
1176 brw_set_saturate(p, 0);
1177 }
1178
1179 static INLINE struct brw_reg high_words( struct brw_reg reg )
1180 {
1181 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1182 0, 8, 2 );
1183 }
1184
1185 static INLINE struct brw_reg low_words( struct brw_reg reg )
1186 {
1187 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1188 }
1189
1190 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1191 {
1192 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1193 }
1194
1195 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1196 {
1197 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1198 0, 16, 2 );
1199 }
1200
1201 /* One-, two- and three-dimensional Perlin noise, similar to the description
1202 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
/**
 * Emit the body of the 1-D noise subroutine.
 *
 * The argument is fetched with lookup_tmp( c, mark - 2 ) and the final MUL
 * writes the result back into that same register, so the caller reads the
 * noise value from the temporary it passed in.  The parameter register is
 * clobbered along the way (first by FRC, later by the interpolation
 * weight).
 * NOTE(review): the slot at mark - 1 is presumably reserved by
 * invoke_subroutine (e.g. for a return address) -- confirm there.
 *
 * All temporaries allocated here are released again before returning.
 */
1203 static void noise1_sub( struct brw_wm_compile *c ) {
1204
1205 struct brw_compile *p = &c->func;
1206 struct brw_reg param,
1207 x0, x1, /* gradients at each end */
1208 t, tmp[ 2 ], /* float temporaries */
1209 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1210 int i;
1211 int mark = mark_tmps( c );
1212
1213 x0 = alloc_tmp( c );
1214 x1 = alloc_tmp( c );
1215 t = alloc_tmp( c );
1216 tmp[ 0 ] = alloc_tmp( c );
1217 tmp[ 1 ] = alloc_tmp( c );
/* itmp[ 0..1 ] view tmp[ 0..1 ]; itmp[ 2..4 ] share storage with x0, x1
   and t, so the hash constants parked there must no longer be needed by
   the time those floats are written further down. */
1218 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1219 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1220 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1221 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1222 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1223
1224 param = lookup_tmp( c, mark - 2 );
1225
1226 brw_set_access_mode( p, BRW_ALIGN_1 );
1227
1228 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1229
1230 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1231 be hashed. Also compute the remainder (offset within the unit
1232 length), interleaved to reduce register dependency penalties. */
1233 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1234 brw_FRC( p, param, param );
1235 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1236 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1237 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1238
1239 /* We're now ready to perform the hashing. The two hashes are
1240 interleaved for performance. The hash function used is
1241 designed to rapidly achieve avalanche and require only 32x16
1242 bit multiplication, and 16-bit swizzles (which we get for
1243 free). We can't use immediate operands in the multiplies,
1244 because immediates are permitted only in src1 and the 16-bit
1245 factor is permitted only in src0. */
/* Three rounds of: multiply by a constant, then fold the high word into
   the low word with XOR. */
1246 for( i = 0; i < 2; i++ )
1247 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1248 for( i = 0; i < 2; i++ )
1249 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1250 high_words( itmp[ i ] ) );
1251 for( i = 0; i < 2; i++ )
1252 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1253 for( i = 0; i < 2; i++ )
1254 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1255 high_words( itmp[ i ] ) );
1256 for( i = 0; i < 2; i++ )
1257 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1258 for( i = 0; i < 2; i++ )
1259 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1260 high_words( itmp[ i ] ) );
1261
1262 /* Now we want to initialise the two gradients based on the
1263 hashes. Format conversion from signed integer to float leaves
1264 everything scaled too high by a factor of pow( 2, 31 ), but
1265 we correct for that right at the end. */
/* From here on t, x0 and x1 are written as floats, overwriting the hash
   constants that lived in itmp[ 4 ], itmp[ 2 ] and itmp[ 3 ]. */
1266 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1267 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1268 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1269
1270 brw_MUL( p, x0, x0, param );
1271 brw_MUL( p, x1, x1, t );
1272
1273 /* We interpolate between the gradients using the polynomial
1274 6t^5 - 15t^4 + 10t^3 (Perlin). */
1275 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1276 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1277 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1278 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1279 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1280 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1281 pipeline */
1282 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
/* param now becomes the interpolation weight 6t^5 - 15t^4 + 10t^3. */
1283 brw_MUL( p, param, tmp[ 0 ], param );
1284 brw_MUL( p, x1, x1, param );
1285 brw_ADD( p, x0, x0, x1 );
1286 /* scale by pow( 2, -30 ), to compensate for the format conversion
1287 above and an extra factor of 2 so that a single gradient covers
1288 the [-1,1] range */
1289 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1290
1291 release_tmps( c, mark );
1292 }
1293
1294 static void emit_noise1( struct brw_wm_compile *c,
1295 struct prog_instruction *inst )
1296 {
1297 struct brw_compile *p = &c->func;
1298 struct brw_reg src, param, dst;
1299 GLuint mask = inst->DstReg.WriteMask;
1300 int i;
1301 int mark = mark_tmps( c );
1302
1303 assert( mark == 0 );
1304
1305 src = get_src_reg( c, inst->SrcReg, 0, 1 );
1306
1307 param = alloc_tmp( c );
1308
1309 brw_MOV( p, param, src );
1310
1311 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1312
1313 /* Fill in the result: */
1314 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1315 for (i = 0 ; i < 4; i++) {
1316 if (mask & (1<<i)) {
1317 dst = get_dst_reg(c, inst, i, 1);
1318 brw_MOV( p, dst, param );
1319 }
1320 }
1321 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1322 brw_set_saturate( p, 0 );
1323
1324 release_tmps( c, mark );
1325 }
1326
/**
 * Emit the body of the 2-D noise subroutine.
 *
 * The two coordinates are fetched with lookup_tmp() at mark - 3 and
 * mark - 2.  The final MUL writes the result over the first of them
 * (param0), which is where the caller reads it back.  Both parameter
 * registers are clobbered along the way (first by FRC, later by the
 * interpolation weights).
 * NOTE(review): the slot at mark - 1 is presumably reserved by
 * invoke_subroutine (e.g. for a return address) -- confirm there.
 *
 * All temporaries allocated here are released again before returning.
 */
1327 static void noise2_sub( struct brw_wm_compile *c ) {
1328
1329 struct brw_compile *p = &c->func;
1330 struct brw_reg param0, param1,
1331 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1332 t, tmp[ 4 ], /* float temporaries */
1333 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1334 int i;
1335 int mark = mark_tmps( c );
1336
1337 x0y0 = alloc_tmp( c );
1338 x0y1 = alloc_tmp( c );
1339 x1y0 = alloc_tmp( c );
1340 x1y1 = alloc_tmp( c );
1341 t = alloc_tmp( c );
1342 for( i = 0; i < 4; i++ ) {
1343 tmp[ i ] = alloc_tmp( c );
1344 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1345 }
/* itmp[ 4..6 ] share storage with x0y0, x0y1 and x1y0, so the hash
   constants parked there must no longer be needed by the time those
   gradients are written further down. */
1346 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1347 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1348 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1349
1350 param0 = lookup_tmp( c, mark - 3 );
1351 param1 = lookup_tmp( c, mark - 2 );
1352
1353 brw_set_access_mode( p, BRW_ALIGN_1 );
1354
1355 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1356 be hashed. Also compute the remainders (offsets within the unit
1357 square), interleaved to reduce register dependency penalties. */
1358 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1359 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1360 brw_FRC( p, param0, param0 );
1361 brw_FRC( p, param1, param1 );
1362 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
/* Fold the low word of the second coordinate's floor into the high word
   of itmp[ 0 ], so one 32-bit value carries both coordinates. */
1363 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1364 low_words( itmp[ 1 ] ) );
1365 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1366 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
/* The other three corners are the base corner plus 1 in the high word,
   in the low word, or in both. */
1367 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1368 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1369 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1370
1371 /* We're now ready to perform the hashing. The four hashes are
1372 interleaved for performance. The hash function used is
1373 designed to rapidly achieve avalanche and require only 32x16
1374 bit multiplication, and 16-bit swizzles (which we get for
1375 free). We can't use immediate operands in the multiplies,
1376 because immediates are permitted only in src1 and the 16-bit
1377 factor is permitted only in src0. */
1378 for( i = 0; i < 4; i++ )
1379 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1380 for( i = 0; i < 4; i++ )
1381 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1382 high_words( itmp[ i ] ) );
1383 for( i = 0; i < 4; i++ )
1384 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1385 for( i = 0; i < 4; i++ )
1386 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1387 high_words( itmp[ i ] ) );
1388 for( i = 0; i < 4; i++ )
1389 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1390 for( i = 0; i < 4; i++ )
1391 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1392 high_words( itmp[ i ] ) );
1393
1394 /* Now we want to initialise the four gradients based on the
1395 hashes. Format conversion from signed integer to float leaves
1396 everything scaled too high by a factor of pow( 2, 15 ), but
1397 we correct for that right at the end. */
/* The low word of each hash supplies the gradient component that is
   multiplied by the param0 offsets; the high word supplies the one
   multiplied by the param1 offsets. */
1398 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1399 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1400 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1401 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1402 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1403
1404 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1405 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1406 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1407 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1408
1409 brw_MUL( p, x1y0, x1y0, t );
1410 brw_MUL( p, x1y1, x1y1, t );
/* t is reused: from here on it holds param1 - 1. */
1411 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1412 brw_MUL( p, x0y0, x0y0, param0 );
1413 brw_MUL( p, x0y1, x0y1, param0 );
1414
1415 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1416 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1417 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1418 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1419
1420 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1421 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1422 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1423 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1424
1425 /* We interpolate between the gradients using the polynomial
1426 6t^5 - 15t^4 + 10t^3 (Perlin). */
1427 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1428 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1429 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1430 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1431 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1432 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1433 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1434 pipeline */
1435 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1436 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1437 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1438 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1439 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1440 pipeline */
1441 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1442 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
/* param0 / param1 now become the interpolation weights for their axes. */
1443 brw_MUL( p, param0, tmp[ 0 ], param0 );
1444 brw_MUL( p, param1, tmp[ 1 ], param1 );
1445
1446 /* Here we interpolate in the y dimension... */
1447 brw_MUL( p, x0y1, x0y1, param1 );
1448 brw_MUL( p, x1y1, x1y1, param1 );
1449 brw_ADD( p, x0y0, x0y0, x0y1 );
1450 brw_ADD( p, x1y0, x1y0, x1y1 );
1451
1452 /* And now in x. There are horrible register dependencies here,
1453 but we have nothing else to do. */
1454 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1455 brw_MUL( p, x1y0, x1y0, param0 );
1456 brw_ADD( p, x0y0, x0y0, x1y0 );
1457
1458 /* scale by pow( 2, -15 ), as described above */
1459 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1460
1461 release_tmps( c, mark );
1462 }
1463
1464 static void emit_noise2( struct brw_wm_compile *c,
1465 struct prog_instruction *inst )
1466 {
1467 struct brw_compile *p = &c->func;
1468 struct brw_reg src0, src1, param0, param1, dst;
1469 GLuint mask = inst->DstReg.WriteMask;
1470 int i;
1471 int mark = mark_tmps( c );
1472
1473 assert( mark == 0 );
1474
1475 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1476 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1477
1478 param0 = alloc_tmp( c );
1479 param1 = alloc_tmp( c );
1480
1481 brw_MOV( p, param0, src0 );
1482 brw_MOV( p, param1, src1 );
1483
1484 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1485
1486 /* Fill in the result: */
1487 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1488 for (i = 0 ; i < 4; i++) {
1489 if (mask & (1<<i)) {
1490 dst = get_dst_reg(c, inst, i, 1);
1491 brw_MOV( p, dst, param0 );
1492 }
1493 }
1494 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1495 brw_set_saturate( p, 0 );
1496
1497 release_tmps( c, mark );
1498 }
1499
1500 /**
1501 * The three-dimensional case is much like the one- and two- versions above,
1502 * but since the number of corners is rapidly growing we now pack 16 16-bit
1503 * hashes into each register to extract more parallelism from the EUs.
1504 */
/*
 * Calling convention, as visible in this function: the three coordinates
 * are fetched with lookup_tmp() at mark - 4, mark - 3 and mark - 2.  The
 * final MUL writes the result over the first of them (param0), which is
 * where the caller reads it back.  All three parameter registers are
 * overwritten with their fractional parts by FRC.
 * NOTE(review): the slot at mark - 1 is presumably reserved by
 * invoke_subroutine (e.g. for a return address) -- confirm there.
 *
 * The cube is processed as two faces of four corners each: the first face
 * is reduced to a single value kept in tmp[ 0 ], the second to tmp[ 1 ],
 * and the two are then blended along z.  All temporaries allocated here
 * are released again before returning.
 */
1505 static void noise3_sub( struct brw_wm_compile *c ) {
1506
1507 struct brw_compile *p = &c->func;
1508 struct brw_reg param0, param1, param2,
1509 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1510 xi, yi, zi, /* interpolation coefficients */
1511 t, tmp[ 8 ], /* float temporaries */
1512 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1513 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1514 int i;
1515 int mark = mark_tmps( c );
1516
1517 x0y0 = alloc_tmp( c );
1518 x0y1 = alloc_tmp( c );
1519 x1y0 = alloc_tmp( c );
1520 x1y1 = alloc_tmp( c );
1521 xi = alloc_tmp( c );
1522 yi = alloc_tmp( c );
1523 zi = alloc_tmp( c );
1524 t = alloc_tmp( c );
/* tmp[ i ], itmp[ i ] and wtmp[ i ] are three views of the same register. */
1525 for( i = 0; i < 8; i++ ) {
1526 tmp[ i ] = alloc_tmp( c );
1527 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1528 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1529 }
1530
1531 param0 = lookup_tmp( c, mark - 4 );
1532 param1 = lookup_tmp( c, mark - 3 );
1533 param2 = lookup_tmp( c, mark - 2 );
1534
1535 brw_set_access_mode( p, BRW_ALIGN_1 );
1536
1537 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1538 be hashed. Also compute the remainders (offsets within the unit
1539 cube), interleaved to reduce register dependency penalties. */
1540 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1541 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1542 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1543 brw_FRC( p, param0, param0 );
1544 brw_FRC( p, param1, param1 );
1545 brw_FRC( p, param2, param2 );
1546 /* Since we now have only 16 bits of precision in the hash, we must
1547 be more careful about thorough mixing to maintain entropy as we
1548 squash the input vector into a small scalar. */
/* The MUL and first MAC target the null register; only the last MAC
   writes a real destination (NOTE(review): presumably the running sum is
   carried in the accumulator -- confirm against the ISA reference). */
1549 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1550 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1551 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1552 brw_imm_uw( 0x9B93 ) );
1553 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1554 brw_imm_uw( 0xBC8F ) );
1555
1556 /* Temporarily disable the execution mask while we work with ExecSize=16
1557 channels (the mask is set for ExecSize=8 and is probably incorrect).
1558 Although this might cause execution of unwanted channels, the code
1559 writes only to temporary registers and has no side effects, so
1560 disabling the mask is harmless. */
1561 brw_push_insn_state( p );
1562 brw_set_mask_control( p, BRW_MASK_DISABLE );
1563 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1564 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1565 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1566
1567 /* We're now ready to perform the hashing. The eight hashes are
1568 interleaved for performance. The hash function used is
1569 designed to rapidly achieve avalanche and require only 16x16
1570 bit multiplication, and 8-bit swizzles (which we get for
1571 free). */
1572 for( i = 0; i < 4; i++ )
1573 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1574 for( i = 0; i < 4; i++ )
1575 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1576 odd_bytes( wtmp[ i ] ) );
1577 for( i = 0; i < 4; i++ )
1578 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1579 for( i = 0; i < 4; i++ )
1580 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1581 odd_bytes( wtmp[ i ] ) );
1582 brw_pop_insn_state( p );
1583
1584 /* Now we want to initialise the four rear gradients based on the
1585 hashes. Format conversion from signed integer to float leaves
1586 everything scaled too high by a factor of pow( 2, 15 ), but
1587 we correct for that right at the end. */
1588 /* x component */
1589 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1590 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1591 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1592 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1593 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1594
/* Shift every 16-bit hash left by 5 so that the next component is built
   from a different view of the hash bits (NOTE(review): presumably to
   decorrelate the per-axis gradient components -- confirm intent). */
1595 brw_push_insn_state( p );
1596 brw_set_mask_control( p, BRW_MASK_DISABLE );
1597 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1598 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1599 brw_pop_insn_state( p );
1600
1601 brw_MUL( p, x1y0, x1y0, t );
1602 brw_MUL( p, x1y1, x1y1, t );
1603 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1604 brw_MUL( p, x0y0, x0y0, param0 );
1605 brw_MUL( p, x0y1, x0y1, param0 );
1606
1607 /* y component */
1608 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1609 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1610 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1611 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1612
1613 brw_push_insn_state( p );
1614 brw_set_mask_control( p, BRW_MASK_DISABLE );
1615 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1616 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1617 brw_pop_insn_state( p );
1618
1619 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1620 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
/* t is reloaded with param0 - 1 here; its next consumer is the x
   component of the front four gradients further down. */
1621 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1622 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1623 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1624
1625 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1626 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1627 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1628 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1629
1630 /* z component */
1631 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1632 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1633 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1634 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1635
/* This face uses the z offset param2 itself; the second face below uses
   param2 - 1 (held in t). */
1636 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1637 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1638 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1639 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1640
1641 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1642 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1643 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1644 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1645
1646 /* We interpolate between the gradients using the polynomial
1647 6t^5 - 15t^4 + 10t^3 (Perlin). */
/* Unlike the 1-D and 2-D versions, the weights go into separate registers
   (xi, yi, zi) because param0..param2 are still needed for the second
   face. */
1648 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1649 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1650 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1651 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1652 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1653 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1654 brw_MUL( p, xi, xi, param0 );
1655 brw_MUL( p, yi, yi, param1 );
1656 brw_MUL( p, zi, zi, param2 );
1657 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1658 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1659 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1660 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1661 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1662 brw_MUL( p, xi, xi, param0 );
1663 brw_MUL( p, yi, yi, param1 );
1664 brw_MUL( p, zi, zi, param2 );
1665 brw_MUL( p, xi, xi, param0 );
1666 brw_MUL( p, yi, yi, param1 );
1667 brw_MUL( p, zi, zi, param2 );
1668 brw_MUL( p, xi, xi, param0 );
1669 brw_MUL( p, yi, yi, param1 );
1670 brw_MUL( p, zi, zi, param2 );
1671
1672 /* Here we interpolate in the y dimension... */
1673 brw_MUL( p, x0y1, x0y1, yi );
1674 brw_MUL( p, x1y1, x1y1, yi );
1675 brw_ADD( p, x0y0, x0y0, x0y1 );
1676 brw_ADD( p, x1y0, x1y0, x1y1 );
1677
1678 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1679 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1680 brw_MUL( p, x1y0, x1y0, xi );
1681 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1682
1683 /* Now do the same thing for the front four gradients... */
1684 /* x component */
1685 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1686 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1687 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1688 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1689
1690 brw_push_insn_state( p );
1691 brw_set_mask_control( p, BRW_MASK_DISABLE );
1692 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1693 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1694 brw_pop_insn_state( p );
1695
/* t still holds param0 - 1 from the first face's y-component step. */
1696 brw_MUL( p, x1y0, x1y0, t );
1697 brw_MUL( p, x1y1, x1y1, t );
1698 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1699 brw_MUL( p, x0y0, x0y0, param0 );
1700 brw_MUL( p, x0y1, x0y1, param0 );
1701
1702 /* y component */
1703 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1704 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1705 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1706 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1707
1708 brw_push_insn_state( p );
1709 brw_set_mask_control( p, BRW_MASK_DISABLE );
1710 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1711 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1712 brw_pop_insn_state( p );
1713
1714 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1715 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1716 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1717 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1718 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1719
1720 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1721 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1722 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1723 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1724
1725 /* z component */
1726 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1727 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1728 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1729 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1730
1731 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1732 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1733 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1734 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1735
1736 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1737 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1738 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1739 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1740
1741 /* The interpolation coefficients are still around from last time, so
1742 again interpolate in the y dimension... */
1743 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1744 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1745 brw_MUL( p, x0y1, x0y1, yi );
1746 brw_MUL( p, x1y1, x1y1, yi );
1747 brw_ADD( p, x0y0, x0y0, x0y1 );
1748 brw_ADD( p, x1y0, x1y0, x1y1 );
1749
1750 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1751 time put the front face in tmp[ 1 ] and we're nearly there... */
1752 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1753 brw_MUL( p, x1y0, x1y0, xi );
1754 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1755
1756 /* The final interpolation, in the z dimension: */
1757 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1758 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1759 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1760
1761 /* scale by pow( 2, -15 ), as described above */
1762 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1763
1764 release_tmps( c, mark );
1765 }
1766
1767 static void emit_noise3( struct brw_wm_compile *c,
1768 struct prog_instruction *inst )
1769 {
1770 struct brw_compile *p = &c->func;
1771 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1772 GLuint mask = inst->DstReg.WriteMask;
1773 int i;
1774 int mark = mark_tmps( c );
1775
1776 assert( mark == 0 );
1777
1778 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1779 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1780 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
1781
1782 param0 = alloc_tmp( c );
1783 param1 = alloc_tmp( c );
1784 param2 = alloc_tmp( c );
1785
1786 brw_MOV( p, param0, src0 );
1787 brw_MOV( p, param1, src1 );
1788 brw_MOV( p, param2, src2 );
1789
1790 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1791
1792 /* Fill in the result: */
1793 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1794 for (i = 0 ; i < 4; i++) {
1795 if (mask & (1<<i)) {
1796 dst = get_dst_reg(c, inst, i, 1);
1797 brw_MOV( p, dst, param0 );
1798 }
1799 }
1800 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1801 brw_set_saturate( p, 0 );
1802
1803 release_tmps( c, mark );
1804 }
1805
1806 /**
1807 * For the four-dimensional case, the little micro-optimisation benefits
1808 * we obtain by unrolling all the loops aren't worth the massive bloat it
1809 * now causes. Instead, we loop twice around performing a similar operation
1810 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1811 * code to glue it all together.
1812 */
1813 static void noise4_sub( struct brw_wm_compile *c )
1814 {
1815 struct brw_compile *p = &c->func;
1816 struct brw_reg param[ 4 ],
1817 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1818 w0, /* noise for the w=0 cube */
1819 floors[ 2 ], /* integer coordinates of base corner of hypercube */
1820 interp[ 4 ], /* interpolation coefficients */
1821 t, tmp[ 8 ], /* float temporaries */
1822 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1823 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1824 int i, j;
1825 int mark = mark_tmps( c );
1826 GLuint loop, origin;
1827
1828 x0y0 = alloc_tmp( c );
1829 x0y1 = alloc_tmp( c );
1830 x1y0 = alloc_tmp( c );
1831 x1y1 = alloc_tmp( c );
1832 t = alloc_tmp( c );
1833 w0 = alloc_tmp( c );
1834 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1835 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1836
1837 for( i = 0; i < 4; i++ ) {
1838 param[ i ] = lookup_tmp( c, mark - 5 + i );
1839 interp[ i ] = alloc_tmp( c );
1840 }
1841
1842 for( i = 0; i < 8; i++ ) {
1843 tmp[ i ] = alloc_tmp( c );
1844 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1845 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1846 }
1847
1848 brw_set_access_mode( p, BRW_ALIGN_1 );
1849
1850 /* We only want 16 bits of precision from the integral part of each
1851 co-ordinate, but unfortunately the RNDD semantics would saturate
1852 at 16 bits if we performed the operation directly to a 16-bit
1853 destination. Therefore, we round to 32-bit temporaries where
1854 appropriate, and then store only the lower 16 bits. */
1855 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1856 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1857 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1858 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1859 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1860 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
1861
1862 /* Modify the flag register here, because the side effect is useful
1863 later (see below). We know for certain that all flags will be
1864 cleared, since the FRC instruction cannot possibly generate
1865 negative results. Even for exceptional inputs (infinities, denormals,
1866 NaNs), the architecture guarantees that the L conditional is false. */
1867 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1868 brw_FRC( p, param[ 0 ], param[ 0 ] );
1869 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1870 for( i = 1; i < 4; i++ )
1871 brw_FRC( p, param[ i ], param[ i ] );
1872
1873 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1874 of all. */
1875 for( i = 0; i < 4; i++ )
1876 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1877 for( i = 0; i < 4; i++ )
1878 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1879 for( i = 0; i < 4; i++ )
1880 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1881 for( i = 0; i < 4; i++ )
1882 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1883 for( j = 0; j < 3; j++ )
1884 for( i = 0; i < 4; i++ )
1885 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1886
1887 /* Mark the current address, as it will be a jump destination. The
1888 following code will be executed twice: first, with the flag
1889 register clear indicating the w=0 case, and second with flags
1890 set for w=1. */
1891 loop = p->nr_insn;
1892
1893 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1894 be hashed. Since we have only 16 bits of precision in the hash, we
1895 must be careful about thorough mixing to maintain entropy as we
1896 squash the input vector into a small scalar. */
1897 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1898 brw_imm_uw( 0xBC8F ) );
1899 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1900 brw_imm_uw( 0xD0BD ) );
1901 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1902 brw_imm_uw( 0x9B93 ) );
1903 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1904 brw_imm_uw( 0xA359 ) );
1905 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1906 brw_imm_uw( 0xBC8F ) );
1907
1908 /* Temporarily disable the execution mask while we work with ExecSize=16
1909 channels (the mask is set for ExecSize=8 and is probably incorrect).
1910 Although this might cause execution of unwanted channels, the code
1911 writes only to temporary registers and has no side effects, so
1912 disabling the mask is harmless. */
1913 brw_push_insn_state( p );
1914 brw_set_mask_control( p, BRW_MASK_DISABLE );
1915 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1916 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1917 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1918
1919 /* We're now ready to perform the hashing. The eight hashes are
1920 interleaved for performance. The hash function used is
1921 designed to rapidly achieve avalanche and require only 16x16
1922 bit multiplication, and 8-bit swizzles (which we get for
1923 free). */
1924 for( i = 0; i < 4; i++ )
1925 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1926 for( i = 0; i < 4; i++ )
1927 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1928 odd_bytes( wtmp[ i ] ) );
1929 for( i = 0; i < 4; i++ )
1930 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1931 for( i = 0; i < 4; i++ )
1932 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1933 odd_bytes( wtmp[ i ] ) );
1934 brw_pop_insn_state( p );
1935
1936 /* Now we want to initialise the four rear gradients based on the
1937 hashes. Format conversion from signed integer to float leaves
1938 everything scaled too high by a factor of pow( 2, 15 ), but
1939 we correct for that right at the end. */
1940 /* x component */
1941 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1942 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1943 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1944 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1945 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1946
1947 brw_push_insn_state( p );
1948 brw_set_mask_control( p, BRW_MASK_DISABLE );
1949 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1950 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1951 brw_pop_insn_state( p );
1952
1953 brw_MUL( p, x1y0, x1y0, t );
1954 brw_MUL( p, x1y1, x1y1, t );
1955 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1956 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1957 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1958
1959 /* y component */
1960 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1961 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1962 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1963 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1964
1965 brw_push_insn_state( p );
1966 brw_set_mask_control( p, BRW_MASK_DISABLE );
1967 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1968 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1969 brw_pop_insn_state( p );
1970
1971 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1972 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1973 /* prepare t for the w component (used below): w the first time through
1974 the loop; w - 1 the second time) */
1975 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1976 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1977 p->current->header.predicate_inverse = 1;
1978 brw_MOV( p, t, param[ 3 ] );
1979 p->current->header.predicate_inverse = 0;
1980 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1981 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1982 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1983
1984 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1985 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1986 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1987 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1988
1989 /* z component */
1990 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1991 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1992 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1993 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1994
1995 brw_push_insn_state( p );
1996 brw_set_mask_control( p, BRW_MASK_DISABLE );
1997 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1998 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1999 brw_pop_insn_state( p );
2000
2001 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2002 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2003 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2004 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2005
2006 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2007 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2008 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2009 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2010
2011 /* w component */
2012 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2013 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2014 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2015 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2016
2017 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2018 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2019 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2020 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2021 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2022
2023 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2024 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2025 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2026 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2027
2028 /* Here we interpolate in the y dimension... */
2029 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2030 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2031 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2032 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2033 brw_ADD( p, x0y0, x0y0, x0y1 );
2034 brw_ADD( p, x1y0, x1y0, x1y1 );
2035
2036 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2037 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2038 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2039 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2040
2041 /* Now do the same thing for the front four gradients... */
2042 /* x component */
2043 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2044 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2045 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2046 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2047
2048 brw_push_insn_state( p );
2049 brw_set_mask_control( p, BRW_MASK_DISABLE );
2050 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2051 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2052 brw_pop_insn_state( p );
2053
2054 brw_MUL( p, x1y0, x1y0, t );
2055 brw_MUL( p, x1y1, x1y1, t );
2056 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2057 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2058 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2059
2060 /* y component */
2061 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2062 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2063 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2064 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2065
2066 brw_push_insn_state( p );
2067 brw_set_mask_control( p, BRW_MASK_DISABLE );
2068 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2069 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2070 brw_pop_insn_state( p );
2071
2072 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2073 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2074 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2075 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2076 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2077
2078 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2079 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2080 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2081 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2082
2083 /* z component */
2084 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2085 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2086 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2087 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2088
2089 brw_push_insn_state( p );
2090 brw_set_mask_control( p, BRW_MASK_DISABLE );
2091 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2092 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2093 brw_pop_insn_state( p );
2094
2095 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2096 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2097 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2098 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2099 /* prepare t for the w component (used below): w the first time through
2100 the loop; w - 1 the second time) */
2101 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2102 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2103 p->current->header.predicate_inverse = 1;
2104 brw_MOV( p, t, param[ 3 ] );
2105 p->current->header.predicate_inverse = 0;
2106 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2107
2108 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2109 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2110 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2111 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2112
2113 /* w component */
2114 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2115 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2116 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2117 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2118
2119 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2120 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2121 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2122 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2123
2124 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2125 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2126 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2127 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2128
2129 /* Interpolate in the y dimension: */
2130 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2131 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2132 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2133 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2134 brw_ADD( p, x0y0, x0y0, x0y1 );
2135 brw_ADD( p, x1y0, x1y0, x1y1 );
2136
2137 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2138 time put the front face in tmp[ 1 ] and we're nearly there... */
2139 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2140 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2141 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2142
2143 /* Another interpolation, in the z dimension: */
2144 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2145 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2146 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2147
2148 /* Exit the loop if we've computed both cubes... */
2149 origin = p->nr_insn;
2150 brw_push_insn_state( p );
2151 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2152 brw_set_mask_control( p, BRW_MASK_DISABLE );
2153 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2154 brw_pop_insn_state( p );
2155
2156 /* Save the result for the w=0 case, and increment the w coordinate: */
2157 brw_MOV( p, w0, tmp[ 0 ] );
2158 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2159 brw_imm_uw( 1 ) );
2160
2161 /* Loop around for the other cube. Explicitly set the flag register
2162 (unfortunately we must spend an extra instruction to do this: we
2163 can't rely on a side effect of the previous MOV or ADD because
2164 conditional modifiers which are normally true might be false in
2165 exceptional circumstances, e.g. given a NaN input; the add to
2166 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2167 brw_push_insn_state( p );
2168 brw_set_mask_control( p, BRW_MASK_DISABLE );
2169 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2170 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2171 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2172 brw_pop_insn_state( p );
2173
2174 /* Patch the previous conditional branch now that we know the
2175 destination address. */
2176 brw_set_src1( p->store + origin,
2177 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2178
2179 /* The very last interpolation. */
2180 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2181 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2182 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2183
2184 /* scale by pow( 2, -15 ), as described above */
2185 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2186
2187 release_tmps( c, mark );
2188 }
2189
2190 static void emit_noise4( struct brw_wm_compile *c,
2191 struct prog_instruction *inst )
2192 {
2193 struct brw_compile *p = &c->func;
2194 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2195 GLuint mask = inst->DstReg.WriteMask;
2196 int i;
2197 int mark = mark_tmps( c );
2198
2199 assert( mark == 0 );
2200
2201 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
2202 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
2203 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
2204 src3 = get_src_reg( c, inst->SrcReg, 3, 1 );
2205
2206 param0 = alloc_tmp( c );
2207 param1 = alloc_tmp( c );
2208 param2 = alloc_tmp( c );
2209 param3 = alloc_tmp( c );
2210
2211 brw_MOV( p, param0, src0 );
2212 brw_MOV( p, param1, src1 );
2213 brw_MOV( p, param2, src2 );
2214 brw_MOV( p, param3, src3 );
2215
2216 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2217
2218 /* Fill in the result: */
2219 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2220 for (i = 0 ; i < 4; i++) {
2221 if (mask & (1<<i)) {
2222 dst = get_dst_reg(c, inst, i, 1);
2223 brw_MOV( p, dst, param0 );
2224 }
2225 }
2226 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2227 brw_set_saturate( p, 0 );
2228
2229 release_tmps( c, mark );
2230 }
2231
2232 static void emit_wpos_xy(struct brw_wm_compile *c,
2233 struct prog_instruction *inst)
2234 {
2235 struct brw_compile *p = &c->func;
2236 GLuint mask = inst->DstReg.WriteMask;
2237 struct brw_reg src0[2], dst[2];
2238
2239 dst[0] = get_dst_reg(c, inst, 0, 1);
2240 dst[1] = get_dst_reg(c, inst, 1, 1);
2241
2242 src0[0] = get_src_reg(c, &inst->SrcReg[0], 0, 1);
2243 src0[1] = get_src_reg(c, &inst->SrcReg[0], 1, 1);
2244
2245 /* Calculate the pixel offset from window bottom left into destination
2246 * X and Y channels.
2247 */
2248 if (mask & WRITEMASK_X) {
2249 /* X' = X - origin_x */
2250 brw_ADD(p,
2251 dst[0],
2252 retype(src0[0], BRW_REGISTER_TYPE_W),
2253 brw_imm_d(0 - c->key.origin_x));
2254 }
2255
2256 if (mask & WRITEMASK_Y) {
2257 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2258 brw_ADD(p,
2259 dst[1],
2260 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2261 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2262 }
2263 }
2264
2265 /* TODO
2266 BIAS on SIMD8 not workind yet...
2267 */
2268 static void emit_txb(struct brw_wm_compile *c,
2269 struct prog_instruction *inst)
2270 {
2271 struct brw_compile *p = &c->func;
2272 struct brw_reg dst[4], src[4], payload_reg;
2273 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2274
2275 GLuint i;
2276 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2277 for (i = 0; i < 4; i++)
2278 dst[i] = get_dst_reg(c, inst, i, 1);
2279 for (i = 0; i < 4; i++)
2280 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2281
2282 switch (inst->TexSrcTarget) {
2283 case TEXTURE_1D_INDEX:
2284 brw_MOV(p, brw_message_reg(2), src[0]);
2285 brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
2286 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2287 break;
2288 case TEXTURE_2D_INDEX:
2289 case TEXTURE_RECT_INDEX:
2290 brw_MOV(p, brw_message_reg(2), src[0]);
2291 brw_MOV(p, brw_message_reg(3), src[1]);
2292 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2293 break;
2294 default:
2295 brw_MOV(p, brw_message_reg(2), src[0]);
2296 brw_MOV(p, brw_message_reg(3), src[1]);
2297 brw_MOV(p, brw_message_reg(4), src[2]);
2298 break;
2299 }
2300 brw_MOV(p, brw_message_reg(5), src[3]);
2301 brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
2302 brw_SAMPLE(p,
2303 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2304 1,
2305 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2306 unit + MAX_DRAW_BUFFERS, /* surface */
2307 unit, /* sampler */
2308 inst->DstReg.WriteMask,
2309 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
2310 4,
2311 4,
2312 0);
2313 }
2314
2315 static void emit_tex(struct brw_wm_compile *c,
2316 struct prog_instruction *inst)
2317 {
2318 struct brw_compile *p = &c->func;
2319 struct brw_reg dst[4], src[4], payload_reg;
2320 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2321
2322 GLuint msg_len;
2323 GLuint i, nr;
2324 GLuint emit;
2325 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2326
2327 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2328
2329 for (i = 0; i < 4; i++)
2330 dst[i] = get_dst_reg(c, inst, i, 1);
2331 for (i = 0; i < 4; i++)
2332 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2333
2334
2335 switch (inst->TexSrcTarget) {
2336 case TEXTURE_1D_INDEX:
2337 emit = WRITEMASK_X;
2338 nr = 1;
2339 break;
2340 case TEXTURE_2D_INDEX:
2341 case TEXTURE_RECT_INDEX:
2342 emit = WRITEMASK_XY;
2343 nr = 2;
2344 break;
2345 default:
2346 emit = WRITEMASK_XYZ;
2347 nr = 3;
2348 break;
2349 }
2350 msg_len = 1;
2351
2352 for (i = 0; i < nr; i++) {
2353 static const GLuint swz[4] = {0,1,2,2};
2354 if (emit & (1<<i))
2355 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2356 else
2357 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2358 msg_len += 1;
2359 }
2360
2361 if (shadow) {
2362 brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
2363 brw_MOV(p, brw_message_reg(6), src[2]);
2364 }
2365
2366 brw_SAMPLE(p,
2367 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2368 1,
2369 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2370 unit + MAX_DRAW_BUFFERS, /* surface */
2371 unit, /* sampler */
2372 inst->DstReg.WriteMask,
2373 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
2374 4,
2375 shadow ? 6 : 4,
2376 0);
2377
2378 if (shadow)
2379 brw_MOV(p, dst[3], brw_imm_f(1.0));
2380 }
2381
2382 /**
2383 * Resolve subroutine calls after code emit is done.
2384 */
2385 static void post_wm_emit( struct brw_wm_compile *c )
2386 {
2387 brw_resolve_cals(&c->func);
2388 }
2389
2390 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2391 {
2392 #define MAX_IFSN 32
2393 #define MAX_LOOP_DEPTH 32
2394 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2395 struct brw_instruction *inst0, *inst1;
2396 int i, if_insn = 0, loop_insn = 0;
2397 struct brw_compile *p = &c->func;
2398 struct brw_indirect stack_index = brw_indirect(0, 0);
2399
2400 c->reg_index = 0;
2401 prealloc_reg(c);
2402 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2403 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2404
2405 for (i = 0; i < c->nr_fp_insns; i++) {
2406 struct prog_instruction *inst = &c->prog_instructions[i];
2407
2408 if (inst->CondUpdate)
2409 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2410 else
2411 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2412
2413 switch (inst->Opcode) {
2414 case WM_PIXELXY:
2415 emit_pixel_xy(c, inst);
2416 break;
2417 case WM_DELTAXY:
2418 emit_delta_xy(c, inst);
2419 break;
2420 case WM_PIXELW:
2421 emit_pixel_w(c, inst);
2422 break;
2423 case WM_LINTERP:
2424 emit_linterp(c, inst);
2425 break;
2426 case WM_PINTERP:
2427 emit_pinterp(c, inst);
2428 break;
2429 case WM_CINTERP:
2430 emit_cinterp(c, inst);
2431 break;
2432 case WM_WPOSXY:
2433 emit_wpos_xy(c, inst);
2434 break;
2435 case WM_FB_WRITE:
2436 emit_fb_write(c, inst);
2437 break;
2438 case OPCODE_ABS:
2439 emit_abs(c, inst);
2440 break;
2441 case OPCODE_ADD:
2442 emit_add(c, inst);
2443 break;
2444 case OPCODE_SUB:
2445 emit_sub(c, inst);
2446 break;
2447 case OPCODE_FRC:
2448 emit_frc(c, inst);
2449 break;
2450 case OPCODE_FLR:
2451 emit_flr(c, inst);
2452 break;
2453 case OPCODE_LRP:
2454 emit_lrp(c, inst);
2455 break;
2456 case OPCODE_TRUNC:
2457 emit_trunc(c, inst);
2458 break;
2459 case OPCODE_MOV:
2460 emit_mov(c, inst);
2461 break;
2462 case OPCODE_DP3:
2463 emit_dp3(c, inst);
2464 break;
2465 case OPCODE_DP4:
2466 emit_dp4(c, inst);
2467 break;
2468 case OPCODE_XPD:
2469 emit_xpd(c, inst);
2470 break;
2471 case OPCODE_DPH:
2472 emit_dph(c, inst);
2473 break;
2474 case OPCODE_RCP:
2475 emit_rcp(c, inst);
2476 break;
2477 case OPCODE_RSQ:
2478 emit_rsq(c, inst);
2479 break;
2480 case OPCODE_SIN:
2481 emit_sin(c, inst);
2482 break;
2483 case OPCODE_COS:
2484 emit_cos(c, inst);
2485 break;
2486 case OPCODE_EX2:
2487 emit_ex2(c, inst);
2488 break;
2489 case OPCODE_LG2:
2490 emit_lg2(c, inst);
2491 break;
2492 case OPCODE_MAX:
2493 emit_max(c, inst);
2494 break;
2495 case OPCODE_MIN:
2496 emit_min(c, inst);
2497 break;
2498 case OPCODE_DDX:
2499 emit_ddx(c, inst);
2500 break;
2501 case OPCODE_DDY:
2502 emit_ddy(c, inst);
2503 break;
2504 case OPCODE_SLT:
2505 emit_slt(c, inst);
2506 break;
2507 case OPCODE_SLE:
2508 emit_sle(c, inst);
2509 break;
2510 case OPCODE_SGT:
2511 emit_sgt(c, inst);
2512 break;
2513 case OPCODE_SGE:
2514 emit_sge(c, inst);
2515 break;
2516 case OPCODE_SEQ:
2517 emit_seq(c, inst);
2518 break;
2519 case OPCODE_SNE:
2520 emit_sne(c, inst);
2521 break;
2522 case OPCODE_MUL:
2523 emit_mul(c, inst);
2524 break;
2525 case OPCODE_POW:
2526 emit_pow(c, inst);
2527 break;
2528 case OPCODE_MAD:
2529 emit_mad(c, inst);
2530 break;
2531 case OPCODE_NOISE1:
2532 emit_noise1(c, inst);
2533 break;
2534 case OPCODE_NOISE2:
2535 emit_noise2(c, inst);
2536 break;
2537 case OPCODE_NOISE3:
2538 emit_noise3(c, inst);
2539 break;
2540 case OPCODE_NOISE4:
2541 emit_noise4(c, inst);
2542 break;
2543 case OPCODE_TEX:
2544 emit_tex(c, inst);
2545 break;
2546 case OPCODE_TXB:
2547 emit_txb(c, inst);
2548 break;
2549 case OPCODE_KIL_NV:
2550 emit_kil(c);
2551 break;
2552 case OPCODE_IF:
2553 assert(if_insn < MAX_IFSN);
2554 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2555 break;
2556 case OPCODE_ELSE:
2557 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2558 break;
2559 case OPCODE_ENDIF:
2560 assert(if_insn > 0);
2561 brw_ENDIF(p, if_inst[--if_insn]);
2562 break;
2563 case OPCODE_BGNSUB:
2564 brw_save_label(p, inst->Comment, p->nr_insn);
2565 break;
2566 case OPCODE_ENDSUB:
2567 /* no-op */
2568 break;
2569 case OPCODE_CAL:
2570 brw_push_insn_state(p);
2571 brw_set_mask_control(p, BRW_MASK_DISABLE);
2572 brw_set_access_mode(p, BRW_ALIGN_1);
2573 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2574 brw_set_access_mode(p, BRW_ALIGN_16);
2575 brw_ADD(p, get_addr_reg(stack_index),
2576 get_addr_reg(stack_index), brw_imm_d(4));
2577 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2578 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2579 brw_pop_insn_state(p);
2580 break;
2581
2582 case OPCODE_RET:
2583 brw_push_insn_state(p);
2584 brw_set_mask_control(p, BRW_MASK_DISABLE);
2585 brw_ADD(p, get_addr_reg(stack_index),
2586 get_addr_reg(stack_index), brw_imm_d(-4));
2587 brw_set_access_mode(p, BRW_ALIGN_1);
2588 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2589 brw_set_access_mode(p, BRW_ALIGN_16);
2590 brw_pop_insn_state(p);
2591
2592 break;
2593 case OPCODE_BGNLOOP:
2594 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2595 break;
2596 case OPCODE_BRK:
2597 brw_BREAK(p);
2598 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2599 break;
2600 case OPCODE_CONT:
2601 brw_CONT(p);
2602 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2603 break;
2604 case OPCODE_ENDLOOP:
2605 loop_insn--;
2606 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2607 /* patch all the BREAK instructions from
2608 last BEGINLOOP */
2609 while (inst0 > loop_inst[loop_insn]) {
2610 inst0--;
2611 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2612 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2613 inst0->bits3.if_else.pop_count = 0;
2614 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2615 inst0->bits3.if_else.jump_count = inst1 - inst0;
2616 inst0->bits3.if_else.pop_count = 0;
2617 }
2618 }
2619 break;
2620 default:
2621 _mesa_printf("unsupported IR in fragment shader %d\n",
2622 inst->Opcode);
2623 }
2624 if (inst->CondUpdate)
2625 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2626 else
2627 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2628 }
2629 post_wm_emit(c);
2630
2631 if (c->reg_index >= BRW_WM_MAX_GRF) {
2632 _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
2633 /* XXX we need to do some proper error recovery here */
2634 }
2635 }
2636
2637
2638 /**
2639 * Do GPU code generation for shaders that use GLSL features such as
2640 * flow control. Other shaders will be compiled with the
2641 */
2642 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2643 {
2644 if (INTEL_DEBUG & DEBUG_WM) {
2645 _mesa_printf("brw_wm_glsl_emit:\n");
2646 }
2647
2648 /* initial instruction translation/simplification */
2649 brw_wm_pass_fp(c);
2650
2651 /* actual code generation */
2652 brw_wm_emit_glsl(brw, c);
2653
2654 if (INTEL_DEBUG & DEBUG_WM) {
2655 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2656 }
2657
2658 c->prog_data.total_grf = c->reg_index;
2659 c->prog_data.total_scratch = 0;
2660 }