i965: more reformatting/clean-up
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "brw_context.h"
4 #include "brw_eu.h"
5 #include "brw_wm.h"
6
/* Identifiers for the shared instruction sequences that can be handed to
 * invoke_subroutine(); each value indexes one slot of c->subroutines[]. */
enum _subroutine {
    SUB_NOISE1,
    SUB_NOISE2,
    SUB_NOISE3,
    SUB_NOISE4
};
10
11 /* Only guess, need a flag in gl_fragment_program later */
12 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
13 {
14 int i;
15 for (i = 0; i < fp->Base.NumInstructions; i++) {
16 struct prog_instruction *inst = &fp->Base.Instructions[i];
17 switch (inst->Opcode) {
18 case OPCODE_IF:
19 case OPCODE_TRUNC:
20 case OPCODE_ENDIF:
21 case OPCODE_CAL:
22 case OPCODE_BRK:
23 case OPCODE_RET:
24 case OPCODE_DDX:
25 case OPCODE_DDY:
26 case OPCODE_NOISE1:
27 case OPCODE_NOISE2:
28 case OPCODE_NOISE3:
29 case OPCODE_NOISE4:
30 case OPCODE_BGNLOOP:
31 return GL_TRUE;
32 default:
33 break;
34 }
35 }
36 return GL_FALSE;
37 }
38
39 static void set_reg(struct brw_wm_compile *c, int file, int index,
40 int component, struct brw_reg reg)
41 {
42 c->wm_regs[file][index][component].reg = reg;
43 c->wm_regs[file][index][component].inited = GL_TRUE;
44 }
45
46 static int get_scalar_dst_index(struct prog_instruction *inst)
47 {
48 int i;
49 for (i = 0; i < 4; i++)
50 if (inst->DstReg.WriteMask & (1<<i))
51 break;
52 return i;
53 }
54
55 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
56 {
57 struct brw_reg reg;
58 if(c->tmp_index == c->tmp_max)
59 c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
60
61 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
62 return reg;
63 }
64
65 static int mark_tmps(struct brw_wm_compile *c)
66 {
67 return c->tmp_index;
68 }
69
70 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
71 {
72 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
73 }
74
75 static void release_tmps(struct brw_wm_compile *c, int mark)
76 {
77 c->tmp_index = mark;
78 }
79
80 static struct brw_reg
81 get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GLuint neg, GLuint abs)
82 {
83 struct brw_reg reg;
84 switch (file) {
85 case PROGRAM_STATE_VAR:
86 case PROGRAM_CONSTANT:
87 case PROGRAM_UNIFORM:
88 file = PROGRAM_STATE_VAR;
89 break;
90 case PROGRAM_UNDEFINED:
91 return brw_null_reg();
92 default:
93 break;
94 }
95
96 if(c->wm_regs[file][index][component].inited)
97 reg = c->wm_regs[file][index][component].reg;
98 else
99 reg = brw_vec8_grf(c->reg_index, 0);
100
101 if(!c->wm_regs[file][index][component].inited) {
102 set_reg(c, file, index, component, reg);
103 c->reg_index++;
104 }
105
106 if (neg & (1<< component)) {
107 reg = negate(reg);
108 }
109 if (abs)
110 reg = brw_abs(reg);
111 return reg;
112 }
113
114 static void prealloc_reg(struct brw_wm_compile *c)
115 {
116 int i, j;
117 struct brw_reg reg;
118 int nr_interp_regs = 0;
119 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
120
121 for (i = 0; i < 4; i++) {
122 reg = (i < c->key.nr_depth_regs)
123 ? brw_vec8_grf(i*2, 0) : brw_vec8_grf(0, 0);
124 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
125 }
126 c->reg_index += 2*c->key.nr_depth_regs;
127 {
128 int nr_params = c->fp->program.Base.Parameters->NumParameters;
129 struct gl_program_parameter_list *plist =
130 c->fp->program.Base.Parameters;
131 int index = 0;
132 c->prog_data.nr_params = 4*nr_params;
133 for (i = 0; i < nr_params; i++) {
134 for (j = 0; j < 4; j++, index++) {
135 reg = brw_vec1_grf(c->reg_index + index/8,
136 index%8);
137 c->prog_data.param[index] =
138 &plist->ParameterValues[i][j];
139 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
140 }
141 }
142 c->nr_creg = 2*((4*nr_params+15)/16);
143 c->reg_index += c->nr_creg;
144 }
145 for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
146 if (inputs & (1<<i)) {
147 nr_interp_regs++;
148 reg = brw_vec8_grf(c->reg_index, 0);
149 for (j = 0; j < 4; j++)
150 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
151 c->reg_index += 2;
152
153 }
154 }
155 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
156 c->prog_data.urb_read_length = nr_interp_regs * 2;
157 c->prog_data.curb_read_length = c->nr_creg;
158 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
159 c->reg_index++;
160 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
161 c->reg_index += 2;
162 }
163
164 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
165 struct prog_instruction *inst, int component, int nr)
166 {
167 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
168 0, 0);
169 }
170
171 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
172 struct prog_src_register *src, int index, int nr)
173 {
174 int component = GET_SWZ(src->Swizzle, index);
175 return get_reg(c, src->File, src->Index, component, nr,
176 src->NegateBase, src->Abs);
177 }
178
179 /* Subroutines are minimal support for resusable instruction sequences.
180 They are implemented as simply as possible to minimise overhead: there
181 is no explicit support for communication between the caller and callee
182 other than saving the return address in a temporary register, nor is
183 there any automatic local storage. This implies that great care is
184 required before attempting reentrancy or any kind of nested
185 subroutine invocations. */
186 static void invoke_subroutine( struct brw_wm_compile *c,
187 enum _subroutine subroutine,
188 void (*emit)( struct brw_wm_compile * ) )
189 {
190 struct brw_compile *p = &c->func;
191
192 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
193
194 if( c->subroutines[ subroutine ] ) {
195 /* subroutine previously emitted: reuse existing instructions */
196
197 int mark = mark_tmps( c );
198 struct brw_reg return_address = retype( alloc_tmp( c ),
199 BRW_REGISTER_TYPE_UD );
200 int here = p->nr_insn;
201
202 brw_push_insn_state(p);
203 brw_set_mask_control(p, BRW_MASK_DISABLE);
204 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
205
206 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
207 brw_imm_d( ( c->subroutines[ subroutine ] -
208 here - 1 ) << 4 ) );
209 brw_pop_insn_state(p);
210
211 release_tmps( c, mark );
212 } else {
213 /* previously unused subroutine: emit, and mark for later reuse */
214
215 int mark = mark_tmps( c );
216 struct brw_reg return_address = retype( alloc_tmp( c ),
217 BRW_REGISTER_TYPE_UD );
218 struct brw_instruction *calc;
219 int base = p->nr_insn;
220
221 brw_push_insn_state(p);
222 brw_set_mask_control(p, BRW_MASK_DISABLE);
223 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
224 brw_pop_insn_state(p);
225
226 c->subroutines[ subroutine ] = p->nr_insn;
227
228 emit( c );
229
230 brw_push_insn_state(p);
231 brw_set_mask_control(p, BRW_MASK_DISABLE);
232 brw_MOV( p, brw_ip_reg(), return_address );
233 brw_pop_insn_state(p);
234
235 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
236
237 release_tmps( c, mark );
238 }
239 }
240
241 static void emit_abs( struct brw_wm_compile *c,
242 struct prog_instruction *inst)
243 {
244 int i;
245 struct brw_compile *p = &c->func;
246 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
247 for (i = 0; i < 4; i++) {
248 if (inst->DstReg.WriteMask & (1<<i)) {
249 struct brw_reg src, dst;
250 dst = get_dst_reg(c, inst, i, 1);
251 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
252 brw_MOV(p, dst, brw_abs(src));
253 }
254 }
255 brw_set_saturate(p, 0);
256 }
257
258 static void emit_trunc( struct brw_wm_compile *c,
259 struct prog_instruction *inst)
260 {
261 int i;
262 struct brw_compile *p = &c->func;
263 GLuint mask = inst->DstReg.WriteMask;
264 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
265 for (i = 0; i < 4; i++) {
266 if (mask & (1<<i)) {
267 struct brw_reg src, dst;
268 dst = get_dst_reg(c, inst, i, 1) ;
269 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
270 brw_RNDZ(p, dst, src);
271 }
272 }
273 brw_set_saturate(p, 0);
274 }
275
276 static void emit_mov( struct brw_wm_compile *c,
277 struct prog_instruction *inst)
278 {
279 int i;
280 struct brw_compile *p = &c->func;
281 GLuint mask = inst->DstReg.WriteMask;
282 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
283 for (i = 0; i < 4; i++) {
284 if (mask & (1<<i)) {
285 struct brw_reg src, dst;
286 dst = get_dst_reg(c, inst, i, 1);
287 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
288 brw_MOV(p, dst, src);
289 }
290 }
291 brw_set_saturate(p, 0);
292 }
293
294 static void emit_pixel_xy(struct brw_wm_compile *c,
295 struct prog_instruction *inst)
296 {
297 struct brw_reg r1 = brw_vec1_grf(1, 0);
298 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
299
300 struct brw_reg dst0, dst1;
301 struct brw_compile *p = &c->func;
302 GLuint mask = inst->DstReg.WriteMask;
303
304 dst0 = get_dst_reg(c, inst, 0, 1);
305 dst1 = get_dst_reg(c, inst, 1, 1);
306 /* Calculate pixel centers by adding 1 or 0 to each of the
307 * micro-tile coordinates passed in r1.
308 */
309 if (mask & WRITEMASK_X) {
310 brw_ADD(p,
311 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
312 stride(suboffset(r1_uw, 4), 2, 4, 0),
313 brw_imm_v(0x10101010));
314 }
315
316 if (mask & WRITEMASK_Y) {
317 brw_ADD(p,
318 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
319 stride(suboffset(r1_uw, 5), 2, 4, 0),
320 brw_imm_v(0x11001100));
321 }
322 }
323
324 static void emit_delta_xy(struct brw_wm_compile *c,
325 struct prog_instruction *inst)
326 {
327 struct brw_reg r1 = brw_vec1_grf(1, 0);
328 struct brw_reg dst0, dst1, src0, src1;
329 struct brw_compile *p = &c->func;
330 GLuint mask = inst->DstReg.WriteMask;
331
332 dst0 = get_dst_reg(c, inst, 0, 1);
333 dst1 = get_dst_reg(c, inst, 1, 1);
334 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
335 src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1);
336 /* Calc delta X,Y by subtracting origin in r1 from the pixel
337 * centers.
338 */
339 if (mask & WRITEMASK_X) {
340 brw_ADD(p,
341 dst0,
342 retype(src0, BRW_REGISTER_TYPE_UW),
343 negate(r1));
344 }
345
346 if (mask & WRITEMASK_Y) {
347 brw_ADD(p,
348 dst1,
349 retype(src1, BRW_REGISTER_TYPE_UW),
350 negate(suboffset(r1,1)));
351
352 }
353 }
354
355 static void fire_fb_write( struct brw_wm_compile *c,
356 GLuint base_reg,
357 GLuint nr,
358 GLuint target,
359 GLuint eot)
360 {
361 struct brw_compile *p = &c->func;
362 /* Pass through control information:
363 */
364 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
365 {
366 brw_push_insn_state(p);
367 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
368 brw_MOV(p,
369 brw_message_reg(base_reg + 1),
370 brw_vec8_grf(1, 0));
371 brw_pop_insn_state(p);
372 }
373 /* Send framebuffer write message: */
374 brw_fb_WRITE(p,
375 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
376 base_reg,
377 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
378 target,
379 nr,
380 0,
381 eot);
382 }
383
384 static void emit_fb_write(struct brw_wm_compile *c,
385 struct prog_instruction *inst)
386 {
387 struct brw_compile *p = &c->func;
388 int nr = 2;
389 int channel;
390 GLuint target, eot;
391 struct brw_reg src0;
392
393 /* Reserve a space for AA - may not be needed:
394 */
395 if (c->key.aa_dest_stencil_reg)
396 nr += 1;
397
398 brw_push_insn_state(p);
399 for (channel = 0; channel < 4; channel++) {
400 src0 = get_src_reg(c, &inst->SrcReg[0], channel, 1);
401 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
402 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
403 brw_MOV(p, brw_message_reg(nr + channel), src0);
404 }
405 /* skip over the regs populated above: */
406 nr += 8;
407 brw_pop_insn_state(p);
408
409 if (c->key.source_depth_to_render_target) {
410 if (c->key.computes_depth) {
411 src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
412 brw_MOV(p, brw_message_reg(nr), src0);
413 }
414 else {
415 src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
416 brw_MOV(p, brw_message_reg(nr), src0);
417 }
418
419 nr += 2;
420 }
421
422 target = inst->Sampler >> 1;
423 eot = inst->Sampler & 1;
424 fire_fb_write(c, 0, nr, target, eot);
425 }
426
427 static void emit_pixel_w( struct brw_wm_compile *c,
428 struct prog_instruction *inst)
429 {
430 struct brw_compile *p = &c->func;
431 GLuint mask = inst->DstReg.WriteMask;
432 if (mask & WRITEMASK_W) {
433 struct brw_reg dst, src0, delta0, delta1;
434 struct brw_reg interp3;
435
436 dst = get_dst_reg(c, inst, 3, 1);
437 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
438 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
439 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
440
441 interp3 = brw_vec1_grf(src0.nr+1, 4);
442 /* Calc 1/w - just linterp wpos[3] optimized by putting the
443 * result straight into a message reg.
444 */
445 brw_LINE(p, brw_null_reg(), interp3, delta0);
446 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
447
448 /* Calc w */
449 brw_math_16( p, dst,
450 BRW_MATH_FUNCTION_INV,
451 BRW_MATH_SATURATE_NONE,
452 2, brw_null_reg(),
453 BRW_MATH_PRECISION_FULL);
454 }
455 }
456
457 static void emit_linterp(struct brw_wm_compile *c,
458 struct prog_instruction *inst)
459 {
460 struct brw_compile *p = &c->func;
461 GLuint mask = inst->DstReg.WriteMask;
462 struct brw_reg interp[4];
463 struct brw_reg dst, delta0, delta1;
464 struct brw_reg src0;
465
466 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
467 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
468 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
469 GLuint nr = src0.nr;
470 int i;
471
472 interp[0] = brw_vec1_grf(nr, 0);
473 interp[1] = brw_vec1_grf(nr, 4);
474 interp[2] = brw_vec1_grf(nr+1, 0);
475 interp[3] = brw_vec1_grf(nr+1, 4);
476
477 for(i = 0; i < 4; i++ ) {
478 if (mask & (1<<i)) {
479 dst = get_dst_reg(c, inst, i, 1);
480 brw_LINE(p, brw_null_reg(), interp[i], delta0);
481 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
482 }
483 }
484 }
485
486 static void emit_cinterp(struct brw_wm_compile *c,
487 struct prog_instruction *inst)
488 {
489 struct brw_compile *p = &c->func;
490 GLuint mask = inst->DstReg.WriteMask;
491
492 struct brw_reg interp[4];
493 struct brw_reg dst, src0;
494
495 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
496 GLuint nr = src0.nr;
497 int i;
498
499 interp[0] = brw_vec1_grf(nr, 0);
500 interp[1] = brw_vec1_grf(nr, 4);
501 interp[2] = brw_vec1_grf(nr+1, 0);
502 interp[3] = brw_vec1_grf(nr+1, 4);
503
504 for(i = 0; i < 4; i++ ) {
505 if (mask & (1<<i)) {
506 dst = get_dst_reg(c, inst, i, 1);
507 brw_MOV(p, dst, suboffset(interp[i],3));
508 }
509 }
510 }
511
512 static void emit_pinterp(struct brw_wm_compile *c,
513 struct prog_instruction *inst)
514 {
515 struct brw_compile *p = &c->func;
516 GLuint mask = inst->DstReg.WriteMask;
517
518 struct brw_reg interp[4];
519 struct brw_reg dst, delta0, delta1;
520 struct brw_reg src0, w;
521
522 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
523 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
524 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
525 w = get_src_reg(c, &inst->SrcReg[2], 3, 1);
526 GLuint nr = src0.nr;
527 int i;
528
529 interp[0] = brw_vec1_grf(nr, 0);
530 interp[1] = brw_vec1_grf(nr, 4);
531 interp[2] = brw_vec1_grf(nr+1, 0);
532 interp[3] = brw_vec1_grf(nr+1, 4);
533
534 for(i = 0; i < 4; i++ ) {
535 if (mask & (1<<i)) {
536 dst = get_dst_reg(c, inst, i, 1);
537 brw_LINE(p, brw_null_reg(), interp[i], delta0);
538 brw_MAC(p, dst, suboffset(interp[i],1),
539 delta1);
540 brw_MUL(p, dst, dst, w);
541 }
542 }
543 }
544
545 static void emit_xpd(struct brw_wm_compile *c,
546 struct prog_instruction *inst)
547 {
548 int i;
549 struct brw_compile *p = &c->func;
550 GLuint mask = inst->DstReg.WriteMask;
551 for (i = 0; i < 4; i++) {
552 GLuint i2 = (i+2)%3;
553 GLuint i1 = (i+1)%3;
554 if (mask & (1<<i)) {
555 struct brw_reg src0, src1, dst;
556 dst = get_dst_reg(c, inst, i, 1);
557 src0 = negate(get_src_reg(c, &inst->SrcReg[0], i2, 1));
558 src1 = get_src_reg(c, &inst->SrcReg[1], i1, 1);
559 brw_MUL(p, brw_null_reg(), src0, src1);
560 src0 = get_src_reg(c, &inst->SrcReg[0], i1, 1);
561 src1 = get_src_reg(c, &inst->SrcReg[1], i2, 1);
562 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
563 brw_MAC(p, dst, src0, src1);
564 brw_set_saturate(p, 0);
565 }
566 }
567 brw_set_saturate(p, 0);
568 }
569
570 static void emit_dp3(struct brw_wm_compile *c,
571 struct prog_instruction *inst)
572 {
573 struct brw_reg src0[3], src1[3], dst;
574 int i;
575 struct brw_compile *p = &c->func;
576 for (i = 0; i < 3; i++) {
577 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
578 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
579 }
580
581 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
582 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
583 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
584 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
585 brw_MAC(p, dst, src0[2], src1[2]);
586 brw_set_saturate(p, 0);
587 }
588
589 static void emit_dp4(struct brw_wm_compile *c,
590 struct prog_instruction *inst)
591 {
592 struct brw_reg src0[4], src1[4], dst;
593 int i;
594 struct brw_compile *p = &c->func;
595 for (i = 0; i < 4; i++) {
596 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
597 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
598 }
599 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
600 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
601 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
602 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
603 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
604 brw_MAC(p, dst, src0[3], src1[3]);
605 brw_set_saturate(p, 0);
606 }
607
608 static void emit_dph(struct brw_wm_compile *c,
609 struct prog_instruction *inst)
610 {
611 struct brw_reg src0[4], src1[4], dst;
612 int i;
613 struct brw_compile *p = &c->func;
614 for (i = 0; i < 4; i++) {
615 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
616 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
617 }
618 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
619 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
620 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
621 brw_MAC(p, dst, src0[2], src1[2]);
622 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
623 brw_ADD(p, dst, dst, src1[3]);
624 brw_set_saturate(p, 0);
625 }
626
627 static void emit_math1(struct brw_wm_compile *c,
628 struct prog_instruction *inst, GLuint func)
629 {
630 struct brw_compile *p = &c->func;
631 struct brw_reg src0, dst;
632
633 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
634 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
635 brw_MOV(p, brw_message_reg(2), src0);
636 brw_math(p,
637 dst,
638 func,
639 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
640 2,
641 brw_null_reg(),
642 BRW_MATH_DATA_VECTOR,
643 BRW_MATH_PRECISION_FULL);
644 }
645
646 static void emit_rcp(struct brw_wm_compile *c,
647 struct prog_instruction *inst)
648 {
649 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
650 }
651
652 static void emit_rsq(struct brw_wm_compile *c,
653 struct prog_instruction *inst)
654 {
655 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
656 }
657
658 static void emit_sin(struct brw_wm_compile *c,
659 struct prog_instruction *inst)
660 {
661 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
662 }
663
664 static void emit_cos(struct brw_wm_compile *c,
665 struct prog_instruction *inst)
666 {
667 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
668 }
669
670 static void emit_ex2(struct brw_wm_compile *c,
671 struct prog_instruction *inst)
672 {
673 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
674 }
675
676 static void emit_lg2(struct brw_wm_compile *c,
677 struct prog_instruction *inst)
678 {
679 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
680 }
681
682 static void emit_add(struct brw_wm_compile *c,
683 struct prog_instruction *inst)
684 {
685 struct brw_compile *p = &c->func;
686 struct brw_reg src0, src1, dst;
687 GLuint mask = inst->DstReg.WriteMask;
688 int i;
689 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
690 for (i = 0 ; i < 4; i++) {
691 if (mask & (1<<i)) {
692 dst = get_dst_reg(c, inst, i, 1);
693 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
694 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
695 brw_ADD(p, dst, src0, src1);
696 }
697 }
698 brw_set_saturate(p, 0);
699 }
700
701 static void emit_sub(struct brw_wm_compile *c,
702 struct prog_instruction *inst)
703 {
704 struct brw_compile *p = &c->func;
705 struct brw_reg src0, src1, dst;
706 GLuint mask = inst->DstReg.WriteMask;
707 int i;
708 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
709 for (i = 0 ; i < 4; i++) {
710 if (mask & (1<<i)) {
711 dst = get_dst_reg(c, inst, i, 1);
712 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
713 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
714 brw_ADD(p, dst, src0, negate(src1));
715 }
716 }
717 brw_set_saturate(p, 0);
718 }
719
720 static void emit_mul(struct brw_wm_compile *c,
721 struct prog_instruction *inst)
722 {
723 struct brw_compile *p = &c->func;
724 struct brw_reg src0, src1, dst;
725 GLuint mask = inst->DstReg.WriteMask;
726 int i;
727 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
728 for (i = 0 ; i < 4; i++) {
729 if (mask & (1<<i)) {
730 dst = get_dst_reg(c, inst, i, 1);
731 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
732 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
733 brw_MUL(p, dst, src0, src1);
734 }
735 }
736 brw_set_saturate(p, 0);
737 }
738
739 static void emit_frc(struct brw_wm_compile *c,
740 struct prog_instruction *inst)
741 {
742 struct brw_compile *p = &c->func;
743 struct brw_reg src0, dst;
744 GLuint mask = inst->DstReg.WriteMask;
745 int i;
746 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
747 for (i = 0 ; i < 4; i++) {
748 if (mask & (1<<i)) {
749 dst = get_dst_reg(c, inst, i, 1);
750 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
751 brw_FRC(p, dst, src0);
752 }
753 }
754 if (inst->SaturateMode != SATURATE_OFF)
755 brw_set_saturate(p, 0);
756 }
757
758 static void emit_flr(struct brw_wm_compile *c,
759 struct prog_instruction *inst)
760 {
761 struct brw_compile *p = &c->func;
762 struct brw_reg src0, dst;
763 GLuint mask = inst->DstReg.WriteMask;
764 int i;
765 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
766 for (i = 0 ; i < 4; i++) {
767 if (mask & (1<<i)) {
768 dst = get_dst_reg(c, inst, i, 1);
769 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
770 brw_RNDD(p, dst, src0);
771 }
772 }
773 brw_set_saturate(p, 0);
774 }
775
776 static void emit_max(struct brw_wm_compile *c,
777 struct prog_instruction *inst)
778 {
779 struct brw_compile *p = &c->func;
780 GLuint mask = inst->DstReg.WriteMask;
781 struct brw_reg src0, src1, dst;
782 int i;
783 brw_push_insn_state(p);
784 for (i = 0; i < 4; i++) {
785 if (mask & (1<<i)) {
786 dst = get_dst_reg(c, inst, i, 1);
787 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
788 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
789 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
790 brw_MOV(p, dst, src0);
791 brw_set_saturate(p, 0);
792
793 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
794 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
795 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
796 brw_MOV(p, dst, src1);
797 brw_set_saturate(p, 0);
798 brw_set_predicate_control_flag_value(p, 0xff);
799 }
800 }
801 brw_pop_insn_state(p);
802 }
803
804 static void emit_min(struct brw_wm_compile *c,
805 struct prog_instruction *inst)
806 {
807 struct brw_compile *p = &c->func;
808 GLuint mask = inst->DstReg.WriteMask;
809 struct brw_reg src0, src1, dst;
810 int i;
811 brw_push_insn_state(p);
812 for (i = 0; i < 4; i++) {
813 if (mask & (1<<i)) {
814 dst = get_dst_reg(c, inst, i, 1);
815 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
816 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
817 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
818 brw_MOV(p, dst, src0);
819 brw_set_saturate(p, 0);
820
821 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
822 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
823 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
824 brw_MOV(p, dst, src1);
825 brw_set_saturate(p, 0);
826 brw_set_predicate_control_flag_value(p, 0xff);
827 }
828 }
829 brw_pop_insn_state(p);
830 }
831
832 static void emit_pow(struct brw_wm_compile *c,
833 struct prog_instruction *inst)
834 {
835 struct brw_compile *p = &c->func;
836 struct brw_reg dst, src0, src1;
837 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
838 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
839 src1 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
840
841 brw_MOV(p, brw_message_reg(2), src0);
842 brw_MOV(p, brw_message_reg(3), src1);
843
844 brw_math(p,
845 dst,
846 BRW_MATH_FUNCTION_POW,
847 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
848 2,
849 brw_null_reg(),
850 BRW_MATH_DATA_VECTOR,
851 BRW_MATH_PRECISION_FULL);
852 }
853
854 static void emit_lrp(struct brw_wm_compile *c,
855 struct prog_instruction *inst)
856 {
857 struct brw_compile *p = &c->func;
858 GLuint mask = inst->DstReg.WriteMask;
859 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
860 int i;
861 int mark = mark_tmps(c);
862 for (i = 0; i < 4; i++) {
863 if (mask & (1<<i)) {
864 dst = get_dst_reg(c, inst, i, 1);
865 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
866
867 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
868
869 if (src1.nr == dst.nr) {
870 tmp1 = alloc_tmp(c);
871 brw_MOV(p, tmp1, src1);
872 } else
873 tmp1 = src1;
874
875 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
876 if (src2.nr == dst.nr) {
877 tmp2 = alloc_tmp(c);
878 brw_MOV(p, tmp2, src2);
879 } else
880 tmp2 = src2;
881
882 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
883 brw_MUL(p, brw_null_reg(), dst, tmp2);
884 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
885 brw_MAC(p, dst, src0, tmp1);
886 brw_set_saturate(p, 0);
887 }
888 release_tmps(c, mark);
889 }
890 }
891
892 /**
893 * For GLSL shaders, this KIL will be unconditional.
894 * It may be contained inside an IF/ENDIF structure of course.
895 */
896 static void emit_kil(struct brw_wm_compile *c)
897 {
898 struct brw_compile *p = &c->func;
899 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
900 brw_push_insn_state(p);
901 brw_set_mask_control(p, BRW_MASK_DISABLE);
902 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
903 brw_AND(p, depth, c->emit_mask_reg, depth);
904 brw_pop_insn_state(p);
905 }
906
907 static void emit_mad(struct brw_wm_compile *c,
908 struct prog_instruction *inst)
909 {
910 struct brw_compile *p = &c->func;
911 GLuint mask = inst->DstReg.WriteMask;
912 struct brw_reg dst, src0, src1, src2;
913 int i;
914
915 for (i = 0; i < 4; i++) {
916 if (mask & (1<<i)) {
917 dst = get_dst_reg(c, inst, i, 1);
918 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
919 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
920 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
921 brw_MUL(p, dst, src0, src1);
922
923 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
924 brw_ADD(p, dst, dst, src2);
925 brw_set_saturate(p, 0);
926 }
927 }
928 }
929
930 static void emit_sop(struct brw_wm_compile *c,
931 struct prog_instruction *inst, GLuint cond)
932 {
933 struct brw_compile *p = &c->func;
934 GLuint mask = inst->DstReg.WriteMask;
935 struct brw_reg dst, src0, src1;
936 int i;
937
938 for (i = 0; i < 4; i++) {
939 if (mask & (1<<i)) {
940 dst = get_dst_reg(c, inst, i, 1);
941 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
942 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
943 brw_push_insn_state(p);
944 brw_CMP(p, brw_null_reg(), cond, src0, src1);
945 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
946 brw_MOV(p, dst, brw_imm_f(0.0));
947 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
948 brw_MOV(p, dst, brw_imm_f(1.0));
949 brw_pop_insn_state(p);
950 }
951 }
952 }
953
954 static void emit_slt(struct brw_wm_compile *c,
955 struct prog_instruction *inst)
956 {
957 emit_sop(c, inst, BRW_CONDITIONAL_L);
958 }
959
960 static void emit_sle(struct brw_wm_compile *c,
961 struct prog_instruction *inst)
962 {
963 emit_sop(c, inst, BRW_CONDITIONAL_LE);
964 }
965
966 static void emit_sgt(struct brw_wm_compile *c,
967 struct prog_instruction *inst)
968 {
969 emit_sop(c, inst, BRW_CONDITIONAL_G);
970 }
971
972 static void emit_sge(struct brw_wm_compile *c,
973 struct prog_instruction *inst)
974 {
975 emit_sop(c, inst, BRW_CONDITIONAL_GE);
976 }
977
978 static void emit_seq(struct brw_wm_compile *c,
979 struct prog_instruction *inst)
980 {
981 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
982 }
983
984 static void emit_sne(struct brw_wm_compile *c,
985 struct prog_instruction *inst)
986 {
987 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
988 }
989
990 static void emit_ddx(struct brw_wm_compile *c,
991 struct prog_instruction *inst)
992 {
993 struct brw_compile *p = &c->func;
994 GLuint mask = inst->DstReg.WriteMask;
995 struct brw_reg interp[4];
996 struct brw_reg dst;
997 struct brw_reg src0, w;
998 GLuint nr, i;
999 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1000 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1001 nr = src0.nr;
1002 interp[0] = brw_vec1_grf(nr, 0);
1003 interp[1] = brw_vec1_grf(nr, 4);
1004 interp[2] = brw_vec1_grf(nr+1, 0);
1005 interp[3] = brw_vec1_grf(nr+1, 4);
1006 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1007 for(i = 0; i < 4; i++ ) {
1008 if (mask & (1<<i)) {
1009 dst = get_dst_reg(c, inst, i, 1);
1010 brw_MOV(p, dst, interp[i]);
1011 brw_MUL(p, dst, dst, w);
1012 }
1013 }
1014 brw_set_saturate(p, 0);
1015 }
1016
1017 static void emit_ddy(struct brw_wm_compile *c,
1018 struct prog_instruction *inst)
1019 {
1020 struct brw_compile *p = &c->func;
1021 GLuint mask = inst->DstReg.WriteMask;
1022 struct brw_reg interp[4];
1023 struct brw_reg dst;
1024 struct brw_reg src0, w;
1025 GLuint nr, i;
1026
1027 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1028 nr = src0.nr;
1029 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1030 interp[0] = brw_vec1_grf(nr, 0);
1031 interp[1] = brw_vec1_grf(nr, 4);
1032 interp[2] = brw_vec1_grf(nr+1, 0);
1033 interp[3] = brw_vec1_grf(nr+1, 4);
1034 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1035 for(i = 0; i < 4; i++ ) {
1036 if (mask & (1<<i)) {
1037 dst = get_dst_reg(c, inst, i, 1);
1038 brw_MOV(p, dst, suboffset(interp[i], 1));
1039 brw_MUL(p, dst, dst, w);
1040 }
1041 }
1042 brw_set_saturate(p, 0);
1043 }
1044
1045 static INLINE struct brw_reg high_words( struct brw_reg reg )
1046 {
1047 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1048 0, 8, 2 );
1049 }
1050
1051 static INLINE struct brw_reg low_words( struct brw_reg reg )
1052 {
1053 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1054 }
1055
1056 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1057 {
1058 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1059 }
1060
1061 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1062 {
1063 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1064 0, 16, 2 );
1065 }
1066
1067 /* One-, two- and three-dimensional Perlin noise, similar to the description
1068 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1069 static void noise1_sub( struct brw_wm_compile *c ) {
1070
1071 struct brw_compile *p = &c->func;
1072 struct brw_reg param,
1073 x0, x1, /* gradients at each end */
1074 t, tmp[ 2 ], /* float temporaries */
1075 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1076 int i;
1077 int mark = mark_tmps( c );
1078
1079 x0 = alloc_tmp( c );
1080 x1 = alloc_tmp( c );
1081 t = alloc_tmp( c );
1082 tmp[ 0 ] = alloc_tmp( c );
1083 tmp[ 1 ] = alloc_tmp( c );
1084 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1085 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1086 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1087 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1088 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1089
1090 param = lookup_tmp( c, mark - 2 );
1091
1092 brw_set_access_mode( p, BRW_ALIGN_1 );
1093
1094 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1095
1096 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1097 be hashed. Also compute the remainder (offset within the unit
1098 length), interleaved to reduce register dependency penalties. */
1099 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1100 brw_FRC( p, param, param );
1101 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1102 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1103 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1104
1105 /* We're now ready to perform the hashing. The two hashes are
1106 interleaved for performance. The hash function used is
1107 designed to rapidly achieve avalanche and require only 32x16
1108 bit multiplication, and 16-bit swizzles (which we get for
1109 free). We can't use immediate operands in the multiplies,
1110 because immediates are permitted only in src1 and the 16-bit
1111 factor is permitted only in src0. */
1112 for( i = 0; i < 2; i++ )
1113 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1114 for( i = 0; i < 2; i++ )
1115 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1116 high_words( itmp[ i ] ) );
1117 for( i = 0; i < 2; i++ )
1118 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1119 for( i = 0; i < 2; i++ )
1120 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1121 high_words( itmp[ i ] ) );
1122 for( i = 0; i < 2; i++ )
1123 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1124 for( i = 0; i < 2; i++ )
1125 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1126 high_words( itmp[ i ] ) );
1127
1128 /* Now we want to initialise the two gradients based on the
1129 hashes. Format conversion from signed integer to float leaves
1130 everything scaled too high by a factor of pow( 2, 31 ), but
1131 we correct for that right at the end. */
1132 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1133 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1134 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1135
1136 brw_MUL( p, x0, x0, param );
1137 brw_MUL( p, x1, x1, t );
1138
1139 /* We interpolate between the gradients using the polynomial
1140 6t^5 - 15t^4 + 10t^3 (Perlin). */
1141 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1142 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1143 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1144 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1145 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1146 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1147 pipeline */
1148 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1149 brw_MUL( p, param, tmp[ 0 ], param );
1150 brw_MUL( p, x1, x1, param );
1151 brw_ADD( p, x0, x0, x1 );
1152 /* scale by pow( 2, -30 ), to compensate for the format conversion
1153 above and an extra factor of 2 so that a single gradient covers
1154 the [-1,1] range */
1155 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1156
1157 release_tmps( c, mark );
1158 }
1159
1160 static void emit_noise1( struct brw_wm_compile *c,
1161 struct prog_instruction *inst )
1162 {
1163 struct brw_compile *p = &c->func;
1164 struct brw_reg src, param, dst;
1165 GLuint mask = inst->DstReg.WriteMask;
1166 int i;
1167 int mark = mark_tmps( c );
1168
1169 assert( mark == 0 );
1170
1171 src = get_src_reg( c, inst->SrcReg, 0, 1 );
1172
1173 param = alloc_tmp( c );
1174
1175 brw_MOV( p, param, src );
1176
1177 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1178
1179 /* Fill in the result: */
1180 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1181 for (i = 0 ; i < 4; i++) {
1182 if (mask & (1<<i)) {
1183 dst = get_dst_reg(c, inst, i, 1);
1184 brw_MOV( p, dst, param );
1185 }
1186 }
1187 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1188 brw_set_saturate( p, 0 );
1189
1190 release_tmps( c, mark );
1191 }
1192
1193 static void noise2_sub( struct brw_wm_compile *c ) {
1194
1195 struct brw_compile *p = &c->func;
1196 struct brw_reg param0, param1,
1197 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1198 t, tmp[ 4 ], /* float temporaries */
1199 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1200 int i;
1201 int mark = mark_tmps( c );
1202
1203 x0y0 = alloc_tmp( c );
1204 x0y1 = alloc_tmp( c );
1205 x1y0 = alloc_tmp( c );
1206 x1y1 = alloc_tmp( c );
1207 t = alloc_tmp( c );
1208 for( i = 0; i < 4; i++ ) {
1209 tmp[ i ] = alloc_tmp( c );
1210 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1211 }
1212 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1213 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1214 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1215
1216 param0 = lookup_tmp( c, mark - 3 );
1217 param1 = lookup_tmp( c, mark - 2 );
1218
1219 brw_set_access_mode( p, BRW_ALIGN_1 );
1220
1221 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1222 be hashed. Also compute the remainders (offsets within the unit
1223 square), interleaved to reduce register dependency penalties. */
1224 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1225 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1226 brw_FRC( p, param0, param0 );
1227 brw_FRC( p, param1, param1 );
1228 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1229 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1230 low_words( itmp[ 1 ] ) );
1231 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1232 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1233 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1234 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1235 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1236
1237 /* We're now ready to perform the hashing. The four hashes are
1238 interleaved for performance. The hash function used is
1239 designed to rapidly achieve avalanche and require only 32x16
1240 bit multiplication, and 16-bit swizzles (which we get for
1241 free). We can't use immediate operands in the multiplies,
1242 because immediates are permitted only in src1 and the 16-bit
1243 factor is permitted only in src0. */
1244 for( i = 0; i < 4; i++ )
1245 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1246 for( i = 0; i < 4; i++ )
1247 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1248 high_words( itmp[ i ] ) );
1249 for( i = 0; i < 4; i++ )
1250 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1251 for( i = 0; i < 4; i++ )
1252 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1253 high_words( itmp[ i ] ) );
1254 for( i = 0; i < 4; i++ )
1255 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1256 for( i = 0; i < 4; i++ )
1257 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1258 high_words( itmp[ i ] ) );
1259
1260 /* Now we want to initialise the four gradients based on the
1261 hashes. Format conversion from signed integer to float leaves
1262 everything scaled too high by a factor of pow( 2, 15 ), but
1263 we correct for that right at the end. */
1264 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1265 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1266 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1267 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1268 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1269
1270 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1271 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1272 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1273 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1274
1275 brw_MUL( p, x1y0, x1y0, t );
1276 brw_MUL( p, x1y1, x1y1, t );
1277 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1278 brw_MUL( p, x0y0, x0y0, param0 );
1279 brw_MUL( p, x0y1, x0y1, param0 );
1280
1281 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1282 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1283 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1284 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1285
1286 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1287 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1288 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1289 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1290
1291 /* We interpolate between the gradients using the polynomial
1292 6t^5 - 15t^4 + 10t^3 (Perlin). */
1293 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1294 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1295 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1296 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1297 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1298 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1299 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1300 pipeline */
1301 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1302 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1303 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1304 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1305 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1306 pipeline */
1307 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1308 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1309 brw_MUL( p, param0, tmp[ 0 ], param0 );
1310 brw_MUL( p, param1, tmp[ 1 ], param1 );
1311
1312 /* Here we interpolate in the y dimension... */
1313 brw_MUL( p, x0y1, x0y1, param1 );
1314 brw_MUL( p, x1y1, x1y1, param1 );
1315 brw_ADD( p, x0y0, x0y0, x0y1 );
1316 brw_ADD( p, x1y0, x1y0, x1y1 );
1317
1318 /* And now in x. There are horrible register dependencies here,
1319 but we have nothing else to do. */
1320 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1321 brw_MUL( p, x1y0, x1y0, param0 );
1322 brw_ADD( p, x0y0, x0y0, x1y0 );
1323
1324 /* scale by pow( 2, -15 ), as described above */
1325 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1326
1327 release_tmps( c, mark );
1328 }
1329
1330 static void emit_noise2( struct brw_wm_compile *c,
1331 struct prog_instruction *inst )
1332 {
1333 struct brw_compile *p = &c->func;
1334 struct brw_reg src0, src1, param0, param1, dst;
1335 GLuint mask = inst->DstReg.WriteMask;
1336 int i;
1337 int mark = mark_tmps( c );
1338
1339 assert( mark == 0 );
1340
1341 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1342 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1343
1344 param0 = alloc_tmp( c );
1345 param1 = alloc_tmp( c );
1346
1347 brw_MOV( p, param0, src0 );
1348 brw_MOV( p, param1, src1 );
1349
1350 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1351
1352 /* Fill in the result: */
1353 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1354 for (i = 0 ; i < 4; i++) {
1355 if (mask & (1<<i)) {
1356 dst = get_dst_reg(c, inst, i, 1);
1357 brw_MOV( p, dst, param0 );
1358 }
1359 }
1360 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1361 brw_set_saturate( p, 0 );
1362
1363 release_tmps( c, mark );
1364 }
1365
1366 /* The three-dimensional case is much like the one- and two- versions above,
1367 but since the number of corners is rapidly growing we now pack 16 16-bit
1368 hashes into each register to extract more parallelism from the EUs. */
/* Calling convention, as visible in this routine: the three coordinates are
   read from the temporaries at indices mark - 4, mark - 3 and mark - 2, and
   the result is written back over the first of them (param0).  Every
   temporary allocated here is released again before returning. */
1369 static void noise3_sub( struct brw_wm_compile *c ) {
1370
1371 struct brw_compile *p = &c->func;
1372 struct brw_reg param0, param1, param2,
1373 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1374 xi, yi, zi, /* interpolation coefficients */
1375 t, tmp[ 8 ], /* float temporaries */
1376 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1377 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1378 int i;
1379 int mark = mark_tmps( c );
1380
1381 x0y0 = alloc_tmp( c );
1382 x0y1 = alloc_tmp( c );
1383 x1y0 = alloc_tmp( c );
1384 x1y1 = alloc_tmp( c );
1385 xi = alloc_tmp( c );
1386 yi = alloc_tmp( c );
1387 zi = alloc_tmp( c );
1388 t = alloc_tmp( c );
/* tmp[ i ], itmp[ i ] and wtmp[ i ] all name the same register: itmp is a
   retyped copy and wtmp is built from the same register number. */
1389 for( i = 0; i < 8; i++ ) {
1390 tmp[ i ] = alloc_tmp( c );
1391 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1392 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1393 }
1394
1395 param0 = lookup_tmp( c, mark - 4 );
1396 param1 = lookup_tmp( c, mark - 3 );
1397 param2 = lookup_tmp( c, mark - 2 );
1398
1399 brw_set_access_mode( p, BRW_ALIGN_1 );
1400
1401 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1402 be hashed. Also compute the remainders (offsets within the unit
1403 cube), interleaved to reduce register dependency penalties. */
1404 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1405 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1406 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1407 brw_FRC( p, param0, param0 );
1408 brw_FRC( p, param1, param1 );
1409 brw_FRC( p, param2, param2 );
1410 /* Since we now have only 16 bits of precision in the hash, we must
1411 be more careful about thorough mixing to maintain entropy as we
1412 squash the input vector into a small scalar. */
/* The first two multiply steps write to the null register; only the last
   MAC stores a value, into the low words of itmp[ 0 ].
   NOTE(review): this presumably relies on the accumulator carrying the
   running sum between the three instructions -- confirm against the
   hardware documentation. */
1413 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1414 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1415 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1416 brw_imm_uw( 0x9B93 ) );
/* High words = low words plus the multiplier that was applied to the first
   coordinate above (presumably the value for the neighbouring corner one
   step along that coordinate). */
1417 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1418 brw_imm_uw( 0xBC8F ) );
1419
1420 /* Temporarily disable the execution mask while we work with ExecSize=16
1421 channels (the mask is set for ExecSize=8 and is probably incorrect).
1422 Although this might cause execution of unwanted channels, the code
1423 writes only to temporary registers and has no side effects, so
1424 disabling the mask is harmless. */
1425 brw_push_insn_state( p );
1426 brw_set_mask_control( p, BRW_MASK_DISABLE );
/* Derive the remaining packed values by adding the multipliers used for the
   second (0xD0BD) and third (0x9B93) coordinates above. */
1427 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1428 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1429 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1430
1431 /* We're now ready to perform the hashing. The eight hashes are
1432 interleaved for performance. The hash function used is
1433 designed to rapidly achieve avalanche and require only 16x16
1434 bit multiplication, and 8-bit swizzles (which we get for
1435 free). */
1436 for( i = 0; i < 4; i++ )
1437 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1438 for( i = 0; i < 4; i++ )
1439 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1440 odd_bytes( wtmp[ i ] ) );
1441 for( i = 0; i < 4; i++ )
1442 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1443 for( i = 0; i < 4; i++ )
1444 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1445 odd_bytes( wtmp[ i ] ) );
1446 brw_pop_insn_state( p );
1447
1448 /* Now we want to initialise the four rear gradients based on the
1449 hashes. Format conversion from signed integer to float leaves
1450 everything scaled too high by a factor of pow( 2, 15 ), but
1451 we correct for that right at the end. */
1452 /* x component */
1453 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1454 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1455 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1456 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1457 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1458
/* Shift the packed hash words left by 5 before they are read again for the
   next component (presumably so each component draws on different hash
   bits). */
1459 brw_push_insn_state( p );
1460 brw_set_mask_control( p, BRW_MASK_DISABLE );
1461 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1462 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1463 brw_pop_insn_state( p );
1464
1465 brw_MUL( p, x1y0, x1y0, t );
1466 brw_MUL( p, x1y1, x1y1, t );
1467 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1468 brw_MUL( p, x0y0, x0y0, param0 );
1469 brw_MUL( p, x0y1, x0y1, param0 );
1470
1471 /* y component */
1472 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1473 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1474 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1475 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1476
1477 brw_push_insn_state( p );
1478 brw_set_mask_control( p, BRW_MASK_DISABLE );
1479 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1480 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1481 brw_pop_insn_state( p );
1482
1483 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1484 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
/* t is reloaded with param0 - 1 here; nothing writes t again before the
   x component of the front gradients below, which is its next reader. */
1485 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1486 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1487 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1488
1489 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1490 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1491 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1492 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1493
1494 /* z component */
1495 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1496 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1497 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1498 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1499
1500 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1501 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1502 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1503 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1504
1505 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1506 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1507 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1508 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1509
1510 /* We interpolate between the gradients using the polynomial
1511 6t^5 - 15t^4 + 10t^3 (Perlin). */
/* Evaluated per coordinate as ((6t - 15)t + 10) followed by three further
   multiplies by t. */
1512 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1513 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1514 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1515 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1516 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1517 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1518 brw_MUL( p, xi, xi, param0 );
1519 brw_MUL( p, yi, yi, param1 );
1520 brw_MUL( p, zi, zi, param2 );
1521 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1522 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1523 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1524 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1525 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1526 brw_MUL( p, xi, xi, param0 );
1527 brw_MUL( p, yi, yi, param1 );
1528 brw_MUL( p, zi, zi, param2 );
1529 brw_MUL( p, xi, xi, param0 );
1530 brw_MUL( p, yi, yi, param1 );
1531 brw_MUL( p, zi, zi, param2 );
1532 brw_MUL( p, xi, xi, param0 );
1533 brw_MUL( p, yi, yi, param1 );
1534 brw_MUL( p, zi, zi, param2 );
1535
1536 /* Here we interpolate in the y dimension... */
/* The differences x0y1 - x0y0 and x1y1 - x1y0 were already formed by the
   two "unrelated work" instructions above. */
1537 brw_MUL( p, x0y1, x0y1, yi );
1538 brw_MUL( p, x1y1, x1y1, yi );
1539 brw_ADD( p, x0y0, x0y0, x0y1 );
1540 brw_ADD( p, x1y0, x1y0, x1y1 );
1541
1542 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
/* Overwriting tmp[ 0 ] is safe at this point: the hash words it held are
   not read again; the front gradients below read tmp[ 2 ] and tmp[ 3 ]. */
1543 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1544 brw_MUL( p, x1y0, x1y0, xi );
1545 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1546
1547 /* Now do the same thing for the front four gradients... */
1548 /* x component */
1549 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1550 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1551 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1552 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1553
1554 brw_push_insn_state( p );
1555 brw_set_mask_control( p, BRW_MASK_DISABLE );
1556 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1557 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1558 brw_pop_insn_state( p );
1559
/* t still holds param0 - 1 from the rear-gradient section above. */
1560 brw_MUL( p, x1y0, x1y0, t );
1561 brw_MUL( p, x1y1, x1y1, t );
1562 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1563 brw_MUL( p, x0y0, x0y0, param0 );
1564 brw_MUL( p, x0y1, x0y1, param0 );
1565
1566 /* y component */
1567 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1568 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1569 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1570 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1571
1572 brw_push_insn_state( p );
1573 brw_set_mask_control( p, BRW_MASK_DISABLE );
1574 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1575 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1576 brw_pop_insn_state( p );
1577
1578 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1579 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
/* Prepare t = param2 - 1 for the z component of the front gradients; the
   rear gradients used param2 itself. */
1580 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1581 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1582 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1583
1584 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1585 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1586 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1587 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1588
1589 /* z component */
1590 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1591 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1592 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1593 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1594
1595 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1596 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1597 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1598 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1599
1600 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1601 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1602 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1603 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1604
1605 /* The interpolation coefficients are still around from last time, so
1606 again interpolate in the y dimension... */
/* Unlike the rear face, the corner differences are formed explicitly here
   because there is no coefficient computation to interleave them with. */
1607 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1608 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1609 brw_MUL( p, x0y1, x0y1, yi );
1610 brw_MUL( p, x1y1, x1y1, yi );
1611 brw_ADD( p, x0y0, x0y0, x0y1 );
1612 brw_ADD( p, x1y0, x1y0, x1y1 );
1613
1614 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1615 time put the front face in tmp[ 1 ] and we're nearly there... */
1616 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1617 brw_MUL( p, x1y0, x1y0, xi );
1618 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1619
1620 /* The final interpolation, in the z dimension: */
1621 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1622 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1623 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1624
1625 /* scale by pow( 2, -15 ), as described above */
/* The scaled result is stored in param0, where the caller picks it up. */
1626 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1627
1628 release_tmps( c, mark );
1629 }
1630
1631 static void emit_noise3( struct brw_wm_compile *c,
1632 struct prog_instruction *inst )
1633 {
1634 struct brw_compile *p = &c->func;
1635 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1636 GLuint mask = inst->DstReg.WriteMask;
1637 int i;
1638 int mark = mark_tmps( c );
1639
1640 assert( mark == 0 );
1641
1642 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1643 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1644 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
1645
1646 param0 = alloc_tmp( c );
1647 param1 = alloc_tmp( c );
1648 param2 = alloc_tmp( c );
1649
1650 brw_MOV( p, param0, src0 );
1651 brw_MOV( p, param1, src1 );
1652 brw_MOV( p, param2, src2 );
1653
1654 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1655
1656 /* Fill in the result: */
1657 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1658 for (i = 0 ; i < 4; i++) {
1659 if (mask & (1<<i)) {
1660 dst = get_dst_reg(c, inst, i, 1);
1661 brw_MOV( p, dst, param0 );
1662 }
1663 }
1664 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1665 brw_set_saturate( p, 0 );
1666
1667 release_tmps( c, mark );
1668 }
1669
1670 /* For the four-dimensional case, the little micro-optimisation benefits
1671 we obtain by unrolling all the loops aren't worth the massive bloat it
1672 now causes. Instead, we loop twice around performing a similar operation
1673 to noise3, once for the w=0 cube and once for the w=1, with a bit more
1674 code to glue it all together. */
1675 static void noise4_sub( struct brw_wm_compile *c )
1676 {
1677 struct brw_compile *p = &c->func;
1678 struct brw_reg param[ 4 ],
1679 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1680 w0, /* noise for the w=0 cube */
1681 floors[ 2 ], /* integer coordinates of base corner of hypercube */
1682 interp[ 4 ], /* interpolation coefficients */
1683 t, tmp[ 8 ], /* float temporaries */
1684 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1685 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1686 int i, j;
1687 int mark = mark_tmps( c );
1688 GLuint loop, origin;
1689
1690 x0y0 = alloc_tmp( c );
1691 x0y1 = alloc_tmp( c );
1692 x1y0 = alloc_tmp( c );
1693 x1y1 = alloc_tmp( c );
1694 t = alloc_tmp( c );
1695 w0 = alloc_tmp( c );
1696 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1697 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1698
1699 for( i = 0; i < 4; i++ ) {
1700 param[ i ] = lookup_tmp( c, mark - 5 + i );
1701 interp[ i ] = alloc_tmp( c );
1702 }
1703
1704 for( i = 0; i < 8; i++ ) {
1705 tmp[ i ] = alloc_tmp( c );
1706 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1707 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1708 }
1709
1710 brw_set_access_mode( p, BRW_ALIGN_1 );
1711
1712 /* We only want 16 bits of precision from the integral part of each
1713 co-ordinate, but unfortunately the RNDD semantics would saturate
1714 at 16 bits if we performed the operation directly to a 16-bit
1715 destination. Therefore, we round to 32-bit temporaries where
1716 appropriate, and then store only the lower 16 bits. */
1717 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1718 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1719 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1720 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1721 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1722 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
1723
1724 /* Modify the flag register here, because the side effect is useful
1725 later (see below). We know for certain that all flags will be
1726 cleared, since the FRC instruction cannot possibly generate
1727 negative results. Even for exceptional inputs (infinities, denormals,
1728 NaNs), the architecture guarantees that the L conditional is false. */
1729 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1730 brw_FRC( p, param[ 0 ], param[ 0 ] );
1731 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1732 for( i = 1; i < 4; i++ )
1733 brw_FRC( p, param[ i ], param[ i ] );
1734
1735 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1736 of all. */
1737 for( i = 0; i < 4; i++ )
1738 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1739 for( i = 0; i < 4; i++ )
1740 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1741 for( i = 0; i < 4; i++ )
1742 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1743 for( i = 0; i < 4; i++ )
1744 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1745 for( j = 0; j < 3; j++ )
1746 for( i = 0; i < 4; i++ )
1747 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1748
1749 /* Mark the current address, as it will be a jump destination. The
1750 following code will be executed twice: first, with the flag
1751 register clear indicating the w=0 case, and second with flags
1752 set for w=1. */
1753 loop = p->nr_insn;
1754
1755 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1756 be hashed. Since we have only 16 bits of precision in the hash, we
1757 must be careful about thorough mixing to maintain entropy as we
1758 squash the input vector into a small scalar. */
1759 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1760 brw_imm_uw( 0xBC8F ) );
1761 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1762 brw_imm_uw( 0xD0BD ) );
1763 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1764 brw_imm_uw( 0x9B93 ) );
1765 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1766 brw_imm_uw( 0xA359 ) );
1767 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1768 brw_imm_uw( 0xBC8F ) );
1769
1770 /* Temporarily disable the execution mask while we work with ExecSize=16
1771 channels (the mask is set for ExecSize=8 and is probably incorrect).
1772 Although this might cause execution of unwanted channels, the code
1773 writes only to temporary registers and has no side effects, so
1774 disabling the mask is harmless. */
1775 brw_push_insn_state( p );
1776 brw_set_mask_control( p, BRW_MASK_DISABLE );
1777 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1778 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1779 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1780
1781 /* We're now ready to perform the hashing. The eight hashes are
1782 interleaved for performance. The hash function used is
1783 designed to rapidly achieve avalanche and require only 16x16
1784 bit multiplication, and 8-bit swizzles (which we get for
1785 free). */
1786 for( i = 0; i < 4; i++ )
1787 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1788 for( i = 0; i < 4; i++ )
1789 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1790 odd_bytes( wtmp[ i ] ) );
1791 for( i = 0; i < 4; i++ )
1792 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1793 for( i = 0; i < 4; i++ )
1794 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1795 odd_bytes( wtmp[ i ] ) );
1796 brw_pop_insn_state( p );
1797
1798 /* Now we want to initialise the four rear gradients based on the
1799 hashes. Format conversion from signed integer to float leaves
1800 everything scaled too high by a factor of pow( 2, 15 ), but
1801 we correct for that right at the end. */
1802 /* x component */
1803 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1804 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1805 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1806 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1807 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1808
1809 brw_push_insn_state( p );
1810 brw_set_mask_control( p, BRW_MASK_DISABLE );
1811 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1812 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1813 brw_pop_insn_state( p );
1814
1815 brw_MUL( p, x1y0, x1y0, t );
1816 brw_MUL( p, x1y1, x1y1, t );
1817 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1818 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1819 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1820
1821 /* y component */
1822 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1823 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1824 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1825 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1826
1827 brw_push_insn_state( p );
1828 brw_set_mask_control( p, BRW_MASK_DISABLE );
1829 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1830 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1831 brw_pop_insn_state( p );
1832
1833 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1834 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1835 /* prepare t for the w component (used below): w the first time through
1836 the loop; w - 1 the second time) */
1837 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1838 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1839 p->current->header.predicate_inverse = 1;
1840 brw_MOV( p, t, param[ 3 ] );
1841 p->current->header.predicate_inverse = 0;
1842 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1843 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1844 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1845
1846 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1847 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1848 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1849 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1850
1851 /* z component */
1852 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1853 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1854 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1855 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1856
1857 brw_push_insn_state( p );
1858 brw_set_mask_control( p, BRW_MASK_DISABLE );
1859 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1860 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1861 brw_pop_insn_state( p );
1862
1863 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
1864 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
1865 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
1866 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
1867
1868 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1869 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1870 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1871 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1872
1873 /* w component */
1874 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1875 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1876 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1877 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1878
1879 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1880 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1881 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1882 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1883 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1884
1885 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1886 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1887 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1888 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1889
1890 /* Here we interpolate in the y dimension... */
1891 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1892 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1893 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
1894 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
1895 brw_ADD( p, x0y0, x0y0, x0y1 );
1896 brw_ADD( p, x1y0, x1y0, x1y1 );
1897
1898 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1899 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1900 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
1901 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1902
1903 /* Now do the same thing for the front four gradients... */
1904 /* x component */
1905 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1906 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1907 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1908 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1909
1910 brw_push_insn_state( p );
1911 brw_set_mask_control( p, BRW_MASK_DISABLE );
1912 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1913 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1914 brw_pop_insn_state( p );
1915
1916 brw_MUL( p, x1y0, x1y0, t );
1917 brw_MUL( p, x1y1, x1y1, t );
1918 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1919 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1920 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1921
1922 /* y component */
1923 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1924 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1925 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1926 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1927
1928 brw_push_insn_state( p );
1929 brw_set_mask_control( p, BRW_MASK_DISABLE );
1930 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1931 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1932 brw_pop_insn_state( p );
1933
1934 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1935 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1936 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
1937 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1938 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1939
1940 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1941 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1942 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1943 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1944
1945 /* z component */
1946 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1947 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1948 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1949 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1950
1951 brw_push_insn_state( p );
1952 brw_set_mask_control( p, BRW_MASK_DISABLE );
1953 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1954 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1955 brw_pop_insn_state( p );
1956
1957 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1958 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1959 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1960 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1961 /* prepare t for the w component (used below): w the first time through
1962 the loop; w - 1 the second time) */
1963 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1964 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1965 p->current->header.predicate_inverse = 1;
1966 brw_MOV( p, t, param[ 3 ] );
1967 p->current->header.predicate_inverse = 0;
1968 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1969
1970 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1971 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1972 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1973 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1974
1975 /* w component */
1976 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1977 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1978 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1979 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1980
1981 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1982 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1983 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1984 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1985
1986 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1987 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1988 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1989 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1990
1991 /* Interpolate in the y dimension: */
1992 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1993 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1994 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
1995 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
1996 brw_ADD( p, x0y0, x0y0, x0y1 );
1997 brw_ADD( p, x1y0, x1y0, x1y1 );
1998
1999 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2000 time put the front face in tmp[ 1 ] and we're nearly there... */
2001 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2002 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2003 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2004
2005 /* Another interpolation, in the z dimension: */
2006 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2007 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2008 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2009
2010 /* Exit the loop if we've computed both cubes... */
2011 origin = p->nr_insn;
2012 brw_push_insn_state( p );
2013 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2014 brw_set_mask_control( p, BRW_MASK_DISABLE );
2015 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2016 brw_pop_insn_state( p );
2017
2018 /* Save the result for the w=0 case, and increment the w coordinate: */
2019 brw_MOV( p, w0, tmp[ 0 ] );
2020 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2021 brw_imm_uw( 1 ) );
2022
2023 /* Loop around for the other cube. Explicitly set the flag register
2024 (unfortunately we must spend an extra instruction to do this: we
2025 can't rely on a side effect of the previous MOV or ADD because
2026 conditional modifiers which are normally true might be false in
2027 exceptional circumstances, e.g. given a NaN input; the add to
2028 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2029 brw_push_insn_state( p );
2030 brw_set_mask_control( p, BRW_MASK_DISABLE );
2031 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2032 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2033 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2034 brw_pop_insn_state( p );
2035
2036 /* Patch the previous conditional branch now that we know the
2037 destination address. */
2038 brw_set_src1( p->store + origin,
2039 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2040
2041 /* The very last interpolation. */
2042 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2043 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2044 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2045
2046 /* scale by pow( 2, -15 ), as described above */
2047 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2048
2049 release_tmps( c, mark );
2050 }
2051
2052 static void emit_noise4( struct brw_wm_compile *c,
2053 struct prog_instruction *inst )
2054 {
2055 struct brw_compile *p = &c->func;
2056 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2057 GLuint mask = inst->DstReg.WriteMask;
2058 int i;
2059 int mark = mark_tmps( c );
2060
2061 assert( mark == 0 );
2062
2063 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
2064 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
2065 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
2066 src3 = get_src_reg( c, inst->SrcReg, 3, 1 );
2067
2068 param0 = alloc_tmp( c );
2069 param1 = alloc_tmp( c );
2070 param2 = alloc_tmp( c );
2071 param3 = alloc_tmp( c );
2072
2073 brw_MOV( p, param0, src0 );
2074 brw_MOV( p, param1, src1 );
2075 brw_MOV( p, param2, src2 );
2076 brw_MOV( p, param3, src3 );
2077
2078 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2079
2080 /* Fill in the result: */
2081 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2082 for (i = 0 ; i < 4; i++) {
2083 if (mask & (1<<i)) {
2084 dst = get_dst_reg(c, inst, i, 1);
2085 brw_MOV( p, dst, param0 );
2086 }
2087 }
2088 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2089 brw_set_saturate( p, 0 );
2090
2091 release_tmps( c, mark );
2092 }
2093
2094 static void emit_wpos_xy(struct brw_wm_compile *c,
2095 struct prog_instruction *inst)
2096 {
2097 struct brw_compile *p = &c->func;
2098 GLuint mask = inst->DstReg.WriteMask;
2099 struct brw_reg src0[2], dst[2];
2100
2101 dst[0] = get_dst_reg(c, inst, 0, 1);
2102 dst[1] = get_dst_reg(c, inst, 1, 1);
2103
2104 src0[0] = get_src_reg(c, &inst->SrcReg[0], 0, 1);
2105 src0[1] = get_src_reg(c, &inst->SrcReg[0], 1, 1);
2106
2107 /* Calculate the pixel offset from window bottom left into destination
2108 * X and Y channels.
2109 */
2110 if (mask & WRITEMASK_X) {
2111 /* X' = X - origin_x */
2112 brw_ADD(p,
2113 dst[0],
2114 retype(src0[0], BRW_REGISTER_TYPE_W),
2115 brw_imm_d(0 - c->key.origin_x));
2116 }
2117
2118 if (mask & WRITEMASK_Y) {
2119 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2120 brw_ADD(p,
2121 dst[1],
2122 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2123 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2124 }
2125 }
2126
2127 /* TODO
2128 BIAS on SIMD8 not workind yet...
2129 */
2130 static void emit_txb(struct brw_wm_compile *c,
2131 struct prog_instruction *inst)
2132 {
2133 struct brw_compile *p = &c->func;
2134 struct brw_reg dst[4], src[4], payload_reg;
2135 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2136
2137 GLuint i;
2138 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2139 for (i = 0; i < 4; i++)
2140 dst[i] = get_dst_reg(c, inst, i, 1);
2141 for (i = 0; i < 4; i++)
2142 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2143
2144 switch (inst->TexSrcTarget) {
2145 case TEXTURE_1D_INDEX:
2146 brw_MOV(p, brw_message_reg(2), src[0]);
2147 brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
2148 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2149 break;
2150 case TEXTURE_2D_INDEX:
2151 case TEXTURE_RECT_INDEX:
2152 brw_MOV(p, brw_message_reg(2), src[0]);
2153 brw_MOV(p, brw_message_reg(3), src[1]);
2154 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2155 break;
2156 default:
2157 brw_MOV(p, brw_message_reg(2), src[0]);
2158 brw_MOV(p, brw_message_reg(3), src[1]);
2159 brw_MOV(p, brw_message_reg(4), src[2]);
2160 break;
2161 }
2162 brw_MOV(p, brw_message_reg(5), src[3]);
2163 brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
2164 brw_SAMPLE(p,
2165 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2166 1,
2167 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2168 unit + MAX_DRAW_BUFFERS, /* surface */
2169 unit, /* sampler */
2170 inst->DstReg.WriteMask,
2171 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
2172 4,
2173 4,
2174 0);
2175 }
2176
2177 static void emit_tex(struct brw_wm_compile *c,
2178 struct prog_instruction *inst)
2179 {
2180 struct brw_compile *p = &c->func;
2181 struct brw_reg dst[4], src[4], payload_reg;
2182 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2183
2184 GLuint msg_len;
2185 GLuint i, nr;
2186 GLuint emit;
2187 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2188
2189 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2190
2191 for (i = 0; i < 4; i++)
2192 dst[i] = get_dst_reg(c, inst, i, 1);
2193 for (i = 0; i < 4; i++)
2194 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2195
2196
2197 switch (inst->TexSrcTarget) {
2198 case TEXTURE_1D_INDEX:
2199 emit = WRITEMASK_X;
2200 nr = 1;
2201 break;
2202 case TEXTURE_2D_INDEX:
2203 case TEXTURE_RECT_INDEX:
2204 emit = WRITEMASK_XY;
2205 nr = 2;
2206 break;
2207 default:
2208 emit = WRITEMASK_XYZ;
2209 nr = 3;
2210 break;
2211 }
2212 msg_len = 1;
2213
2214 for (i = 0; i < nr; i++) {
2215 static const GLuint swz[4] = {0,1,2,2};
2216 if (emit & (1<<i))
2217 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2218 else
2219 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2220 msg_len += 1;
2221 }
2222
2223 if (shadow) {
2224 brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
2225 brw_MOV(p, brw_message_reg(6), src[2]);
2226 }
2227
2228 brw_SAMPLE(p,
2229 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2230 1,
2231 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2232 unit + MAX_DRAW_BUFFERS, /* surface */
2233 unit, /* sampler */
2234 inst->DstReg.WriteMask,
2235 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
2236 4,
2237 shadow ? 6 : 4,
2238 0);
2239
2240 if (shadow)
2241 brw_MOV(p, dst[3], brw_imm_f(1.0));
2242 }
2243
2244 static void post_wm_emit( struct brw_wm_compile *c )
2245 {
2246 GLuint nr_insns = c->fp->program.Base.NumInstructions;
2247 GLuint insn, target_insn;
2248 struct prog_instruction *inst1, *inst2;
2249 struct brw_instruction *brw_inst1, *brw_inst2;
2250 int offset;
2251 for (insn = 0; insn < nr_insns; insn++) {
2252 inst1 = &c->fp->program.Base.Instructions[insn];
2253 brw_inst1 = inst1->Data;
2254 switch (inst1->Opcode) {
2255 case OPCODE_CAL:
2256 target_insn = inst1->BranchTarget;
2257 inst2 = &c->fp->program.Base.Instructions[target_insn];
2258 brw_inst2 = inst2->Data;
2259 offset = brw_inst2 - brw_inst1;
2260 brw_set_src1(brw_inst1, brw_imm_d(offset*16));
2261 break;
2262 default:
2263 break;
2264 }
2265 }
2266 }
2267
2268 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2269 {
2270 #define MAX_IFSN 32
2271 #define MAX_LOOP_DEPTH 32
2272 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2273 struct brw_instruction *inst0, *inst1;
2274 int i, if_insn = 0, loop_insn = 0;
2275 struct brw_compile *p = &c->func;
2276 struct brw_indirect stack_index = brw_indirect(0, 0);
2277
2278 c->reg_index = 0;
2279 prealloc_reg(c);
2280 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2281 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2282
2283 for (i = 0; i < c->nr_fp_insns; i++) {
2284 struct prog_instruction *inst = &c->prog_instructions[i];
2285 struct prog_instruction *orig_inst;
2286
2287 if ((orig_inst = inst->Data) != 0)
2288 orig_inst->Data = current_insn(p);
2289
2290 if (inst->CondUpdate)
2291 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2292 else
2293 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2294
2295 switch (inst->Opcode) {
2296 case WM_PIXELXY:
2297 emit_pixel_xy(c, inst);
2298 break;
2299 case WM_DELTAXY:
2300 emit_delta_xy(c, inst);
2301 break;
2302 case WM_PIXELW:
2303 emit_pixel_w(c, inst);
2304 break;
2305 case WM_LINTERP:
2306 emit_linterp(c, inst);
2307 break;
2308 case WM_PINTERP:
2309 emit_pinterp(c, inst);
2310 break;
2311 case WM_CINTERP:
2312 emit_cinterp(c, inst);
2313 break;
2314 case WM_WPOSXY:
2315 emit_wpos_xy(c, inst);
2316 break;
2317 case WM_FB_WRITE:
2318 emit_fb_write(c, inst);
2319 break;
2320 case OPCODE_ABS:
2321 emit_abs(c, inst);
2322 break;
2323 case OPCODE_ADD:
2324 emit_add(c, inst);
2325 break;
2326 case OPCODE_SUB:
2327 emit_sub(c, inst);
2328 break;
2329 case OPCODE_FRC:
2330 emit_frc(c, inst);
2331 break;
2332 case OPCODE_FLR:
2333 emit_flr(c, inst);
2334 break;
2335 case OPCODE_LRP:
2336 emit_lrp(c, inst);
2337 break;
2338 case OPCODE_TRUNC:
2339 emit_trunc(c, inst);
2340 break;
2341 case OPCODE_MOV:
2342 emit_mov(c, inst);
2343 break;
2344 case OPCODE_DP3:
2345 emit_dp3(c, inst);
2346 break;
2347 case OPCODE_DP4:
2348 emit_dp4(c, inst);
2349 break;
2350 case OPCODE_XPD:
2351 emit_xpd(c, inst);
2352 break;
2353 case OPCODE_DPH:
2354 emit_dph(c, inst);
2355 break;
2356 case OPCODE_RCP:
2357 emit_rcp(c, inst);
2358 break;
2359 case OPCODE_RSQ:
2360 emit_rsq(c, inst);
2361 break;
2362 case OPCODE_SIN:
2363 emit_sin(c, inst);
2364 break;
2365 case OPCODE_COS:
2366 emit_cos(c, inst);
2367 break;
2368 case OPCODE_EX2:
2369 emit_ex2(c, inst);
2370 break;
2371 case OPCODE_LG2:
2372 emit_lg2(c, inst);
2373 break;
2374 case OPCODE_MAX:
2375 emit_max(c, inst);
2376 break;
2377 case OPCODE_MIN:
2378 emit_min(c, inst);
2379 break;
2380 case OPCODE_DDX:
2381 emit_ddx(c, inst);
2382 break;
2383 case OPCODE_DDY:
2384 emit_ddy(c, inst);
2385 break;
2386 case OPCODE_SLT:
2387 emit_slt(c, inst);
2388 break;
2389 case OPCODE_SLE:
2390 emit_sle(c, inst);
2391 break;
2392 case OPCODE_SGT:
2393 emit_sgt(c, inst);
2394 break;
2395 case OPCODE_SGE:
2396 emit_sge(c, inst);
2397 break;
2398 case OPCODE_SEQ:
2399 emit_seq(c, inst);
2400 break;
2401 case OPCODE_SNE:
2402 emit_sne(c, inst);
2403 break;
2404 case OPCODE_MUL:
2405 emit_mul(c, inst);
2406 break;
2407 case OPCODE_POW:
2408 emit_pow(c, inst);
2409 break;
2410 case OPCODE_MAD:
2411 emit_mad(c, inst);
2412 break;
2413 case OPCODE_NOISE1:
2414 emit_noise1(c, inst);
2415 break;
2416 case OPCODE_NOISE2:
2417 emit_noise2(c, inst);
2418 break;
2419 case OPCODE_NOISE3:
2420 emit_noise3(c, inst);
2421 break;
2422 case OPCODE_NOISE4:
2423 emit_noise4(c, inst);
2424 break;
2425 case OPCODE_TEX:
2426 emit_tex(c, inst);
2427 break;
2428 case OPCODE_TXB:
2429 emit_txb(c, inst);
2430 break;
2431 case OPCODE_KIL_NV:
2432 emit_kil(c);
2433 break;
2434 case OPCODE_IF:
2435 assert(if_insn < MAX_IFSN);
2436 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2437 break;
2438 case OPCODE_ELSE:
2439 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2440 break;
2441 case OPCODE_ENDIF:
2442 assert(if_insn > 0);
2443 brw_ENDIF(p, if_inst[--if_insn]);
2444 break;
2445 case OPCODE_BGNSUB:
2446 case OPCODE_ENDSUB:
2447 break;
2448 case OPCODE_CAL:
2449 brw_push_insn_state(p);
2450 brw_set_mask_control(p, BRW_MASK_DISABLE);
2451 brw_set_access_mode(p, BRW_ALIGN_1);
2452 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2453 brw_set_access_mode(p, BRW_ALIGN_16);
2454 brw_ADD(p, get_addr_reg(stack_index),
2455 get_addr_reg(stack_index), brw_imm_d(4));
2456 orig_inst = inst->Data;
2457 orig_inst->Data = &p->store[p->nr_insn];
2458 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2459 brw_pop_insn_state(p);
2460 break;
2461
2462 case OPCODE_RET:
2463 brw_push_insn_state(p);
2464 brw_set_mask_control(p, BRW_MASK_DISABLE);
2465 brw_ADD(p, get_addr_reg(stack_index),
2466 get_addr_reg(stack_index), brw_imm_d(-4));
2467 brw_set_access_mode(p, BRW_ALIGN_1);
2468 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2469 brw_set_access_mode(p, BRW_ALIGN_16);
2470 brw_pop_insn_state(p);
2471
2472 break;
2473 case OPCODE_BGNLOOP:
2474 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2475 break;
2476 case OPCODE_BRK:
2477 brw_BREAK(p);
2478 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2479 break;
2480 case OPCODE_CONT:
2481 brw_CONT(p);
2482 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2483 break;
2484 case OPCODE_ENDLOOP:
2485 loop_insn--;
2486 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2487 /* patch all the BREAK instructions from
2488 last BEGINLOOP */
2489 while (inst0 > loop_inst[loop_insn]) {
2490 inst0--;
2491 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2492 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2493 inst0->bits3.if_else.pop_count = 0;
2494 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2495 inst0->bits3.if_else.jump_count = inst1 - inst0;
2496 inst0->bits3.if_else.pop_count = 0;
2497 }
2498 }
2499 break;
2500 default:
2501 _mesa_printf("unsupported IR in fragment shader %d\n",
2502 inst->Opcode);
2503 }
2504 if (inst->CondUpdate)
2505 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2506 else
2507 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2508 }
2509 post_wm_emit(c);
2510 for (i = 0; i < c->fp->program.Base.NumInstructions; i++)
2511 c->fp->program.Base.Instructions[i].Data = NULL;
2512 }
2513
2514 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2515 {
2516 brw_wm_pass_fp(c);
2517 brw_wm_emit_glsl(brw, c);
2518 c->prog_data.total_grf = c->reg_index;
2519 c->prog_data.total_scratch = 0;
2520 }