#include "brw_eu.h"
#include "brw_wm.h"
+enum _subroutine {
+ SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
+};
+
/* Only guess, need a flag in gl_fragment_program later */
GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
{
case OPCODE_RET:
case OPCODE_DDX:
case OPCODE_DDY:
+ case OPCODE_NOISE1:
+ case OPCODE_NOISE2:
+ case OPCODE_NOISE3:
+ case OPCODE_NOISE4:
case OPCODE_BGNLOOP:
return GL_TRUE;
default:
return reg;
}
-static void release_tmps(struct brw_wm_compile *c)
+static int mark_tmps(struct brw_wm_compile *c)
+{
+ return c->tmp_index;
+}
+
+static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
+{
+ return brw_vec8_grf( c->tmp_regs[ index ], 0 );
+}
+
+static void release_tmps(struct brw_wm_compile *c, int mark)
{
- c->tmp_index = 0;
+ c->tmp_index = mark;
}
static struct brw_reg
src->NegateBase, src->Abs);
}
+/* Subroutines are minimal support for reusable instruction sequences.
+ They are implemented as simply as possible to minimise overhead: there
+ is no explicit support for communication between the caller and callee
+ other than saving the return address in a temporary register, nor is
+ there any automatic local storage. This implies that great care is
+ required before attempting reentrancy or any kind of nested
+ subroutine invocations. */
+static void invoke_subroutine( struct brw_wm_compile *c,
+ enum _subroutine subroutine,
+ void (*emit)( struct brw_wm_compile * ) )
+{
+ struct brw_compile *p = &c->func;
+
+ assert( subroutine < BRW_WM_MAX_SUBROUTINE );
+
+ if( c->subroutines[ subroutine ] ) {
+ /* subroutine previously emitted: reuse existing instructions */
+
+ int mark = mark_tmps( c );
+ struct brw_reg return_address = retype( alloc_tmp( c ),
+ BRW_REGISTER_TYPE_UD );
+ int here = p->nr_insn;
+
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
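+ /* The return address is the current IP plus two instructions (EU
+ instructions are 16 bytes each, hence the << 4): it skips this ADD
+ and the branch ADD below, landing just after the call sequence. */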
+ brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
+
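+ /* IP-relative branch into the subroutine: the immediate is the
+ distance from this ADD to the subroutine's first instruction,
+ in 16-byte instruction units. */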
+ brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
+ brw_imm_d( ( c->subroutines[ subroutine ] -
+ here - 1 ) << 4 ) );
+ brw_pop_insn_state(p);
+
+ release_tmps( c, mark );
+ } else {
+ /* previously unused subroutine: emit, and mark for later reuse */
+
+ int mark = mark_tmps( c );
+ struct brw_reg return_address = retype( alloc_tmp( c ),
+ BRW_REGISTER_TYPE_UD );
+ struct brw_instruction *calc;
+ int base = p->nr_insn;
+
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
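+ /* Compute the return address with a placeholder immediate; src1 is
+ patched below once the length of the emitted body is known. */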
+ calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
+ brw_pop_insn_state(p);
+
+ c->subroutines[ subroutine ] = p->nr_insn;
+
+ emit( c );
+
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV( p, brw_ip_reg(), return_address );
+ brw_pop_insn_state(p);
+
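+ /* Patch the placeholder so the return address points just past the
+ inline copy of the subroutine body emitted above. */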
+ brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
+
+ release_tmps( c, mark );
+ }
+}
+
static void emit_abs( struct brw_wm_compile *c,
struct prog_instruction *inst)
{
GLuint mask = inst->DstReg.WriteMask;
struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
int i;
+ int mark = mark_tmps(c);
for (i = 0; i < 4; i++) {
if (mask & (1<<i)) {
dst = get_dst_reg(c, inst, i, 1);
brw_MAC(p, dst, src0, tmp1);
brw_set_saturate(p, 0);
}
- release_tmps(c);
+ release_tmps(c, mark);
}
}
brw_set_saturate(p, 0);
}
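+/* Helpers for viewing each 32-bit channel of a register as two 16-bit
+ words: high_words selects the upper word of every channel and
+ low_words the lower, both as W-typed regions. */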
+static __inline struct brw_reg high_words( struct brw_reg reg )
+{
+ return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
+ 0, 8, 2 );
+}
+
+static __inline struct brw_reg low_words( struct brw_reg reg )
+{
+ return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
+}
+
+/* One- and two-dimensional Perlin noise, similar to the description in
+ _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
+static void noise1_sub( struct brw_wm_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg param,
+ x0, x1, /* gradients at each end */
+ t, tmp[ 2 ], /* float temporaries */
+ itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
+ int i;
+ int mark = mark_tmps( c );
+
+ x0 = alloc_tmp( c );
+ x1 = alloc_tmp( c );
+ t = alloc_tmp( c );
+ tmp[ 0 ] = alloc_tmp( c );
+ tmp[ 1 ] = alloc_tmp( c );
+ itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
+ itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
+ itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
+ itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
+ itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
+
+ param = lookup_tmp( c, mark - 2 );
+
+ brw_set_access_mode( p, BRW_ALIGN_1 );
+
+ brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* first hash-round multiplier */
+
+ /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
+ be hashed. Also compute the remainder (offset within the unit
+ length), interleaved to reduce register dependency penalties. */
+ brw_RNDD( p, itmp[ 0 ], param );
+ brw_FRC( p, param, param );
+ brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
+ brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* second hash-round multiplier */
+ brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* third hash-round multiplier */
+
+ /* We're now ready to perform the hashing. The two hashes are
+ interleaved for performance. The hash function used is
+ designed to rapidly achieve avalanche and require only 32x16
+ bit multiplication, and 16-bit swizzles (which we get for
+ free). We can't use immediate operands in the multiplies,
+ because immediates are permitted only in src1 and the 16-bit
+ factor is permitted only in src0. */
+ for( i = 0; i < 2; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
+ for( i = 0; i < 2; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+ for( i = 0; i < 2; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
+ for( i = 0; i < 2; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+ for( i = 0; i < 2; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
+ for( i = 0; i < 2; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+
+ /* Now we want to initialise the two gradients based on the
+ hashes. Format conversion from signed integer to float leaves
+ everything scaled too high by a factor of pow( 2, 31 ), but
+ we correct for that right at the end. */
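+ /* Each hash, reinterpreted as a signed 32-bit integer, acts as the
+ gradient at its end of the interval; it is multiplied by the signed
+ offset of the sample point from that end (param for x0, param - 1
+ for x1). */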
+ brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
+ brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
+ brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
+
+ brw_MUL( p, x0, x0, param );
+ brw_MUL( p, x1, x1, t );
+
+ /* We interpolate between the gradients using the polynomial
+ 6t^5 - 15t^4 + 10t^3 (Perlin). */
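+ /* The fade curve is evaluated in Horner form, ((6t - 15)t + 10)t^3,
+ accumulated in tmp[ 0 ] and finally folded into param. */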
+ brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
+ brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
+ brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
+ brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
+ pipeline */
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
+ brw_MUL( p, param, tmp[ 0 ], param );
+ brw_MUL( p, x1, x1, param );
+ brw_ADD( p, x0, x0, x1 );
+ /* scale by pow( 2, -30 ), to compensate for the format conversion
+ above and an extra factor of 2 so that a single gradient covers
+ the [-1,1] range */
+ brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
+
+ release_tmps( c, mark );
+}
+
+static void emit_noise1( struct brw_wm_compile *c,
+ struct prog_instruction *inst )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src, param, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ int mark = mark_tmps( c );
+
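+ /* The subroutine body is emitted with concrete temporary register
+ numbers, so every call site must enter with the same allocation
+ state: no temporaries live. */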
+ assert( mark == 0 );
+
+ src = get_src_reg( c, inst->SrcReg, 0, 1 );
+
+ param = alloc_tmp( c );
+
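+ /* param carries the operand into the subroutine and receives the
+ noise value back from it. */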
+ brw_MOV( p, param, src );
+
+ invoke_subroutine( c, SUB_NOISE1, noise1_sub );
+
+ /* Fill in the result: */
+ brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ brw_MOV( p, dst, param );
+ }
+ }
+ if( inst->SaturateMode == SATURATE_ZERO_ONE )
+ brw_set_saturate( p, 0 );
+
+ release_tmps( c, mark );
+}
+
+static void noise2_sub( struct brw_wm_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg param0, param1,
+ x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
+ t, tmp[ 4 ], /* float temporaries */
+ itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
+ int i;
+ int mark = mark_tmps( c );
+
+ x0y0 = alloc_tmp( c );
+ x0y1 = alloc_tmp( c );
+ x1y0 = alloc_tmp( c );
+ x1y1 = alloc_tmp( c );
+ t = alloc_tmp( c );
+ for( i = 0; i < 4; i++ ) {
+ tmp[ i ] = alloc_tmp( c );
+ itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
+ }
+ itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
+ itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
+ itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
+
+ param0 = lookup_tmp( c, mark - 3 );
+ param1 = lookup_tmp( c, mark - 2 );
+
+ brw_set_access_mode( p, BRW_ALIGN_1 );
+
+ /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
+ be hashed. Also compute the remainders (offsets within the unit
+ square), interleaved to reduce register dependency penalties. */
+ brw_RNDD( p, itmp[ 0 ], param0 );
+ brw_RNDD( p, itmp[ 1 ], param1 );
+ brw_FRC( p, param0, param0 );
+ brw_FRC( p, param1, param1 );
+ brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* first hash-round multiplier */
+ brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
+ low_words( itmp[ 1 ] ) );
+ brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* second hash-round multiplier */
+ brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* third hash-round multiplier */
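+ /* itmp0 now packs the base lattice corner, x in the low 16 bits and
+ y in the high 16; adding 0x1, 0x10000 and 0x10001 produces the
+ (x+1,y), (x,y+1) and (x+1,y+1) corners respectively. */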
+ brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
+ brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
+ brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
+
+ /* We're now ready to perform the hashing. The four hashes are
+ interleaved for performance. The hash function used is
+ designed to rapidly achieve avalanche and require only 32x16
+ bit multiplication, and 16-bit swizzles (which we get for
+ free). We can't use immediate operands in the multiplies,
+ because immediates are permitted only in src1 and the 16-bit
+ factor is permitted only in src0. */
+ for( i = 0; i < 4; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
+ for( i = 0; i < 4; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+ for( i = 0; i < 4; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
+ for( i = 0; i < 4; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+ for( i = 0; i < 4; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
+ for( i = 0; i < 4; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+
+ /* Now we want to initialise the four gradients based on the
+ hashes. Format conversion from signed integer to float leaves
+ everything scaled too high by a factor of pow( 2, 15 ), but
+ we correct for that right at the end. */
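+ /* The low 16-bit word of each hash becomes the x component of that
+ corner's gradient and the high word the y component; each gradient
+ is then dotted with the offset from its corner to the sample
+ point. */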
+ brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
+ brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
+ brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
+ brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
+ brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
+
+ brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
+ brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
+ brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
+ brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
+
+ brw_MUL( p, x1y0, x1y0, t );
+ brw_MUL( p, x1y1, x1y1, t );
+ brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
+ brw_MUL( p, x0y0, x0y0, param0 );
+ brw_MUL( p, x0y1, x0y1, param0 );
+
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
+ brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
+ brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
+ brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
+
+ brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
+ brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
+ brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
+ brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
+
+ /* We interpolate between the gradients using the polynomial
+ 6t^5 - 15t^4 + 10t^3 (Perlin). */
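+ /* Both fade curves are evaluated in Horner form, ((6t - 15)t + 10)t^3,
+ interleaved across the two coordinates. */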
+ brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
+ brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
+ brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
+ brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
+ brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
+ brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
+ pipeline */
+ brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
+ brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
+ brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
+ brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
+ pipeline */
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
+ brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
+ brw_MUL( p, param0, tmp[ 0 ], param0 );
+ brw_MUL( p, param1, tmp[ 1 ], param1 );
+
+ /* Here we interpolate in the y dimension... */
+ brw_MUL( p, x0y1, x0y1, param1 );
+ brw_MUL( p, x1y1, x1y1, param1 );
+ brw_ADD( p, x0y0, x0y0, x0y1 );
+ brw_ADD( p, x1y0, x1y0, x1y1 );
+
+ /* And now in x. There are horrible register dependencies here,
+ but we have nothing else to do. */
+ brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
+ brw_MUL( p, x1y0, x1y0, param0 );
+ brw_ADD( p, x0y0, x0y0, x1y0 );
+
+ /* scale by pow( 2, -15 ), as described above */
+ brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
+
+ release_tmps( c, mark );
+}
+
+static void emit_noise2( struct brw_wm_compile *c,
+ struct prog_instruction *inst )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, src1, param0, param1, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ int mark = mark_tmps( c );
+
+ assert( mark == 0 );
+
+ src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
+ src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
+
+ param0 = alloc_tmp( c );
+ param1 = alloc_tmp( c );
+
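+ /* param0 and param1 carry the two operands into the subroutine; the
+ noise value comes back in param0. */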
+ brw_MOV( p, param0, src0 );
+ brw_MOV( p, param1, src1 );
+
+ invoke_subroutine( c, SUB_NOISE2, noise2_sub );
+
+ /* Fill in the result: */
+ brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ brw_MOV( p, dst, param0 );
+ }
+ }
+ if( inst->SaturateMode == SATURATE_ZERO_ONE )
+ brw_set_saturate( p, 0 );
+
+ release_tmps( c, mark );
+}
+
static void emit_wpos_xy(struct brw_wm_compile *c,
struct prog_instruction *inst)
{
case OPCODE_MAD:
emit_mad(c, inst);
break;
+ case OPCODE_NOISE1:
+ emit_noise1(c, inst);
+ break;
+ case OPCODE_NOISE2:
+ emit_noise2(c, inst);
+ break;
+ /* case OPCODE_NOISE3: */
+ /* case OPCODE_NOISE4: */
+ /* not yet implemented */
case OPCODE_TEX:
emit_tex(c, inst);
break;