X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fr600%2Fr600_shader.c;h=903a66302632f38161324b112778b8045a287468;hb=22fc5eff803d14e99e9f86db223ceedcef1b9108;hp=a462691f7aa7af0a1744f9a135bc4a5507b6154a;hpb=df155a73f4ba43fcc720c7b70b375ec9ed41bb89;p=mesa.git

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index a462691f7aa..903a6630263 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -197,6 +197,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 
 	use_sb &= !shader->shader.uses_atomics;
 	use_sb &= !shader->shader.uses_images;
+	use_sb &= !shader->shader.uses_helper_invocation;
 
 	/* Check if the bytecode has already been built. */
 	if (!shader->shader.bc.bytecode) {
@@ -346,6 +347,7 @@ struct r600_shader_ctx {
 	boolean clip_vertex_write;
 	unsigned cv_output;
 	unsigned edgeflag_output;
+	int helper_invoc_reg;
 	int cs_block_size_reg;
 	int cs_grid_size_reg;
 	bool cs_block_size_loaded, cs_grid_size_loaded;
@@ -1109,7 +1111,6 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off
 
 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
 				location = TGSI_INTERPOLATE_LOC_CENTER;
-				inputs[1].enabled = true; /* needs SAMPLEID */
 			} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
 				location = TGSI_INTERPOLATE_LOC_CENTER;
 				/* Needs sample positions, currently those are always available */
@@ -1137,6 +1138,24 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off
 
 	tgsi_parse_free(&parse);
 
+	if (ctx->info.reads_samplemask &&
+	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
+		inputs[1].enabled = true;
+	}
+
+	if (ctx->bc->chip_class >= EVERGREEN) {
+		int num_baryc = 0;
+		/* assign gpr to each interpolator according to priority */
+		for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
+			if (ctx->eg_interpolators[i].enabled) {
+				ctx->eg_interpolators[i].ij_index = num_baryc;
+				num_baryc++;
+			}
+		}
+		num_baryc = (num_baryc + 1) >> 1;
+		gpr_offset += num_baryc;
+	}
+
 	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
 		boolean enabled = inputs[i].enabled;
 		int *reg = inputs[i].reg;
@@ -1163,18 +1182,21 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off
  * for evergreen we need to scan the shader to find the number of GPRs we need to
  * reserve for interpolation and system values
  *
- * we need to know if we are going to emit
- * any sample or centroid inputs
+ * we need to know if we are going to emit any sample or centroid inputs
  * if perspective and linear are required
  */
 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
 {
 	unsigned i;
-	int num_baryc;
-	struct tgsi_parse_context parse;
 
 	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
 
+	/*
+	 * Could get this information from the shader info. But right now
+	 * we interpolate all declared inputs, whereas the shader info will
+	 * only contain the bits if the inputs are actually used, so it might
+	 * not be safe...
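+	 * (an input that is declared but never read has no bits set in the
+	 * info, yet it still gets an interpolator assigned here).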
+ */ for (i = 0; i < ctx->info.num_inputs; i++) { int k; /* skip position/face/mask/sampleid */ @@ -1191,53 +1213,9 @@ static int evergreen_gpr_count(struct r600_shader_ctx *ctx) ctx->eg_interpolators[k].enabled = TRUE; } - if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) { - return 0; - } - - /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */ - while (!tgsi_parse_end_of_tokens(&parse)) { - tgsi_parse_token(&parse); - - if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) { - const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction; - if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE || - inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || - inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID) - { - int interpolate, location, k; - - if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { - location = TGSI_INTERPOLATE_LOC_CENTER; - } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { - location = TGSI_INTERPOLATE_LOC_CENTER; - } else { - location = TGSI_INTERPOLATE_LOC_CENTROID; - } - - interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index]; - k = eg_get_interpolator_index(interpolate, location); - if (k >= 0) - ctx->eg_interpolators[k].enabled = true; - } - } - } - - tgsi_parse_free(&parse); - - /* assign gpr to each interpolator according to priority */ - num_baryc = 0; - for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) { - if (ctx->eg_interpolators[i].enabled) { - ctx->eg_interpolators[i].ij_index = num_baryc; - num_baryc ++; - } - } - /* XXX PULL MODEL and LINE STIPPLE */ - num_baryc = (num_baryc + 1) >> 1; - return allocate_system_value_inputs(ctx, num_baryc); + return allocate_system_value_inputs(ctx, 0); } /* sample_id_sel == NULL means fetch for current sample */ @@ -1246,8 +1224,6 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_ struct r600_bytecode_vtx vtx; int r, t1; - assert(ctx->fixed_pt_position_gpr != -1); - t1 = r600_get_temp(ctx); memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); @@ -1255,6 +1231,8 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_ vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER; vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; if (sample_id == NULL) { + assert(ctx->fixed_pt_position_gpr != -1); + vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w; vtx.src_sel_x = 3; } @@ -1295,6 +1273,75 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_ return t1; } +static int eg_load_helper_invocation(struct r600_shader_ctx *ctx) +{ + int r; + struct r600_bytecode_alu alu; + + /* do a vtx fetch with wqm set on the vtx fetch */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->helper_invoc_reg; + alu.dst.chan = 0; + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0xffffffff; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* do a vtx fetch in VPM mode */ + struct r600_bytecode_vtx vtx; + memset(&vtx, 0, sizeof(vtx)); + vtx.op = FETCH_OP_GET_BUFFER_RESINFO; + vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER; + vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; + vtx.src_gpr = 0; + vtx.mega_fetch_count = 16; /* no idea here really... 
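(16 matches the GET_BUFFER_RESINFO fetch in r600_do_buffer_txq below)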
*/ + vtx.dst_gpr = ctx->helper_invoc_reg; + vtx.dst_sel_x = 4; + vtx.dst_sel_y = 7; /* SEL_Y */ + vtx.dst_sel_z = 7; /* SEL_Z */ + vtx.dst_sel_w = 7; /* SEL_W */ + vtx.data_format = FMT_32; + if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx))) + return r; + ctx->bc->cf_last->vpm = 1; + return 0; +} + +static int cm_load_helper_invocation(struct r600_shader_ctx *ctx) +{ + int r; + struct r600_bytecode_alu alu; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->helper_invoc_reg; + alu.dst.chan = 0; + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0xffffffff; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->helper_invoc_reg; + alu.dst.chan = 0; + alu.src[0].sel = V_SQ_ALU_SRC_0; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE); + if (r) + return r; + + return ctx->helper_invoc_reg; +} + static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block) { struct r600_bytecode_vtx vtx; @@ -1458,6 +1505,12 @@ static void tgsi_src(struct r600_shader_ctx *ctx, r600_src->sel = load_block_grid_size(ctx, false); } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) { r600_src->sel = load_block_grid_size(ctx, true); + } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) { + r600_src->sel = ctx->helper_invoc_reg; + r600_src->swizzle[0] = 0; + r600_src->swizzle[1] = 0; + r600_src->swizzle[2] = 0; + r600_src->swizzle[3] = 0; } } else { if (tgsi_src->Register.Indirect) @@ -2136,7 +2189,7 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output for (i = 0; i < so->num_outputs; i++) { struct r600_bytecode_output output; - if (stream != -1 && stream != so->output[i].output_buffer) + if (stream != -1 && stream != so->output[i].stream) continue; memset(&output, 0, sizeof(struct r600_bytecode_output)); @@ -3120,6 +3173,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, tgsi_scan_shader(tokens, &ctx.info); shader->indirect_files = ctx.info.indirect_files; + shader->uses_helper_invocation = false; shader->uses_doubles = ctx.info.uses_doubles; shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC]; shader->nsys_inputs = 0; @@ -3169,7 +3223,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, break; case PIPE_SHADER_COMPUTE: shader->rat_base = 0; - shader->image_size_const_offset = 0; + shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER]; break; default: break; @@ -3193,6 +3247,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.clip_vertex_write = 0; ctx.thread_id_gpr_loaded = false; + ctx.helper_invoc_reg = -1; ctx.cs_block_size_reg = -1; ctx.cs_grid_size_reg = -1; ctx.cs_block_size_loaded = false; @@ -3238,6 +3293,13 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx); else ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]); + + for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { + if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) { + ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++; + shader->uses_helper_invocation = true; + } + } } if (ctx.type == PIPE_SHADER_GEOMETRY) { 
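 		/* presumably the first two GPRs are reserved here for the incoming
 		 * vertex offsets; cf. the gs_rotated_input defaults of 0 and 1 below */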
/* FIXME 1 would be enough in some cases (3 or less input vertices) */ @@ -3282,41 +3344,38 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.file_offset[TGSI_FILE_CONSTANT] = 512; ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL; - ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] + - ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1; - ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1; - ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2; + + int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] + + ctx.info.file_max[TGSI_FILE_TEMPORARY]; + ctx.bc->ar_reg = ++regno; + ctx.bc->index_reg[0] = ++regno; + ctx.bc->index_reg[1] = ++regno; if (ctx.type == PIPE_SHADER_TESS_CTRL) { - ctx.tess_input_info = ctx.bc->ar_reg + 3; - ctx.tess_output_info = ctx.bc->ar_reg + 4; - ctx.temp_reg = ctx.bc->ar_reg + 5; + ctx.tess_input_info = ++regno; + ctx.tess_output_info = ++regno; } else if (ctx.type == PIPE_SHADER_TESS_EVAL) { ctx.tess_input_info = 0; - ctx.tess_output_info = ctx.bc->ar_reg + 3; - ctx.temp_reg = ctx.bc->ar_reg + 4; + ctx.tess_output_info = ++regno; } else if (ctx.type == PIPE_SHADER_GEOMETRY) { - ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3; - ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4; - ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5; - ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6; - ctx.temp_reg = ctx.bc->ar_reg + 7; + ctx.gs_export_gpr_tregs[0] = ++regno; + ctx.gs_export_gpr_tregs[1] = ++regno; + ctx.gs_export_gpr_tregs[2] = ++regno; + ctx.gs_export_gpr_tregs[3] = ++regno; if (ctx.shader->gs_tri_strip_adj_fix) { - ctx.gs_rotated_input[0] = ctx.bc->ar_reg + 7; - ctx.gs_rotated_input[1] = ctx.bc->ar_reg + 8; - ctx.temp_reg += 2; + ctx.gs_rotated_input[0] = ++regno; + ctx.gs_rotated_input[1] = ++regno; } else { ctx.gs_rotated_input[0] = 0; ctx.gs_rotated_input[1] = 1; } - } else { - ctx.temp_reg = ctx.bc->ar_reg + 3; } if (shader->uses_images) { - ctx.thread_id_gpr = ctx.temp_reg++; + ctx.thread_id_gpr = ++regno; ctx.thread_id_gpr_loaded = false; } + ctx.temp_reg = ++regno; shader->max_arrays = 0; shader->num_arrays = 0; @@ -3439,6 +3498,64 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN) shader->nr_ps_max_color_exports = 8; + if (ctx.shader->uses_helper_invocation) { + if (ctx.bc->chip_class == CAYMAN) + r = cm_load_helper_invocation(&ctx); + else + r = eg_load_helper_invocation(&ctx); + if (r) + return r; + } + + /* + * XXX this relies on fixed_pt_position_gpr only being present when + * this shader should be executed per sample. Should be the case for now... + */ + if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) { + /* + * Fix up sample mask. The hw always gives us coverage mask for + * the pixel. However, for per-sample shading, we need the + * coverage for the shader invocation only. + * Also, with disabled msaa, only the first bit should be set + * (luckily the same fixup works for both problems). + * For now, we can only do it if we know this shader is always + * executed per sample (due to usage of bits in the shader + * forcing per-sample execution). + * If the fb is not multisampled, we'd do unnecessary work but + * it should still be correct. + * It will however do nothing for sample shading according + * to MinSampleShading. 
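+	 * (In short, the two instructions below compute
+	 * face_gpr.z &= 1 << fixed_pt_position_gpr.w.)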
+ */ + struct r600_bytecode_alu alu; + int tmp = r600_get_temp(&ctx); + assert(ctx.face_gpr != -1); + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + + alu.op = ALU_OP2_LSHL_INT; + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0x1; + alu.src[1].sel = ctx.fixed_pt_position_gpr; + alu.src[1].chan = 3; + alu.dst.sel = tmp; + alu.dst.chan = 0; + alu.dst.write = 1; + alu.last = 1; + if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) + return r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_AND_INT; + alu.src[0].sel = tmp; + alu.src[1].sel = ctx.face_gpr; + alu.src[1].chan = 2; + alu.dst.sel = ctx.face_gpr; + alu.dst.chan = 2; + alu.dst.write = 1; + alu.last = 1; + if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) + return r; + } + if (ctx.fragcoord_input >= 0) { if (ctx.bc->chip_class == CAYMAN) { for (j = 0 ; j < 4; j++) { @@ -3780,6 +3897,17 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, output[j].array_base = shader->output[i].sid; output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; shader->nr_ps_color_exports++; + shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4)); + + /* If the i-th target format is set, all previous target formats must + * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well. + */ + if (shader->output[i].sid > 0) + for (unsigned x = 0; x < shader->output[i].sid; x++) + shader->ps_color_export_mask |= (1 << (x*4)); + + if (shader->output[i].sid > shader->ps_export_highest) + shader->ps_export_highest = shader->output[i].sid; if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) { for (k = 1; k < max_color_exports; k++) { j++; @@ -3795,6 +3923,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, output[j].op = CF_OP_EXPORT; output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; shader->nr_ps_color_exports++; + shader->ps_color_export_mask |= (0xf << (j * 4)); } } } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) { @@ -3883,6 +4012,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, output[j].op = CF_OP_EXPORT; j++; shader->nr_ps_color_exports++; + shader->ps_color_export_mask = 0xf; } noutput = j; @@ -4395,44 +4525,109 @@ static int egcm_int_to_double(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bytecode_alu alu; - int i, r; - int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); + int i, c, r; + int write_mask = inst->Dst[0].Register.WriteMask; + int temp_reg = r600_get_temp(ctx); assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D || inst->Instruction.Opcode == TGSI_OPCODE_U2D); - for (i = 0; i <= (lasti+1)/2; i++) { - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ctx->inst_info->op; - - r600_bytecode_src(&alu.src[0], &ctx->src[0], i); - alu.dst.sel = ctx->temp_reg; - alu.dst.chan = i; - alu.dst.write = 1; - alu.last = 1; + for (c = 0; c < 2; c++) { + int dchan = c * 2; + if (write_mask & (0x3 << dchan)) { + /* split into 24-bit int and 8-bit int */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_AND_INT; + alu.dst.sel = temp_reg; + alu.dst.chan = dchan; + r600_bytecode_src(&alu.src[0], &ctx->src[0], c); + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = 0xffffff00; + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; - r = r600_bytecode_add_alu(ctx->bc, &alu); - if (r) - return r; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = 
ALU_OP2_AND_INT; + alu.dst.sel = temp_reg; + alu.dst.chan = dchan + 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], c); + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = 0xff; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } } - for (i = 0; i <= lasti; i++) { - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP1_FLT32_TO_FLT64; + for (c = 0; c < 2; c++) { + int dchan = c * 2; + if (write_mask & (0x3 << dchan)) { + for (i = dchan; i <= dchan + 1; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT; - alu.src[0].chan = i/2; - if (i%2 == 0) - alu.src[0].sel = ctx->temp_reg; - else { - alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; - alu.src[0].value = 0x0; + alu.src[0].sel = temp_reg; + alu.src[0].chan = i; + alu.dst.sel = temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + if (ctx->bc->chip_class == CAYMAN) + alu.last = i == dchan + 1; + else + alu.last = 1; /* trans only ops on evergreen */ + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } } - tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); - alu.last = i == lasti; + } - r = r600_bytecode_add_alu(ctx->bc, &alu); - if (r) - return r; + for (c = 0; c < 2; c++) { + int dchan = c * 2; + if (write_mask & (0x3 << dchan)) { + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_FLT32_TO_FLT64; + + alu.src[0].chan = dchan + (i / 2); + if (i == 0 || i == 2) + alu.src[0].sel = temp_reg; + else { + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0x0; + } + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.last = i == 3; + alu.dst.write = 1; + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + for (i = 0; i <= 1; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_ADD_64; + + alu.src[0].chan = fp64_switch(i); + alu.src[0].sel = ctx->temp_reg; + + alu.src[1].chan = fp64_switch(i + 2); + alu.src[1].sel = ctx->temp_reg; + tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst); + alu.last = i == 1; + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } } return 0; @@ -6850,7 +7045,7 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l return 0; } -static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset) +static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; int r; @@ -6876,7 +7071,7 @@ static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offs struct r600_bytecode_vtx vtx; memset(&vtx, 0, sizeof(vtx)); vtx.op = FETCH_OP_GET_BUFFER_RESINFO; - vtx.buffer_id = id + R600_MAX_CONST_BUFFERS; + vtx.buffer_id = id + eg_buffer_base; vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; vtx.src_gpr = 0; vtx.mega_fetch_count = 16; /* no idea here really... 
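(same value as the helper-invocation fetch above)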
*/ @@ -6950,7 +7145,7 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) { if (ctx->bc->chip_class < EVERGREEN) ctx->shader->uses_tex_buffers = true; - return r600_do_buffer_txq(ctx, 1, 0); + return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS); } else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) { if (ctx->bc->chip_class < EVERGREEN) @@ -7302,6 +7497,168 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) } } + if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) { + /* Gather4 should follow the same rules as bilinear filtering, but the hardware + * incorrectly forces nearest filtering if the texture format is integer. + * The only effect it has on Gather4, which always returns 4 texels for + * bilinear filtering, is that the final coordinates are off by 0.5 of + * the texel size. + * + * The workaround is to subtract 0.5 from the unnormalized coordinates, + * or (0.5 / size) from the normalized coordinates. + */ + if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT || + inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) { + int treg = r600_get_temp(ctx); + + /* mov array and comparison oordinate to temp_reg if needed */ + if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || + inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || + inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) { + int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2; + for (i = 2; i <= end; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.last = (i == end); + r600_bytecode_src(&alu.src[0], &ctx->src[0], i); + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } + + if (inst->Texture.Texture == TGSI_TEXTURE_RECT || + inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) { + for (i = 0; i < 2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_ADD; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.last = i == 1; + if (src_loaded) { + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = i; + } else + r600_bytecode_src(&alu.src[0], &ctx->src[0], i); + alu.src[1].sel = V_SQ_ALU_SRC_0_5; + alu.src[1].neg = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } else { + /* execute a TXQ */ + memset(&tex, 0, sizeof(struct r600_bytecode_tex)); + tex.op = FETCH_OP_GET_TEXTURE_RESINFO; + tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); + tex.sampler_index_mode = sampler_index_mode; + tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; + tex.resource_index_mode = sampler_index_mode; + tex.dst_gpr = treg; + tex.src_sel_x = 4; + tex.src_sel_y = 4; + tex.src_sel_z = 4; + tex.src_sel_w = 4; + tex.dst_sel_x = 0; + tex.dst_sel_y = 1; + tex.dst_sel_z = 7; + tex.dst_sel_w = 7; + r = r600_bytecode_add_tex(ctx->bc, &tex); + if (r) + return r; + + /* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */ + if (ctx->bc->chip_class == CAYMAN) { + /* */ + for (i = 0; i < 2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_INT_TO_FLT; + alu.dst.sel = treg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.src[0].sel = treg; + alu.src[0].chan = i; + alu.last = (i == 1) ? 
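/* only the last channel closes the ALU group */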
1 : 0; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + for (j = 0; j < 2; j++) { + for (i = 0; i < 3; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_RECIP_IEEE; + alu.src[0].sel = treg; + alu.src[0].chan = j; + alu.dst.sel = treg; + alu.dst.chan = i; + if (i == 2) + alu.last = 1; + if (i == j) + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } + } else { + for (i = 0; i < 2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_INT_TO_FLT; + alu.dst.sel = treg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.src[0].sel = treg; + alu.src[0].chan = i; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + for (i = 0; i < 2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_RECIP_IEEE; + alu.src[0].sel = treg; + alu.src[0].chan = i; + alu.dst.sel = treg; + alu.dst.chan = i; + alu.last = 1; + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } + for (i = 0; i < 2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP3_MULADD; + alu.is_op3 = 1; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.last = i == 1; + alu.src[0].sel = treg; + alu.src[0].chan = i; + alu.src[1].sel = V_SQ_ALU_SRC_0_5; + alu.src[1].neg = 1; + if (src_loaded) { + alu.src[2].sel = ctx->temp_reg; + alu.src[2].chan = i; + } else + r600_bytecode_src(&alu.src[2], &ctx->src[0], i); + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } + src_loaded = TRUE; + src_gpr = ctx->temp_reg; + } + } + if (src_requires_loading && !src_loaded) { for (i = 0; i < 4; i++) { memset(&alu, 0, sizeof(struct r600_bytecode_alu)); @@ -7619,15 +7976,15 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) tex.inst_mod = texture_component_select; if (ctx->bc->chip_class == CAYMAN) { - /* GATHER4 result order is different from TGSI TG4 */ - tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7; - tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7; - tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7; + tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; + tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; + tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; } else { - tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; - tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; - tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; + /* GATHER4 result order is different from TGSI TG4 */ + tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7; + tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7; + tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7; tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; } } @@ -8520,6 +8877,33 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx) if (r) return r; + if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) { + if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) { + int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]); + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = is_cm ? 
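/* must land in the channel gds.src_sel_z reads below */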
2 : 1; + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = value; + alu.last = 1; + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } else { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = is_cm ? 2 : 1; + r600_bytecode_src(&alu.src[0], &ctx->src[3], 0); + alu.last = 1; + alu.dst.write = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) { int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]); int abs_value = abs(value); @@ -8559,7 +8943,10 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx) gds.src_gpr2 = 0; gds.src_sel_x = is_cm ? 0 : 4; gds.src_sel_y = is_cm ? 1 : 0; - gds.src_sel_z = 7; + if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) + gds.src_sel_z = is_cm ? 2 : 1; + else + gds.src_sel_z = 7; gds.dst_sel_x = 0; gds.dst_sel_y = 7; gds.dst_sel_z = 7; @@ -8664,7 +9051,11 @@ static int tgsi_resq(struct r600_shader_ctx *ctx) (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) { if (ctx->bc->chip_class < EVERGREEN) ctx->shader->uses_tex_buffers = true; - return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset); + unsigned eg_buffer_base = 0; + eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET; + if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) + eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE]; + return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base); } if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&