src/mesa/drivers/dri/i965/brw_compiler.h

   1 /*
   2  * Copyright © 2010 - 2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #pragma once
  25
  26 #include "brw_device_info.h"
  27 #include "main/mtypes.h"
  28
  29 #ifdef __cplusplus
  30 extern "C" {
  31 #endif
  32
  33 struct ra_regs;
  34 struct nir_shader;
  35 struct brw_geometry_program;
  36 union gl_constant_value;
  37
     /**
      * Compiler-wide state.  A const pointer to this structure is the first
      * argument of every brw_compile_*() entry point declared in this header.
      *
      * NOTE(review): presumably created once per device via
      * brw_compiler_create() and shared by all compiles -- confirm.
      */
  38 struct brw_compiler {
        /** Description of the device being compiled for. */
  39    const struct brw_device_info *devinfo;
  40
        /**
         * Register allocation set named for the vec4 back-end.
         */
  41    struct {
  42       struct ra_regs *regs;
  43
  44       /**
  45        * Array of the ra classes for the unaligned contiguous register
  46        * block sizes used.
  47        */
  48       int *classes;
  49
  50       /**
  51        * Mapping for register-allocated objects in *regs to the first
  52        * GRF for that object.
  53        */
  54       uint8_t *ra_reg_to_grf;
  55    } vec4_reg_set;
  56
        /**
         * Register allocation sets named for the FS (scalar) back-end.
         *
         * NOTE(review): what the two array indices select is not visible in
         * this header (presumably one set per dispatch width) -- confirm.
         */
  57    struct {
  58       struct ra_regs *regs;
  59
  60       /**
  61        * Array of the ra classes for the unaligned contiguous register
  62        * block sizes used, indexed by register size.
  63        */
  64       int classes[16];
  65
  66       /**
  67        * Mapping from classes to ra_reg ranges.  Each of the per-size
  68        * classes corresponds to a range of ra_reg nodes.  This array stores
  69        * those ranges in the form of first ra_reg in each class and the
  70        * total number of ra_reg elements in the last array element.  This
  71        * way the range of the i'th class is given by:
  72        * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] )
            *
            * It therefore has one more element than classes[] (17 vs. 16).
  73        */
  74       int class_to_ra_reg_range[17];
  75
  76       /**
  77        * Mapping for register-allocated objects in *regs to the first
  78        * GRF for that object.
  79        */
  80       uint8_t *ra_reg_to_grf;
  81
  82       /**
  83        * ra class for the aligned pairs we use for PLN, which doesn't
  84        * appear in *classes.
  85        */
  86       int aligned_pairs_class;
  87    } fs_reg_sets[2];
  88
        /**
         * Logging callbacks.  Each takes an opaque pointer followed by a
         * printf-style format string and its arguments (PRINTFLIKE(2, 3)).
         *
         * NOTE(review): the opaque pointer is presumably the log_data value
         * passed to brw_compile_*() -- confirm against the callers.
         */
  89    void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
  90    void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
  91
        /* NOTE(review): presumably select the scalar back-end (rather than
         * vec4) for vertex and geometry shaders respectively -- confirm.
         */
  92    bool scalar_vs;
  93    bool scalar_gs;
        /** GLSL compiler options, one entry per shader stage. */
  94    struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
  95 };
  96
     /**
      * Create a brw_compiler for the device described by devinfo.
      *
      * NOTE(review): mem_ctx is presumably the memory context that owns the
      * returned object -- confirm against the implementation.
      */
  97 struct brw_compiler *
  98 brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo);
  99
 100
 101 /**
 102  * Program key structures.
 103  *
 104  * When drawing, we look for the currently bound shaders in the program
 105  * cache.  This is essentially a hash table lookup, and these are the keys.
 106  *
 107  * Sometimes OpenGL features specified as state need to be simulated via
 108  * shader code, due to a mismatch between the API and the hardware.  This
 109  * is often referred to as "non-orthogonal state" or "NOS".  We store NOS
 110  * in the program key so it's considered when searching for a program.  If
 111  * we haven't seen a particular combination before, we have to recompile a
 112  * new specialized version.
 113  *
 114  * Shader compilation should not look up state in gl_context directly, but
 115  * instead use the copy in the program key.  This guarantees recompiles will
 116  * happen correctly.
 117  *
 118  *  @{
 119  */
 120
     /**
      * Gather workaround flags for Sandybridge, stored per sampler in
      * brw_sampler_prog_key_data::gen6_gather_wa.
      *
      * Each enumerator is a distinct bit (1, 2, 4).
      *
      * NOTE(review): PRESUMABLY the bits are meant to be OR'd together (e.g.
      * WA_SIGN with one of the format bits) and PACKED shrinks the enum to a
      * single byte -- confirm.
      */
 121 enum PACKED gen6_gather_sampler_wa {
 122    WA_SIGN = 1,      /* whether we need to sign extend */
 123    WA_8BIT = 2,      /* if we have an 8bit format needing wa */
 124    WA_16BIT = 4,     /* if we have a 16bit format needing wa */
 125 };
 126
 127 /**
 128  * Sampler information needed by the VS, GS, WM, and CS program cache keys
      * (each of those key structs embeds one of these as its "tex" member).
 129  */
 130 struct brw_sampler_prog_key_data {
 131    /**
 132     * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles.
 133     */
 134    uint16_t swizzles[MAX_SAMPLERS];
 135
        /* NOTE(review): presumably bitmasks of samplers needing GL_CLAMP
         * emulation, one mask per texture coordinate -- confirm.
         */
 136    uint32_t gl_clamp_mask[3];
 137
 138    /**
 139     * For RG32F, gather4's channel select is broken.
         *
         * NOTE(review): presumably a bitmask with one bit per affected
         * sampler -- confirm.
 140     */
 141    uint32_t gather_channel_quirk_mask;
 142
 143    /**
 144     * Mask of samplers that use the compressed multisample surface layout.
 145     */
 146    uint32_t compressed_multisample_layout_mask;
 147
 148    /**
 149     * Whether this sampler is using 16x multisampling. If so fetching from
 150     * this sampler will be handled with a different instruction, ld2dms_w
 151     * instead of ld2dms.
         *
         * NOTE(review): stored as a uint32_t, presumably a per-sampler bitmask
         * like the fields above -- confirm.
 152     */
 153    uint32_t msaa_16;
 154
 155    /**
 156     * For Sandybridge, which shader w/a we need for gather quirks.
 157     */
 158    enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS];
 159 };
 160
 161
 162 /** The program key for Vertex Shaders. */
 163 struct brw_vs_prog_key {
        /* NOTE(review): presumably identifies the program this key
         * specializes -- confirm.
         */
 164    unsigned program_string_id;
 165
 166    /*
 167     * Per-attribute workaround flags, one byte per vertex attribute.
 168     */
 169    uint8_t gl_attrib_wa_flags[VERT_ATTRIB_MAX];
 170
 171    bool copy_edgeflag:1;
 172
 173    bool clamp_vertex_color:1;
 174
 175    /**
 176     * How many user clipping planes are being uploaded to the vertex shader as
 177     * push constants.
 178     *
 179     * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
 180     * clip distances.
         *
         * The field is 4 bits wide, so it can hold values 0-15.
 181     */
 182    unsigned nr_userclip_plane_consts:4;
 183
 184    /**
 185     * For pre-Gen6 hardware, a bitfield indicating which texture coordinates
 186     * are going to be replaced with point coordinates (as a consequence of a
 187     * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)).  Because
 188     * our SF thread requires exact matching between VS outputs and FS inputs,
 189     * these texture coordinates will need to be unconditionally included in
 190     * the VUE, even if they aren't written by the vertex shader.
 191     */
 192    uint8_t point_coord_replace;
 193
        /** Sampler state the compiled program depends on. */
 194    struct brw_sampler_prog_key_data tex;
 195 };
 196
 197 /** The program key for Geometry Shaders. */
 198 struct brw_gs_prog_key
 199 {
        /* NOTE(review): presumably identifies the program this key
         * specializes -- confirm.
         */
 200    unsigned program_string_id;
 201
        /** Sampler state the compiled program depends on. */
 202    struct brw_sampler_prog_key_data tex;
 203 };
 204
 205 /** The program key for Fragment/Pixel Shaders. */
 206 struct brw_wm_prog_key {
 207    uint8_t iz_lookup;
        /* One- and few-bit flags packed together as bit-fields. */
 208    bool stats_wm:1;
 209    bool flat_shade:1;
 210    bool persample_shading:1;
 211    bool persample_2x:1;
 212    unsigned nr_color_regions:5;     /**< 5 bits wide: holds 0-31 */
 213    bool replicate_alpha:1;
 214    bool render_to_fbo:1;
 215    bool clamp_fragment_color:1;
 216    bool compute_pos_offset:1;
 217    bool compute_sample_id:1;
 218    unsigned line_aa:2;
 219    bool high_quality_derivatives:1;
 220
 221    uint16_t drawable_height;
 222    uint64_t input_slots_valid;
 223    unsigned program_string_id;
 224    GLenum alpha_test_func;          /**< For Gen4/5 MRT alpha test */
 225    float alpha_test_ref;
 226
        /** Sampler state the compiled program depends on. */
 227    struct brw_sampler_prog_key_data tex;
 228 };
 229
     /** The program key for Compute Shaders. */
 230 struct brw_cs_prog_key {
        /* NOTE(review): presumably identifies the program this key
         * specializes -- confirm.
         */
 231    uint32_t program_string_id;
        /** Sampler state the compiled program depends on. */
 232    struct brw_sampler_prog_key_data tex;
 233 };
 234
 235 /*
 236  * Image metadata structure as laid out in the shader parameter
 237  * buffer.  Entries have to be 16B-aligned for the vec4 back-end to be
 238  * able to use them.  That's okay because the padding and any unused
 239  * entries [most of them except when we're doing untyped surface
 240  * access] will be removed by the uniform packing pass.
      *
      * The *_OFFSET macros give the start of each struct brw_image_param
      * member within that layout, in the same order as the members are
      * declared.  Consecutive offsets are 4 apart even for members with fewer
      * than four components, and BRW_IMAGE_PARAM_SIZE is the total (6 entries
      * of 4).
      *
      * NOTE(review): given the 16B alignment requirement and the spacing of 4,
      * the unit appears to be one 32-bit component -- confirm.
 241  */
 242 #define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET      0
 243 #define BRW_IMAGE_PARAM_OFFSET_OFFSET           4
 244 #define BRW_IMAGE_PARAM_SIZE_OFFSET             8
 245 #define BRW_IMAGE_PARAM_STRIDE_OFFSET           12
 246 #define BRW_IMAGE_PARAM_TILING_OFFSET           16
 247 #define BRW_IMAGE_PARAM_SWIZZLING_OFFSET        20
 248 #define BRW_IMAGE_PARAM_SIZE                    24
 249
     /**
      * Metadata describing one image.  brw_stage_prog_data::image_param points
      * at these, and the BRW_IMAGE_PARAM_* macros describe how the members are
      * laid out in the shader parameter buffer.
      */
 250 struct brw_image_param {
 251    /** Surface binding table index. */
 252    uint32_t surface_idx;
 253
 254    /** Offset applied to the X and Y surface coordinates. */
 255    uint32_t offset[2];
 256
 257    /** Surface X, Y and Z dimensions. */
 258    uint32_t size[3];
 259
 260    /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in
 261     * pixels, vertical slice stride in pixels.
 262     */
 263    uint32_t stride[4];
 264
 265    /** Log2 of the tiling modulus in the X, Y and Z dimension. */
 266    uint32_t tiling[3];
 267
 268    /**
 269     * Right shift to apply for bit 6 address swizzling.  Two different
 270     * swizzles can be specified and will be applied one after the other.  The
 271     * resulting address will be:
 272     *
 273     *  addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^
 274     *                              (addr >> swizzling[1])))
 275     *
 276     * Use \c 0xff if any of the swizzles is not required.
 277     */
 278    uint32_t swizzling[2];
 279 };
 280
     /**
      * Compile results common to every shader stage.  The stage-specific
      * structures in this header (brw_wm_prog_data, brw_cs_prog_data and
      * brw_vue_prog_data) embed it as their first member, named "base".
      */
 281 struct brw_stage_prog_data {
 282    struct {
 283       /** size of our binding table. */
 284       uint32_t size_bytes;
 285
 286       /** @{
 287        * surface indices for the various groups of surfaces
 288        */
 289       uint32_t pull_constants_start;
 290       uint32_t texture_start;
 291       uint32_t gather_texture_start;
 292       uint32_t ubo_start;
 293       uint32_t ssbo_start;
 294       uint32_t abo_start;
 295       uint32_t image_start;
 296       uint32_t shader_time_start;
 297       /** @} */
 298    } binding_table;
 299
        /* NOTE(review): the nr_* counters presumably give the lengths of the
         * param, pull_param and image_param arrays below -- confirm.
         */
 300    GLuint nr_params;       /**< number of float params/constants */
 301    GLuint nr_pull_params;
 302    unsigned nr_image_params;
 303
 304    unsigned curb_read_length;
 305    unsigned total_scratch;
 306
 307    /**
 308     * Register where the thread expects to find input data from the URB
 309     * (typically uniforms, followed by vertex or fragment attributes).
 310     */
 311    unsigned dispatch_grf_start_reg;
 312
 313    bool use_alt_mode; /**< Use ALT floating point mode?  Otherwise, IEEE. */
 314
 315    /* Pointers to tracked values (only valid once
 316     * _mesa_load_state_parameters has been called at runtime).
 317     */
 318    const union gl_constant_value **param;
 319    const union gl_constant_value **pull_param;
 320
 321    /** Image metadata passed to the shader as uniforms. */
 322    struct brw_image_param *image_param;
 323 };
 324
 325 /* Data about a particular attempt to compile a program.  Note that
 326  * there can be many of these, each in a different GL state
 327  * corresponding to a different brw_wm_prog_key struct, with different
 328  * compiled programs.
 329  */
 330 struct brw_wm_prog_data {
        /** Stage-independent results; kept as the first member. */
 331    struct brw_stage_prog_data base;
 332
 333    GLuint num_varying_inputs;
 334
        /* NOTE(review): the *_16 members presumably describe the SIMD16
         * variant of the program, alongside the un-suffixed values -- confirm.
         */
 335    GLuint dispatch_grf_start_reg_16;
 336    GLuint reg_blocks;
 337    GLuint reg_blocks_16;
 338
 339    struct {
 340       /** @{
 341        * surface indices for the WM-specific surfaces
 342        */
 343       uint32_t render_target_start;
 344       /** @} */
 345    } binding_table;
 346
 347    uint8_t computed_depth_mode;
 348    bool computed_stencil;
 349
 350    bool early_fragment_tests;
 351    bool no_8;
 352    bool dual_src_blend;
 353    bool uses_pos_offset;
 354    bool uses_omask;
 355    bool uses_kill;
 356    bool pulls_bary;
 357    uint32_t prog_offset_16;
 358
 359    /**
 360     * Mask of which interpolation modes are required by the fragment shader.
 361     * Used in hardware setup on gen6+.
 362     */
 363    uint32_t barycentric_interp_modes;
 364
 365    /**
 366     * Map from gl_varying_slot to the position within the FS setup data
 367     * payload where the varying's attribute vertex deltas should be delivered.
 368     * For varying slots that are not used by the FS, the value is -1.
 369     */
 370    int urb_setup[VARYING_SLOT_MAX];
 371 };
 372
     /** Results of compiling a compute shader (see brw_compile_cs()). */
 373 struct brw_cs_prog_data {
        /** Stage-independent results; kept as the first member. */
 374    struct brw_stage_prog_data base;
 375
 376    GLuint dispatch_grf_start_reg_16;
        /** Three-component local size (NOTE(review): presumably X, Y, Z). */
 377    unsigned local_size[3];
 378    unsigned simd_size;
 379    bool uses_barrier;
 380    bool uses_num_work_groups;
 381    unsigned local_invocation_id_regs;
 382
 383    struct {
 384       /** @{
 385        * surface indices for the CS-specific surfaces
 386        */
 387       uint32_t work_groups_start;
 388       /** @} */
 389    } binding_table;
 390 };
 391
 392 /**
 393  * Enum representing the i965-specific vertex results that don't correspond
 394  * exactly to any element of gl_varying_slot.  The values of this enum are
 395  * assigned such that they don't conflict with gl_varying_slot: the first
      * enumerator starts at VARYING_SLOT_MAX.
      *
      * BRW_VARYING_SLOT_COUNT is therefore the size of the combined value
      * space, and is used to dimension the lookup tables in struct brw_vue_map.
 396  */
 397 typedef enum
 398 {
 399    BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX,
 400    BRW_VARYING_SLOT_PAD,
 401    /**
 402     * Technically this is not a varying but just a placeholder that
 403     * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord
 404     * builtin variable to be compiled correctly. see compile_sf_prog() for
 405     * more info.
 406     */
 407    BRW_VARYING_SLOT_PNTC,
 408    BRW_VARYING_SLOT_COUNT
 409 } brw_varying_slot;
 410
 411 /**
 412  * Data structure recording the relationship between the gl_varying_slot enum
 413  * and "slots" within the vertex URB entry (VUE).  A "slot" is defined as a
 414  * single octaword within the VUE (128 bits).
 415  *
 416  * Note that each BRW register contains 256 bits (2 octawords), so when
 417  * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two
 418  * consecutive VUE slots.  When accessing the VUE in URB_INTERLEAVED mode (as
 419  * in a vertex shader), each register corresponds to a single VUE slot, since
 420  * it contains data for two separate vertices.
 421  */
 422 struct brw_vue_map {
 423    /**
 424     * Bitfield representing all varying slots that are (a) stored in this VUE
 425     * map, and (b) actually written by the shader.  Does not include any of
 426     * the additional varying slots defined in brw_varying_slot.
 427     */
 428    GLbitfield64 slots_valid;
 429
 430    /**
 431     * Is this VUE map for a separate shader pipeline?
 432     *
 433     * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
 434     * without the linker having a chance to dead code eliminate unused varyings.
 435     *
 436     * This means that we have to use a fixed slot layout, based on the output's
 437     * location field, rather than assigning slots in a compact contiguous block.
 438     */
 439    bool separate;
 440
 441    /**
 442     * Map from gl_varying_slot value to VUE slot.  For gl_varying_slots that are
 443     * not stored in a slot (because they are not written, or because
 444     * additional processing is applied before storing them in the VUE), the
 445     * value is -1.
         *
         * Callers must check for -1 before treating an entry as a slot number;
         * brw_varying_to_offset() does not.
 446     */
 447    signed char varying_to_slot[BRW_VARYING_SLOT_COUNT];
 448
 449    /**
 450     * Map from VUE slot to gl_varying_slot value.  For slots that do not
 451     * directly correspond to a gl_varying_slot, the value comes from
 452     * brw_varying_slot.
 453     *
 454     * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD.
 455     */
 456    signed char slot_to_varying[BRW_VARYING_SLOT_COUNT];
 457
 458    /**
 459     * Total number of VUE slots in use
 460     */
 461    int num_slots;
 462 };
 463
     /**
      * Print vue_map to the stream fp.
      *
      * NOTE(review): the output format is defined by the implementation, which
      * is not visible here; presumably intended for debugging.
      */
 464 void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map);
 465
 466 /**
 467  * Byte offset, from the start of the VUE, of the given VUE slot.
      *
      * A slot is one octaword (128 bits), so this is the slot number times 16.
 468  */
 469 static inline GLuint brw_vue_slot_to_offset(GLuint slot)
 470 {
        enum { BYTES_PER_VUE_SLOT = 16 };
 471    return slot * BYTES_PER_VUE_SLOT;
 472 }
 473
 474 /**
 475  * Byte offset, from the start of the VUE, of the slot that vue_map assigns
 476  * to the given vertex output (a gl_varying_slot or brw_varying_slot value).
      *
      * NOTE(review): varying_to_slot[] holds -1 for outputs that have no slot,
      * and that value is converted to GLuint here without a check; callers are
      * presumably expected to pass only outputs present in the map -- confirm.
 477  */
 478 static inline
 479 GLuint brw_varying_to_offset(const struct brw_vue_map *vue_map, GLuint varying)
 480 {
        const GLuint slot = vue_map->varying_to_slot[varying];
 481    return brw_vue_slot_to_offset(slot);
 482 }
 483
     /**
      * Fill in *vue_map for the given device.
      *
      * NOTE(review): slots_valid and separate_shader presumably correspond to
      * brw_vue_map::slots_valid and brw_vue_map::separate; confirm against the
      * implementation, which is not visible here.
      */
 484 void brw_compute_vue_map(const struct brw_device_info *devinfo,
 485                          struct brw_vue_map *vue_map,
 486                          GLbitfield64 slots_valid,
 487                          bool separate_shader);
 488
     /**
      * Thread dispatch mode of a compiled program; recorded in
      * brw_vue_prog_data::dispatch_mode.
      *
      * NOTE(review): the explicit values 0-3 presumably match a hardware
      * encoding and should not be renumbered -- confirm.
      */
 489 enum shader_dispatch_mode {
 490    DISPATCH_MODE_4X1_SINGLE = 0,
 491    DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
 492    DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
 493    DISPATCH_MODE_SIMD8 = 3,
 494 };
 495
     /**
      * Compile results shared by the stages that carry a VUE map.  Embedded as
      * the first member ("base") of brw_vs_prog_data and brw_gs_prog_data.
      */
 496 struct brw_vue_prog_data {
        /** Stage-independent results; kept as the first member. */
 497    struct brw_stage_prog_data base;
        /** Layout of the VUE used by this program. */
 498    struct brw_vue_map vue_map;
 499
 500    /** Should the hardware deliver input VUE handles for URB pull loads? */
 501    bool include_vue_handles;
 502
 503    GLuint urb_read_length;
 504    GLuint total_grf;
 505
 506    /* Used for calculating urb partitions.  In the VS, this is the size of the
 507     * URB entry used for both input and output to the thread.  In the GS, this
 508     * is the size of the URB entry used for output.
 509     */
 510    GLuint urb_entry_size;
 511
 512    enum shader_dispatch_mode dispatch_mode;
 513 };
 514
     /** Results of compiling a vertex shader (see brw_compile_vs()). */
 515 struct brw_vs_prog_data {
        /** VUE-stage results; kept as the first member. */
 516    struct brw_vue_prog_data base;
 517
        /* NOTE(review): presumably a bitfield of the vertex attributes the
         * shader reads -- confirm.
         */
 518    GLbitfield64 inputs_read;
 519
 520    unsigned nr_attributes;
 521
 522    bool uses_vertexid;
 523    bool uses_instanceid;
 524 };
 525
     /** Results of compiling a geometry shader (see brw_compile_gs()). */
 526 struct brw_gs_prog_data
 527 {
        /** VUE-stage results; kept as the first member. */
 528    struct brw_vue_prog_data base;
 529
 530    /**
 531     * Size of an output vertex, measured in HWORDS (32 bytes).
 532     */
 533    unsigned output_vertex_size_hwords;
 534
 535    unsigned output_topology;
 536
 537    /**
 538     * Size of the control data (cut bits or StreamID bits), in hwords (32
 539     * bytes).  0 if there is no control data.
 540     */
 541    unsigned control_data_header_size_hwords;
 542
 543    /**
 544     * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
 545     * if the control data is StreamID bits, or
 546     * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits).
 547     * Ignored if control_data_header_size_hwords is 0.
 548     */
 549    unsigned control_data_format;
 550
 551    bool include_primitive_id;
 552
 553    /**
 554     * The number of vertices emitted, if constant - otherwise -1.
 555     */
 556    int static_vertex_count;
 557
 558    int invocations;
 559
 560    /**
 561     * Gen6 transform feedback enabled flag.
 562     */
 563    bool gen6_xfb_enabled;
 564
 565    /**
 566     * Gen6: Provoking vertex convention for odd-numbered triangles
 567     * in tristrips.
 568     */
 569    GLuint pv_first:1;
 570
 571    /**
 572     * Gen6: Number of varyings that are output to transform feedback.
         * The 7-bit field is wide enough for the documented maximum of 64.
 573     */
 574    GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */
 575
 576    /**
 577     * Gen6: Map from the index of a transform feedback binding table entry to the
 578     * gl_varying_slot that should be streamed out through that binding table
 579     * entry.
 580     */
 581    unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */];
 582
 583    /**
 584     * Gen6: Map from the index of a transform feedback binding table entry to the
 585     * swizzles that should be used when streaming out data through that
 586     * binding table entry.
 587     */
 588    unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */];
 589 };
 590
 591
 592 /** @} */
 593
 594 /**
 595  * Compile a vertex shader.
 596  *
 597  * Returns a pointer to the final assembly; the program's size is stored in
      * *final_assembly_size.
      *
      * NOTE(review): presumably returns NULL and points *error_str at a message
      * when compilation fails, and allocates its results out of mem_ctx --
      * confirm against the implementation.
 598  */
 599 const unsigned *
 600 brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
 601                void *mem_ctx,
 602                const struct brw_vs_prog_key *key,
 603                struct brw_vs_prog_data *prog_data,
 604                const struct nir_shader *shader,
 605                gl_clip_plane *clip_planes,
 606                bool use_legacy_snorm_formula,
 607                int shader_time_index,
 608                unsigned *final_assembly_size,
 609                char **error_str);
 610
 611 /**
 612  * Compile a geometry shader.
 613  *
 614  * Returns a pointer to the final assembly; the program's size is stored in
      * *final_assembly_size.
      *
      * NOTE(review): presumably returns NULL and points *error_str at a message
      * when compilation fails, and allocates its results out of mem_ctx --
      * confirm against the implementation.
 615  */
 616 const unsigned *
 617 brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
 618                void *mem_ctx,
 619                const struct brw_gs_prog_key *key,
 620                struct brw_gs_prog_data *prog_data,
 621                const struct nir_shader *shader,
 622                struct gl_shader_program *shader_prog,
 623                int shader_time_index,
 624                unsigned *final_assembly_size,
 625                char **error_str);
 626
 627 /**
 628  * Compile a fragment shader.
 629  *
 630  * Returns a pointer to the final assembly; the program's size is stored in
      * *final_assembly_size.
      *
      * Unlike the other stages, two shader-time indices are taken
      * (shader_time_index8 and shader_time_index16).
      *
      * NOTE(review): presumably returns NULL and points *error_str at a message
      * when compilation fails, and allocates its results out of mem_ctx --
      * confirm against the implementation.
 631  */
 632 const unsigned *
 633 brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
 634                void *mem_ctx,
 635                const struct brw_wm_prog_key *key,
 636                struct brw_wm_prog_data *prog_data,
 637                const struct nir_shader *shader,
 638                struct gl_program *prog,
 639                int shader_time_index8,
 640                int shader_time_index16,
 641                bool use_rep_send,
 642                unsigned *final_assembly_size,
 643                char **error_str);
 644
 645 /**
 646  * Compile a compute shader.
 647  *
 648  * Returns a pointer to the final assembly; the program's size is stored in
      * *final_assembly_size.
      *
      * NOTE(review): presumably returns NULL and points *error_str at a message
      * when compilation fails, and allocates its results out of mem_ctx --
      * confirm against the implementation.
 649  */
 650 const unsigned *
 651 brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
 652                void *mem_ctx,
 653                const struct brw_cs_prog_key *key,
 654                struct brw_cs_prog_data *prog_data,
 655                const struct nir_shader *shader,
 656                int shader_time_index,
 657                unsigned *final_assembly_size,
 658                char **error_str);
 659
 660 #ifdef __cplusplus
 661 } /* extern "C" */
 662 #endif