src/gallium/drivers/ilo/core/ilo_state_compute.c
/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 2012-2015 LunarG, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chia-I Wu <olv@lunarg.com>
 */

#include "ilo_debug.h"
#include "ilo_state_compute.h"

struct compute_urb_configuration {
   int idrt_entry_count;
   int curbe_entry_count;

   int urb_entry_count;
   /* in 256-bit register increments */
   int urb_entry_size;
};

static int
get_gen6_rob_entry_count(const struct ilo_dev *dev)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 60:
    *
    *     "ROB has 64KB of storage; 2048 entries."
    *
    * From the valid ranges of "CURBE Allocation Size", we can also conclude
    * that interface entries and CURBE data must be in ROB. And that ROB
    * should be 16KB, or 512 entries, on Gen7 GT1.
    */
   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
      return 2048;
   else if (ilo_dev_gen(dev) >= ILO_GEN(7))
      return (dev->gt == 2) ? 2048 : 512;
   else
      return (dev->gt == 2) ? 2048 : 1024;
}

static int
get_gen6_idrt_entry_count(const struct ilo_dev *dev)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 21:
    *
    *     "The first 32 URB entries are reserved for the interface
    *      descriptor..."
    *
    * From the Haswell PRM, volume 7, page 836:
    *
    *     "The first 64 URB entries are reserved for the interface
    *      description..."
    */
   return (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 64 : 32;
}

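/*
 * Convert the CURBE allocation size from bytes to 256-bit (32-byte) URB
 * entries, rounding up.  Worked example (value chosen for illustration): a
 * curbe_size of 224 bytes gives (224 + 31) / 32 = 7 entries.
 */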
static int
get_gen6_curbe_entry_count(const struct ilo_dev *dev, uint32_t curbe_size)
{
   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 21:
    *
    *     "(CURBE Allocation Size) Specifies the total length allocated for
    *      CURBE, in 256-bit register increments."
    */
   const int entry_count = (curbe_size + 31) / 32;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(get_gen6_idrt_entry_count(dev) + entry_count <=
          get_gen6_rob_entry_count(dev));

   return entry_count;
}

static bool
compute_get_gen6_urb_configuration(const struct ilo_dev *dev,
                                   const struct ilo_state_compute_info *info,
                                   struct compute_urb_configuration *urb)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   urb->idrt_entry_count = get_gen6_idrt_entry_count(dev);
   urb->curbe_entry_count =
      get_gen6_curbe_entry_count(dev, info->curbe_alloc_size);

   /*
    * From the Broadwell PRM, volume 2b, page 451:
    *
    *     "Please note that 0 is not allowed for this field (Number of URB
    *      Entries)."
    */
   urb->urb_entry_count = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 1 : 0;

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 52:
    *
    *     "(URB Entry Allocation Size) Specifies the length of each URB entry
    *      used by the unit, in 256-bit register increments - 1."
    */
   urb->urb_entry_size = 1;

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 22:
    *
    *     "MEDIA_VFE_STATE specifies the amount of CURBE space, the URB handle
    *      size and the number of URB handles. The driver must ensure that
    *      ((URB_handle_size * URB_num_handle) - CURBE - 32) <=
    *      URB_allocation_in_L3."
    */
   assert(urb->idrt_entry_count + urb->curbe_entry_count +
          urb->urb_entry_count * urb->urb_entry_size <=
          info->cv_urb_alloc_size / 32);

   return true;
}

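/*
 * Return the last CURBE entry (in 256-bit units) read by this interface,
 * counting both cross-thread and per-thread constant data.  Illustrative
 * example (numbers are hypothetical): with curbe_read_offset 0, one
 * cross-thread entry, two per-thread entries, and a thread group size of 8,
 * the read end is 0 + 1 + 2 * 8 = 17 entries.
 */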
static int
compute_interface_get_gen6_read_end(const struct ilo_dev *dev,
                                    const struct ilo_state_compute_interface_info *interface)
{
   const int per_thread_read = (interface->curbe_read_length + 31) / 32;
   const int cross_thread_read =
      (interface->cross_thread_curbe_read_length + 31) / 32;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(interface->curbe_read_offset % 32 == 0);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 60:
    *
    *     "(Constant URB Entry Read Length) [0,63]"
    */
   assert(per_thread_read <= 63);

   /*
    * From the Haswell PRM, volume 2d, page 199:
    *
    *     "(Cross-Thread Constant Data Read Length) [0,127]"
    */
   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
      assert(cross_thread_read <= 127);
   else
      assert(!cross_thread_read);

   if (per_thread_read || cross_thread_read) {
      return interface->curbe_read_offset / 32 + cross_thread_read +
         per_thread_read * interface->thread_group_size;
   } else {
      return 0;
   }
}

static bool
compute_validate_gen6(const struct ilo_dev *dev,
                      const struct ilo_state_compute_info *info,
                      const struct compute_urb_configuration *urb)
{
   int min_curbe_entry_count;
   uint8_t i;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(info->interface_count <= urb->idrt_entry_count);

   min_curbe_entry_count = 0;
   for (i = 0; i < info->interface_count; i++) {
      const int read_end =
         compute_interface_get_gen6_read_end(dev, &info->interfaces[i]);

      if (min_curbe_entry_count < read_end)
         min_curbe_entry_count = read_end;
   }

   assert(min_curbe_entry_count <= urb->curbe_entry_count);

   /*
    * From the Broadwell PRM, volume 2b, page 452:
    *
    *     "CURBE Allocation Size should be 0 for GPGPU workloads that uses
    *      indirect instead of CURBE."
    */
   if (!min_curbe_entry_count)
      assert(!urb->curbe_entry_count);

   return true;
}

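/*
 * Encode the per-thread scratch size for Gen6/Gen7, where the field counts
 * 1KB steps starting at 1KB.  Worked example (illustrative value): a request
 * of 5000 bytes gives (5000 - 1) / 1024 = 4, which selects
 * 1024 * (1 + 4) = 5KB of scratch per thread.
 */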
static uint32_t
compute_get_gen6_per_thread_scratch_size(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_info *info,
                                         uint8_t *per_thread_space)
{
   ILO_DEV_ASSERT(dev, 6, 7);

   /*
    * From the Sandy Bridge PRM, volume 2 part 2, page 30:
    *
    *     "(Per Thread Scratch Space)
    *      Range = [0,11] indicating [1k bytes, 12k bytes] [DevSNB]"
    */
   assert(info->per_thread_scratch_size <= 12 * 1024);

   if (!info->per_thread_scratch_size) {
      *per_thread_space = 0;
      return 0;
   }

   *per_thread_space = (info->per_thread_scratch_size > 1024) ?
      (info->per_thread_scratch_size - 1) / 1024 : 0;

   return 1024 * (1 + *per_thread_space);
}

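/*
 * Encode the per-thread scratch size for Gen7.5+, where the field selects a
 * power-of-two size.  Worked example (illustrative value): a request of 12KB
 * gives util_last_bit(12288 - 1) - 10 = 4, for which the function returns
 * 1 << (10 + 4) = 16KB of scratch per thread.
 */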
static uint32_t
compute_get_gen75_per_thread_scratch_size(const struct ilo_dev *dev,
                                          const struct ilo_state_compute_info *info,
                                          uint8_t *per_thread_space)
{
   ILO_DEV_ASSERT(dev, 7.5, 8);

   /*
    * From the Haswell PRM, volume 2b, page 407:
    *
    *     "(Per Thread Scratch Space)
    *      [0,10] Indicating [2k bytes, 2 Mbytes]"
    *
    *     "Note: The scratch space should be declared as 2x the desired
    *      scratch space. The stack will start at the half-way point instead
    *      of the end. The upper half of scratch space will not be accessed
    *      and so does not have to be allocated in memory."
    *
    * From the Broadwell PRM, volume 2a, page 450:
    *
    *     "(Per Thread Scratch Space)
    *      [0,11] indicating [1k bytes, 2 Mbytes]"
    */
   assert(info->per_thread_scratch_size <=
          ((ilo_dev_gen(dev) >= ILO_GEN(8)) ? 2 : 1) * 1024 * 1024);

   if (!info->per_thread_scratch_size) {
      *per_thread_space = 0;
      return 0;
   }

   /* next power of two, starting from 1KB */
   *per_thread_space = (info->per_thread_scratch_size > 1024) ?
      (util_last_bit(info->per_thread_scratch_size - 1) - 10) : 0;

   return 1 << (10 + *per_thread_space);
}

static bool
compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
                                 const struct ilo_dev *dev,
                                 const struct ilo_state_compute_info *info)
{
   struct compute_urb_configuration urb;
   uint32_t per_thread_size;
   uint8_t per_thread_space;

   uint32_t dw1, dw2, dw4;

   ILO_DEV_ASSERT(dev, 6, 8);

   if (!compute_get_gen6_urb_configuration(dev, info, &urb) ||
       !compute_validate_gen6(dev, info, &urb))
      return false;

   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
      per_thread_size = compute_get_gen75_per_thread_scratch_size(dev,
            info, &per_thread_space);
   } else {
      per_thread_size = compute_get_gen6_per_thread_scratch_size(dev,
            info, &per_thread_space);
   }

   dw1 = per_thread_space << GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT;

   dw2 = (dev->thread_count - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT |
         urb.urb_entry_count << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT |
         GEN6_VFE_DW2_RESET_GATEWAY_TIMER |
         GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL;

   if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5))
      dw2 |= GEN7_VFE_DW2_GPGPU_MODE;

   assert(urb.urb_entry_size);

   dw4 = (urb.urb_entry_size - 1) << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT |
         urb.curbe_entry_count << GEN6_VFE_DW4_CURBE_SIZE__SHIFT;

   STATIC_ASSERT(ARRAY_SIZE(compute->vfe) >= 3);
   compute->vfe[0] = dw1;
   compute->vfe[1] = dw2;
   compute->vfe[2] = dw4;

   compute->scratch_size = per_thread_size * dev->thread_count;

   return true;
}

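/*
 * Pack the sampler count into the coarse encoding used by the interface
 * descriptor: samplers are counted in groups of four (0 = none, 1 = 1-4,
 * 2 = 5-8, 3 = 9-12, 4 = 13-16), and larger counts are clamped to 4.
 */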
static uint8_t
compute_interface_get_gen6_sampler_count(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 6, 8);
   return (interface->sampler_count <= 12) ?
      (interface->sampler_count + 3) / 4 : 4;
}

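/*
 * Clamp the binding table size to what the interface descriptor field can
 * hold: up to 31 surfaces are reported as-is, anything larger is clamped.
 */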
static uint8_t
compute_interface_get_gen6_surface_count(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 6, 8);
   return (interface->surface_count <= 31) ? interface->surface_count : 31;
}

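/*
 * Convert the SLM size to a number of 4KB blocks, rounded up to a power of
 * two as the hardware requires.  Worked example (illustrative value): a
 * 20KB request occupies 5 blocks, which rounds up to 8 blocks (32KB).
 */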
static uint8_t
compute_interface_get_gen7_slm_size(const struct ilo_dev *dev,
                                    const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 7, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 61:
    *
    *     "The amount is specified in 4k blocks, but only powers of 2 are
    *      allowed: 0, 4k, 8k, 16k, 32k and 64k per half-slice."
    */
   assert(interface->slm_size <= 64 * 1024);

   return util_next_power_of_two((interface->slm_size + 4095) / 4096);
}

static bool
compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_state_compute *compute,
                                           const struct ilo_dev *dev,
                                           const struct ilo_state_compute_info *info)
{
   uint8_t i;

   ILO_DEV_ASSERT(dev, 6, 8);

   for (i = 0; i < info->interface_count; i++) {
      const struct ilo_state_compute_interface_info *interface =
         &info->interfaces[i];
      uint16_t read_offset, per_thread_read_len, cross_thread_read_len;
      uint8_t sampler_count, surface_count;
      uint32_t dw0, dw2, dw3, dw4, dw5, dw6;

      assert(interface->kernel_offset % 64 == 0);
      assert(interface->thread_group_size);

      read_offset = interface->curbe_read_offset / 32;
      per_thread_read_len = (interface->curbe_read_length + 31) / 32;
      cross_thread_read_len =
         (interface->cross_thread_curbe_read_length + 31) / 32;

      sampler_count =
         compute_interface_get_gen6_sampler_count(dev, interface);
      surface_count =
         compute_interface_get_gen6_surface_count(dev, interface);

      dw0 = interface->kernel_offset;
      dw2 = sampler_count << GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT;
      dw3 = surface_count << GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT;
      dw4 = per_thread_read_len << GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT |
            read_offset << GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT;

      dw5 = 0;
      dw6 = 0;
      if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
         const uint8_t slm_size =
            compute_interface_get_gen7_slm_size(dev, interface);

         dw5 |= GEN7_IDRT_DW5_ROUNDING_MODE_RTNE;

         if (slm_size) {
            dw5 |= GEN7_IDRT_DW5_BARRIER_ENABLE |
                   slm_size << GEN7_IDRT_DW5_SLM_SIZE__SHIFT;
         }

         /*
          * From the Haswell PRM, volume 2d, page 199:
          *
          *     "(Number of Threads in GPGPU Thread Group) Specifies the
          *      number of threads that are in this thread group. Used to
          *      program the barrier for the number of messages to expect. The
          *      minimum value is 0 (which will disable the barrier), while
          *      the maximum value is the number of threads in a subslice for
          *      local barriers."
          *
          * From the Broadwell PRM, volume 2d, page 183:
          *
          *     "(Number of Threads in GPGPU Thread Group) Specifies the
          *      number of threads that are in this thread group. The minimum
          *      value is 1, while the maximum value is the number of threads
          *      in a subslice for local barriers. See vol1b Configurations
          *      for the number of threads per subslice for different
          *      products. The maximum value for global barriers is limited
          *      by the number of threads in the system, or by 511, whichever
          *      is lower. This field should not be set to 0 even if the
          *      barrier is disabled, since an accurate value is needed for
          *      proper pre-emption."
          */
         if (slm_size || ilo_dev_gen(dev) >= ILO_GEN(8)) {
            dw5 |= interface->thread_group_size <<
               GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT;
         }

         if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
            dw6 |= cross_thread_read_len <<
               GEN75_IDRT_DW6_CROSS_THREAD_CURBE_READ_LEN__SHIFT;
         }
      }

      STATIC_ASSERT(ARRAY_SIZE(compute->idrt[i]) >= 6);
      compute->idrt[i][0] = dw0;
      compute->idrt[i][1] = dw2;
      compute->idrt[i][2] = dw3;
      compute->idrt[i][3] = dw4;
      compute->idrt[i][4] = dw5;
      compute->idrt[i][5] = dw6;
   }

   return true;
}

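/*
 * Validate the info and pack the MEDIA_VFE_STATE and
 * INTERFACE_DESCRIPTOR_DATA payloads into the compute state.  A rough caller
 * sketch (illustrative only; the local names and values below are
 * hypothetical, not taken from an actual caller):
 *
 *    struct ilo_state_compute compute;
 *    struct ilo_state_compute_info info;
 *
 *    memset(&compute, 0, sizeof(compute));
 *    memset(&info, 0, sizeof(info));
 *
 *    info.data_size = ilo_state_compute_data_size(dev, interface_count);
 *    info.data = calloc(1, info.data_size);
 *    info.interfaces = interfaces;
 *    info.interface_count = interface_count;
 *    info.cv_urb_alloc_size = cv_urb_alloc_size;
 *    info.curbe_alloc_size = curbe_alloc_size;
 *    info.per_thread_scratch_size = per_thread_scratch_size;
 *
 *    if (!ilo_state_compute_init(&compute, dev, &info))
 *       return false;
 */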
bool
ilo_state_compute_init(struct ilo_state_compute *compute,
                       const struct ilo_dev *dev,
                       const struct ilo_state_compute_info *info)
{
   bool ret = true;

   assert(ilo_is_zeroed(compute, sizeof(*compute)));
   assert(ilo_is_zeroed(info->data, info->data_size));

   assert(ilo_state_compute_data_size(dev, info->interface_count) <=
          info->data_size);
   compute->idrt = (uint32_t (*)[6]) info->data;

   ret &= compute_set_gen6_MEDIA_VFE_STATE(compute, dev, info);
   ret &= compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(compute, dev, info);

   assert(ret);

   return ret;
}