struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
 
    const uint32_t group_size = grid->block[0] * grid->block[1] * grid->block[2];
-   const unsigned threads = DIV_ROUND_UP(group_size, cs_prog_data->simd_size);
+   const unsigned simd_size =
+      brw_cs_simd_size_for_group_size(devinfo, cs_prog_data, group_size);
+   const unsigned threads = DIV_ROUND_UP(group_size, simd_size);
 
    /* Always pin the binder.  If we're emitting new binding table pointers,
     * we need it.  If not, we're probably inheriting old tables via the
       uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
 
       iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
-         idd.KernelStartPointer = KSP(shader);
+         idd.KernelStartPointer =
+            KSP(shader) + brw_cs_prog_data_prog_offset(cs_prog_data, simd_size);
          idd.SamplerStatePointer = shs->sampler_table.offset;
          idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
          idd.NumberofThreadsinGPGPUThreadGroup = threads;
       }
    }
 
-   uint32_t remainder = group_size & (cs_prog_data->simd_size - 1);
+   uint32_t remainder = group_size & (simd_size - 1);
    uint32_t right_mask;
 
    if (remainder > 0)
       right_mask = ~0u >> (32 - remainder);
    else
-      right_mask = ~0u >> (32 - cs_prog_data->simd_size);
+      right_mask = ~0u >> (32 - simd_size);
 
 #define GPGPU_DISPATCHDIMX 0x2500
 #define GPGPU_DISPATCHDIMY 0x2504
 
    iris_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
       ggw.IndirectParameterEnable    = grid->indirect != NULL;
-      ggw.SIMDSize                   = cs_prog_data->simd_size / 16;
+      ggw.SIMDSize                   = simd_size / 16;
       ggw.ThreadDepthCounterMaximum  = 0;
       ggw.ThreadHeightCounterMaximum = 0;
       ggw.ThreadWidthCounterMaximum  = threads - 1;