libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
/* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
   vector_length. */

/* See also '../libgomp.oacc-fortran/parallel-dims.f90'. */

#include <limits.h>
#include <openacc.h>
#include <gomp-constants.h>

/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
   not behaving as expected for -O0. */
#pragma acc routine seq
static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
{
  if (acc_on_device ((int) acc_device_host))
    return 0;
  else if (acc_on_device ((int) acc_device_nvidia)
           || acc_on_device ((int) acc_device_radeon))
    return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
  else
    __builtin_abort ();
}

#pragma acc routine seq
static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
{
  if (acc_on_device ((int) acc_device_host))
    return 0;
  else if (acc_on_device ((int) acc_device_nvidia)
           || acc_on_device ((int) acc_device_radeon))
    return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
  else
    __builtin_abort ();
}

#pragma acc routine seq
static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
{
  if (acc_on_device ((int) acc_device_host))
    return 0;
  else if (acc_on_device ((int) acc_device_nvidia)
           || acc_on_device ((int) acc_device_radeon))
    return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
  else
    __builtin_abort ();
}
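
/* Note: each helper above is an OpenACC 'routine seq' that returns the
   zero-based index of the calling gang/worker/vector via
   __builtin_goacc_parlevel_id, and simply returns 0 on the host device.
   The 'optimize ("O2")' attribute (see the TODO above) is presumably there
   so that the acc_on_device calls can still be resolved at compile time
   when the test itself is built at -O0. */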


int main ()
{
  acc_init (acc_device_default);

  /* OpenACC parallel construct. */

  /* Non-positive value. */

  /* GR, WS, VS. */
  {
#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
    int gangs_actual = GANGS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with num_gangs (1). */
      gangs_actual = 1;
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
        {
          /* <https://gcc.gnu.org/PR80547>. */
#if 0
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
#else
          int gangs = acc_gang ();
          gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
          gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
          int workers = acc_worker ();
          workers_min = (workers_min < workers) ? workers_min : workers;
          workers_max = (workers_max > workers) ? workers_max : workers;
          int vectors = acc_vector ();
          vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
          vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
#endif
        }
    }
    if (gangs_actual != 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
        || workers_min != 0 || workers_max != 0
        || vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef GANGS
  }
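
  /* The blocks below all follow the same pattern: the min/max trackers start
     out at INT_MAX/INT_MIN, every iteration of the offloaded loop clamps them
     against the observed gang/worker/vector indices, and the host code then
     verifies that exactly the indices 0 .. N-1 were seen for the partitioned
     dimension and only index 0 for the other two.  The clamping step, shown
     here for the gang index only, is simply:

       int gangs = acc_gang ();
       gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
       gangs_max = (gangs_max > gangs) ? gangs_max : gangs;

     In most blocks this is expressed more compactly through OpenACC
     'min'/'max' reduction clauses instead of explicit conditionals. */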

  /* GP, WS, VS. */
  {
#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
    int gangs_actual = GANGS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with num_gangs (1). */
      gangs_actual = 1;
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (gangs_actual != 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
        || workers_min != 0 || workers_max != 0
        || vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef GANGS
  }

  /* GR, WP, VS. */
  {
#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
    int workers_actual = WORKERS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (workers_actual) \
  num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with num_workers (1). */
      workers_actual = 1;
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (workers_actual != 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
        || workers_min != 0 || workers_max != workers_actual - 1
        || vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef WORKERS
  }

  /* GR, WS, VP. */
  {
#define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
    int vectors_actual = VECTORS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } } */ \
  vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with vector_length (1), except that the GCC
         nvptx back end enforces vector_length (32). */
      if (acc_on_device (acc_device_nvidia))
        vectors_actual = 32;
      else
        vectors_actual = 1;
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (acc_get_device_type () == acc_device_nvidia)
      {
        if (vectors_actual != 32)
          __builtin_abort ();
      }
    else
      if (vectors_actual != 1)
        __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
        || workers_min != 0 || workers_max != 0
        || vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
#undef VECTORS
  }
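
  /* In all four "non-positive value" blocks above, a clause argument of 0 is
     diagnosed at compile time ("value must be positive") and the region is
     then launched with that dimension set to 1.  The duplicated dg-warning
     directives with '{ target c }' vs. '{ target c++ }' are presumably
     needed because the C and C++ front ends report the warning at different
     source lines, which is why one directive sits on the '#define' line and
     the other on the directive itself. */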


  /* High value. */

  /* GR, WS, VS. */
  {
    /* There is no actual limit for the number of gangs, so we try with a
       rather high value. */
    int gangs = 12345;
    int gangs_actual = gangs;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
  num_gangs (gangs)
    {
      if (acc_on_device (acc_device_host))
        {
          /* We're actually executing with num_gangs (1). */
          gangs_actual = 1;
        }
      /* As we're executing GR not GP, don't multiply by a "gangs_actual"
         factor. */
      for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (gangs_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
        || workers_min != 0 || workers_max != 0
        || vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
  }

  /* GP, WS, VS. */
  {
    /* There is no actual limit for the number of gangs, so we try with a
       rather high value. */
    int gangs = 12345;
    int gangs_actual = gangs;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  num_gangs (gangs)
    {
      if (acc_on_device (acc_device_host))
        {
          /* We're actually executing with num_gangs (1). */
          gangs_actual = 1;
        }
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (gangs_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
        || workers_min != 0 || workers_max != 0
        || vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
  }

  /* GR, WP, VS. */
  {
    /* We try with an outrageously large value. */
#define WORKERS 2 << 20
    int workers_actual = WORKERS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \
  num_workers (WORKERS)
    {
      if (acc_on_device (acc_device_host))
        {
          /* We're actually executing with num_workers (1). */
          workers_actual = 1;
        }
      else if (acc_on_device (acc_device_nvidia))
        {
          /* The GCC nvptx back end enforces num_workers (32). */
          workers_actual = 32;
        }
      else if (acc_on_device (acc_device_radeon))
        {
          /* The GCC GCN back end is limited to num_workers (16).
             Temporarily set this to 1 until multiple workers are permitted. */
          workers_actual = 1; // 16;
        }
      else
        __builtin_abort ();
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (workers_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
        || workers_min != 0 || workers_max != workers_actual - 1
        || vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef WORKERS
  }

  /* GR, WP, VS. */
  {
    /* We try with an outrageously large value. */
    int workers = 2 << 20;
    /* For nvptx offloading, this one will not result in "using num_workers
       (32), ignoring runtime setting", and will in fact try to launch with
       "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
       error: invalid argument". So, limit ourselves here. */
    if (acc_get_device_type () == acc_device_nvidia)
      workers = 32;
    int workers_actual = workers;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (workers_actual) \
  num_workers (workers)
    {
      if (acc_on_device (acc_device_host))
        {
          /* We're actually executing with num_workers (1). */
          workers_actual = 1;
        }
      else if (acc_on_device (acc_device_nvidia))
        {
          /* We're actually executing with num_workers (32). */
          /* workers_actual = 32; */
        }
      else if (acc_on_device (acc_device_radeon))
        {
          /* The GCC GCN back end is limited to num_workers (16). */
          workers_actual = 16;
        }
      else
        __builtin_abort ();
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (workers_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
        || workers_min != 0 || workers_max != workers_actual - 1
        || vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
  }
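
  /* Note the difference between the two num_workers blocks above: with the
     compile-time constant (2 << 20, i.e. 2097152), the nvptx offload
     compilation itself clamps the request and emits "using num_workers (32),
     ignoring 2097152", whereas with the runtime 'workers' variable no such
     clamping happens at compile time, so the test limits the value to 32 up
     front to avoid the cuLaunchKernel failure described in the comment. */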

  /* GR, WS, VP. */
  {
    /* We try with an outrageously large value. */
#define VECTORS 2 << 20
    int vectors_actual = VECTORS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(1024\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \
  vector_length (VECTORS)
    {
      if (acc_on_device (acc_device_host))
        {
          /* We're actually executing with vector_length (1). */
          vectors_actual = 1;
        }
      else if (acc_on_device (acc_device_nvidia))
        {
          /* The GCC nvptx back end reduces this to vector_length (1024). */
          vectors_actual = 1024;
        }
      else if (acc_on_device (acc_device_radeon))
        {
          /* The GCC GCN back end enforces vector_length (1): autovectorize. */
          vectors_actual = 1;
        }
      else
        __builtin_abort ();
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (vectors_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
        || workers_min != 0 || workers_max != 0
        || vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
#undef VECTORS
  }

  /* GR, WS, VP. */
  {
    /* We try with an outrageously large value. */
    int vectors = 2 << 20;
    int vectors_actual = vectors;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_selected } } */ \
  vector_length (vectors)
    {
      if (acc_on_device (acc_device_host))
        {
          /* We're actually executing with vector_length (1). */
          vectors_actual = 1;
        }
      else if (acc_on_device (acc_device_nvidia))
        {
          /* The GCC nvptx back end enforces vector_length (32). */
          vectors_actual = 32;
        }
      else if (acc_on_device (acc_device_radeon))
        {
          /* Because of the way vectors are implemented for GCN, a vector loop
             containing a seq routine call will not vectorize calls to that
             routine. Hence, we'll only get one "vector". */
          vectors_actual = 1;
        }
      else
        __builtin_abort ();
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (vectors_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
        || workers_min != 0 || workers_max != 0
        || vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
  }
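
  /* Same contrast as for num_workers above: the compile-time constant
     vector_length (2 << 20) is clamped to 1024 ("ignoring 2097152"), while
     the runtime 'vectors' variable leads to "using vector_length (32),
     ignoring runtime setting", i.e. a fixed fallback of 32 chosen at compile
     time. */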


  /* Composition of GP, WP, VP. */
  {
    int gangs = 12345;
    /* With nvptx offloading, multi-level reductions apparently are very slow
       in the following case. So, limit ourselves here. */
    if (acc_get_device_type () == acc_device_nvidia)
      gangs = 3;
    /* Similar appears to be true for GCN. */
    if (acc_get_device_type () == acc_device_radeon)
      gangs = 3;
    int gangs_actual = gangs;
#define WORKERS 3
    int workers_actual = WORKERS;
#define VECTORS 11
    int vectors_actual = VECTORS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_selected } } */ \
  num_gangs (gangs) \
  num_workers (WORKERS) \
  vector_length (VECTORS)
    {
      if (acc_on_device (acc_device_host))
        {
          /* We're actually executing with num_gangs (1), num_workers (1),
             vector_length (1). */
          gangs_actual = 1;
          workers_actual = 1;
          vectors_actual = 1;
        }
      else if (acc_on_device (acc_device_nvidia))
        {
          /* The GCC nvptx back end enforces vector_length (32). */
          vectors_actual = 32;
        }
      else if (acc_on_device (acc_device_radeon))
        {
          /* Temporary setting, until multiple workers are permitted. */
          workers_actual = 1;
          /* See above comments about GCN vectors_actual. */
          vectors_actual = 1;
        }
      else
        __builtin_abort ();
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
        for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
          for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
            {
              gangs_min = gangs_max = acc_gang ();
              workers_min = workers_max = acc_worker ();
              vectors_min = vectors_max = acc_vector ();
            }
    }
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
        || workers_min != 0 || workers_max != workers_actual - 1
        || vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
#undef VECTORS
#undef WORKERS
  }
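
  /* In the composed case the reduction clauses are repeated on every 'loop'
     level; each level only combines the copies private to the dimension it
     partitions, which is presumably why all three levels carry the clauses so
     that the final min/max values are correct across gangs, workers, and
     vectors. */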


  /* OpenACC kernels construct. */

  /* We can't test parallelized OpenACC kernels constructs in this way: use of
     the acc_gang, acc_worker, acc_vector functions will make the construct
     unparallelizable. */


  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels. */
  {
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc kernels
    {
      /* This is to make the OpenACC kernels construct unparallelizable. */
      asm volatile ("" : : : "memory");

#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100; i > -100; --i)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (gangs_min != 0 || gangs_max != 1 - 1
        || workers_min != 0 || workers_max != 1 - 1
        || vectors_min != 0 || vectors_max != 1 - 1)
      __builtin_abort ();
  }
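
  /* The empty 'asm volatile' with a "memory" clobber above acts as an
     optimization barrier that the 'kernels' parallelizer cannot analyze, so
     the region is treated as unparallelizable and launched as 1 x 1 x 1; the
     checks therefore expect every gang/worker/vector index to be 0, written
     as '1 - 1' to match the expected dimension of 1. */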


  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels even when there are explicit num_gangs, num_workers, or
     vector_length clauses. */
  {
    int gangs = 5;
#define WORKERS 5
#define VECTORS 13
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc kernels \
  num_gangs (gangs) \
  num_workers (WORKERS) \
  vector_length (VECTORS)
    {
      /* This is to make the OpenACC kernels construct unparallelizable. */
      asm volatile ("" : : : "memory");

#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100; i > -100; --i)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (gangs_min != 0 || gangs_max != 1 - 1
        || workers_min != 0 || workers_max != 1 - 1
        || vectors_min != 0 || vectors_max != 1 - 1)
      __builtin_abort ();
#undef VECTORS
#undef WORKERS
  }


  /* OpenACC serial construct. */

  /* GR, WS, VS. */
  {
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc serial /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } } */ \
  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
    {
      for (int i = 100; i > -100; i--)
        {
          gangs_min = gangs_max = acc_gang ();
          workers_min = workers_max = acc_worker ();
          vectors_min = vectors_max = acc_vector ();
        }
    }
    if (gangs_min != 0 || gangs_max != 1 - 1
        || workers_min != 0 || workers_max != 1 - 1
        || vectors_min != 0 || vectors_max != 1 - 1)
      __builtin_abort ();
  }
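
  /* 'serial' corresponds to 'parallel num_gangs (1) num_workers (1)
     vector_length (1)', so all indices are expected to be 0 here, again
     written as '1 - 1'.  As the dg-warning above shows, the nvptx back end
     still forces vector_length (32) for the launch, but since there is no
     'loop vector' in this block the body runs vector-single and the observed
     vector index stays 0. */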

  /* Composition of GP, WP, VP. */
  {
    int vectors_actual = 1; /* Implicit 'vector_length (1)' clause. */
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc serial copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } } */ \
  copy (gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max)
    {
      if (acc_on_device (acc_device_nvidia))
        {
          /* The GCC nvptx back end enforces vector_length (32). */
          /* It's unclear if that's actually permissible here;
             <https://github.com/OpenACC/openacc-spec/issues/238> "OpenACC
             'serial' construct might not actually be serial". */
          vectors_actual = 32;
        }
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100; i > -100; i--)
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
        for (int j = 100; j > -100; j--)
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
          for (int k = 100 * vectors_actual; k > -100 * vectors_actual; k--)
            {
              gangs_min = gangs_max = acc_gang ();
              workers_min = workers_max = acc_worker ();
              vectors_min = vectors_max = acc_vector ();
            }
    }
    if (acc_get_device_type () == acc_device_nvidia)
      {
        if (vectors_actual != 32)
          __builtin_abort ();
      }
    else
      if (vectors_actual != 1)
        __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 1 - 1
        || workers_min != 0 || workers_max != 1 - 1
        || vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
  }


  return 0;
}