--- /dev/null
+! { dg-do compile }
+! { dg-options "-cpp -fcoarray=lib" }
+! PR 87397 - this used to generate an ICE.
+
+! Coarray Distributed Transpose Test
+!
+! Copyright (c) 2012-2014, Sourcery, Inc.
+! All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions are met:
+! * Redistributions of source code must retain the above copyright
+! notice, this list of conditions and the following disclaimer.
+! * Redistributions in binary form must reproduce the above copyright
+! notice, this list of conditions and the following disclaimer in the
+! documentation and/or other materials provided with the distribution.
+! * Neither the name of the Sourcery, Inc., nor the
+! names of its contributors may be used to endorse or promote products
+! derived from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+! ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+! WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+! DISCLAIMED. IN NO EVENT SHALL SOURCERY, INC., BE LIABLE FOR ANY
+! DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+! (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+! ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+!
+! Robodoc header:
+!****m* dist_transpose/run_size
+! NAME
+! run_size
+! SYNOPSIS
+! Encapsulate problem state, wall-clock timer interface, integer broadcasts, and a data copy.
+!******
+!================== test transposes with integer x,y,z values ===============================
+module run_size
+ use iso_fortran_env
+ implicit none
+
+ integer(int64), codimension[*] :: nx, ny, nz
+ integer(int64), codimension[*] :: my, mx, first_y, last_y, first_x, last_x
+ integer(int64) :: my_node, num_nodes
+ real(real64), codimension[*] :: tran_time
+
+
+contains
+
+!****s* run_size/broadcast_int
+! NAME
+! broadcast_int
+! SYNOPSIS
+! Broadcast a scalar coarray integer from image 1 to all other images.
+!******
+ subroutine broadcast_int( variable )
+ integer(int64), codimension[*] :: variable
+ integer(int64) :: i
+ if( my_node == 1 ) then
+ do i = 2, num_nodes; variable[i] = variable; end do
+ end if
+ end subroutine broadcast_int
+
+subroutine copy3( A,B, n1, sA1, sB1, n2, sA2, sB2, n3, sA3, sB3 )
+ implicit none
+ complex, intent(in) :: A(0:*)
+ complex, intent(out) :: B(0:*)
+ integer(int64), intent(in) :: n1, sA1, sB1
+ integer(int64), intent(in) :: n2, sA2, sB2
+ integer(int64), intent(in) :: n3, sA3, sB3
+ integer(int64) i,j,k
+
+ do k=0,n3-1
+ do j=0,n2-1
+ do i=0,n1-1
+ B(i*sB1+j*sB2+k*sB3) = A(i*sA1+j*sA2+k*sA3)
+ end do
+ end do
+ end do
+end subroutine copy3
+
+end module run_size
+
+!****e* dist_transpose/coarray_distributed_transpose
+! NAME
+! coarray_distributed_transpose
+! SYNOPSIS
+! This program tests the transpose routines used in Fourier-spectral simulations of homogeneous turbulence.
+! The data is presented to the physics routines as groups of y-z or x-z planes distributed among the images.
+! The (out-of-place) transpose routines do the x <--> y transposes required and consist of transposes within
+! data blocks (intra-image) and a transpose of the distribution of these blocks among the images (inter-image).
+!
+! Two methods are tested here:
+! RECEIVE: receive block from other image and transpose it
+! SEND: transpose block and send it to other image
+!
+! This code is the coarray analog of mpi_distributed_transpose.
+!******
+
+program coarray_distributed_transpose
+ !(***********************************************************************************************************
+ ! m a i n p r o g r a m
+ !***********************************************************************************************************)
+ use run_size
+ implicit none
+
+ complex, allocatable :: u(:,:,:,:)[:] ! u(nz,4,first_x:last_x,ny)[*] !(*-- ny = my * num_nodes --*)
+ complex, allocatable :: ur(:,:,:,:)[:] !ur(nz,4,first_y:last_y,nx/2)[*] !(*-- nx/2 = mx * num_nodes --*)
+ complex, allocatable :: bufr_X_Y(:,:,:,:)
+ complex, allocatable :: bufr_Y_X(:,:,:,:)
+ integer(int64) :: x, y, z, msg_size, iter
+
+ num_nodes = num_images()
+ my_node = this_image()
+
+ if( my_node == 1 ) then
+ !write(6,*) "nx,ny,nz : "; read(5,*) nx, ny, nz
+ nx=32; ny=32; nz=32
+ call broadcast_int( nx ); call broadcast_int( ny ); call broadcast_int( nz );
+ end if
+ sync all !-- other nodes wait for broadcast!
+
+
+ if ( mod(ny,num_nodes) == 0) then; my = ny / num_nodes
+ else; write(6,*) "node ", my_node, " ny not multiple of num_nodes"; error stop
+ end if
+
+ if ( mod(nx/2,num_nodes) == 0) then; mx = nx/2 / num_nodes
+ else; write(6,*) "node ", my_node, "nx/2 not multiple of num_nodes"; error stop
+ end if
+
+ first_y = (my_node-1)*my + 1; last_y = (my_node-1)*my + my
+ first_x = (my_node-1)*mx + 1; last_x = (my_node-1)*mx + mx
+
+ allocate ( u(nz , 4 , first_x:last_x , ny) [*] ) !(*-- y-z planes --*)
+ allocate ( ur(nz , 4 , first_y:last_y , nx/2)[*] ) !(*-- x-z planes --*)
+ allocate ( bufr_X_Y(nz,4,mx,my) )
+ allocate ( bufr_Y_X(nz,4,my,mx) )
+
+ msg_size = nz*4*mx*my !-- message size (complex data items)
+
+!--------- initialize data u (mx y-z planes per image) ----------
+
+ do x = first_x, last_x
+ do y = 1, ny
+ do z = 1, nz
+ u(z,1,x,y) = x
+ u(z,2,x,y) = y
+ u(z,3,x,y) = z
+ end do
+ end do
+ end do
+
+ tran_time = 0
+ do iter = 1, 2 !--- 2 transform pairs per second-order time step
+
+!--------- transpose data u -> ur (mx y-z planes to my x-z planes per image) --------
+
+ ur = 0
+
+ call transpose_X_Y
+
+!--------- test data ur (my x-z planes per image) ----------
+
+ do x = 1, nx/2
+ do y = first_y, last_y
+ do z = 1, nz
+ if ( real(ur(z,1,y,x)) /= x .or. real(ur(z,2,y,x)) /= y .or. real(ur(z,3,y,x)) /= z )then
+ write(6,fmt="(A,i3,3(6X,A,f7.3,i4))") "transpose_X_Y failed: image ", my_node &
+ , " X ",real(ur(z,1,y,x)),x, " Y ",real(ur(z,2,y,x)),y, " Z ", real(ur(z,3,y,x)),z
+ stop
+ end if
+ end do
+ end do
+ end do
+
+!--------- transpose data ur -> u (my x-z planes to mx y-z planes per image) --------
+
+ u = 0
+ call transpose_Y_X
+
+!--------- test data u (mx y-z planes per image) ----------
+
+ do x = first_x, last_x
+ do y = 1, ny
+ do z = 1, nz
+ if ( real(u(z,1,x,y)) /= x .or. real(u(z,2,x,y)) /= y .or. real(u(z,3,x,y)) /= z )then
+ write(6,fmt="(A,i3,3(6X,A,f7.3,i4))") "transpose_Y_X failed: image ", my_node &
+ , " X ",real(u(z,1,x,y)),x, " Y ",real(u(z,2,x,y)),y, " Z ", real(u(z,3,x,y)),z
+ stop
+ end if
+ end do
+ end do
+ end do
+ end do
+
+ sync all
+ if( my_node == 1 ) write(6,fmt="(A,f8.3)") "test passed: tran_time ", tran_time
+
+ deallocate ( bufr_X_Y ); deallocate ( bufr_Y_X )
+
+!========================= end of main executable =============================
+
+contains
+
+!------------- out-of-place transpose data_s --> data_r ----------------------------
+
+ subroutine transpose_X_Y
+
+ use run_size
+ implicit none
+
+ integer(int64) :: i,stage
+ real(real64) :: tmp
+
+ sync all !-- wait for other nodes to finish compute
+ call cpu_time(tmp)
+ tran_time = tran_time - tmp
+
+ call copy3 ( u(1,1,first_x,1+(my_node-1)*my) & !-- intra-node transpose
+ , ur(1,1,first_y,1+(my_node-1)*mx) & !-- no inter-node transpose needed
+ , nz*3, 1_8, 1_8 & !-- note: only 3 of 4 words needed
+ , mx, nz*4, nz*4*my &
+ , my, nz*4*mx, nz*4 )
+
+#define RECEIVE
+#ifdef RECEIVE
+
+ do stage = 1, num_nodes-1
+ i = 1 + mod( my_node-1+stage, num_nodes )
+ bufr_X_Y(:,:,:,:) = u(:,:,:,1+(my_node-1)*my:my_node*my)[i] !-- inter-node transpose to buffer
+ call copy3 ( bufr_X_Y, ur(1,1,first_y,1+(i-1)*mx) & !-- intra-node transpose from buffer
+ , nz*3, 1_8, 1_8 & !-- note: only 3 of 4 words needed
+ , mx, nz*4, nz*4*my &
+ , my, nz*4*mx, nz*4 )
+ end do
+
+#else
+
+ do stage = 1, num_nodes-1
+ i = 1 + mod( my_node-1+stage, num_nodes )
+ call copy3 ( u(1,1,first_x,1+(i-1)*my), bufr_Y_X & !-- intra-node transpose to buffer
+ , nz*3, 1_8, 1_8 &
+ , mx, nz*4, nz*4*my &
+ , my, nz*4*mx, nz*4 )
+ ur(:,:,:,1+(my_node-1)*mx:my_node*mx)[i] = bufr_Y_X(:,:,:,:) !-- inter-node transpose from buffer
+ end do
+
+#endif
+
+ sync all !-- wait for other nodes to finish transpose
+ call cpu_time(tmp)
+ tran_time = tran_time + tmp
+
+ end subroutine transpose_X_Y
+
+!------------- out-of-place transpose data_r --> data_s ----------------------------
+
+subroutine transpose_Y_X
+ use run_size
+ implicit none
+
+ integer(int64) :: i, stage
+ real(real64) :: tmp
+
+ sync all !-- wait for other nodes to finish compute
+ call cpu_time(tmp)
+ tran_time = tran_time - tmp
+
+ call copy3 ( ur(1,1,first_y,1+(my_node-1)*mx) & !-- intra-node transpose
+ , u(1,1,first_x,1+(my_node-1)*my) & !-- no inter-node transpose needed
+ , nz*4, 1_8, 1_8 & !-- note: all 4 words needed
+ , my, nz*4, nz*4*mx &
+ , mx, nz*4*my, nz*4 )
+
+#define RECEIVE
+#ifdef RECEIVE
+
+ do stage = 1, num_nodes-1
+ i = 1 + mod( my_node-1+stage, num_nodes )
+ bufr_Y_X(:,:,:,:) = ur(:,:,:,1+(my_node-1)*mx:my_node*mx)[i] !-- inter-node transpose to buffer
+ call copy3 ( bufr_Y_X, u(1,1,first_x,1+(i-1)*my) & !-- intra-node transpose from buffer
+ , nz*4, 1_8, 1_8 &
+ , my, nz*4, nz*4*mx &
+ , mx, nz*4*my, nz*4 )
+ end do
+
+#else
+
+ do stage = 1, num_nodes-1
+ i = 1 + mod( my_node-1+stage, num_nodes )
+ call copy3 ( ur(1,1,first_y,1+(i-1)*mx), bufr_X_Y & !-- intra-node transpose from buffer
+ , nz*4, 1_8, 1_8 &
+ , my, nz*4, nz*4*mx &
+ , mx, nz*4*my, nz*4 )
+ u(:,:,:,1+(my_node-1)*my:my_node*my)[i] = bufr_X_Y(:,:,:,:) !-- inter-node transpose from buffer
+ end do
+
+#endif
+
+ sync all !-- wait for other nodes to finish transpose
+ call cpu_time(tmp)
+ tran_time = tran_time + tmp
+
+ end subroutine transpose_Y_X
+
+
+end program coarray_distributed_transpose