prl_fft.f90 Source File


This file depends on

sourcefile~~prl_fft.f90~~EfferentGraph sourcefile~prl_fft.f90 prl_fft.f90 sourcefile~parallel.f90 parallel.F90 sourcefile~prl_fft.f90->sourcefile~parallel.f90 sourcefile~modern_fft_lib.f90 modern_fft_lib.F90 sourcefile~prl_fft.f90->sourcefile~modern_fft_lib.f90 sourcefile~mpi_var.f90 mpi_var.f90 sourcefile~parallel.f90->sourcefile~mpi_var.f90 sourcefile~util.f90 util.f90 sourcefile~parallel.f90->sourcefile~util.f90 sourcefile~common_param.f90 common_param.f90 sourcefile~parallel.f90->sourcefile~common_param.f90 sourcefile~precision_def.f90 precision_def.F90 sourcefile~modern_fft_lib.f90->sourcefile~precision_def.f90 sourcefile~mpi_var.f90->sourcefile~precision_def.f90 sourcefile~util.f90->sourcefile~precision_def.f90 sourcefile~code_util.f90 code_util.f90 sourcefile~util.f90->sourcefile~code_util.f90 sourcefile~common_param.f90->sourcefile~precision_def.f90 sourcefile~code_util.f90->sourcefile~precision_def.f90

Files dependent on this one

sourcefile~~prl_fft.f90~~AfferentGraph sourcefile~prl_fft.f90 prl_fft.f90 sourcefile~psolve.f90 psolve.f90 sourcefile~psolve.f90->sourcefile~prl_fft.f90 sourcefile~init_beam_part_distrib.f90 init_beam_part_distrib.f90 sourcefile~init_beam_part_distrib.f90->sourcefile~psolve.f90 sourcefile~pic_out_util.f90 pic_out_util.f90 sourcefile~pic_out_util.f90->sourcefile~psolve.f90 sourcefile~aladyn.f90 ALaDyn.F90 sourcefile~aladyn.f90->sourcefile~init_beam_part_distrib.f90 sourcefile~aladyn.f90->sourcefile~pic_out_util.f90

Contents

Source Code


Source Code

!*****************************************************************************************************!
!                            Copyright 2008-2020  The ALaDyn Collaboration                            !
!*****************************************************************************************************!

!*****************************************************************************************************!
!  This file is part of ALaDyn.                                                                       !
!                                                                                                     !
!  ALaDyn is free software: you can redistribute it and/or modify                                     !
!  it under the terms of the GNU General Public License as published by                               !
!  the Free Software Foundation, either version 3 of the License, or                                  !
!  (at your option) any later version.                                                                !
!                                                                                                     !
!  ALaDyn is distributed in the hope that it will be useful,                                          !
!  but WITHOUT ANY WARRANTY; without even the implied warranty of                                     !
!  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                                      !
!  GNU General Public License for more details.                                                       !
!                                                                                                     !
!  You should have received a copy of the GNU General Public License                                  !
!  along with ALaDyn.  If not, see <http://www.gnu.org/licenses/>.                                    !
!*****************************************************************************************************!

 module prl_fft
  use parallel
  use modern_fft_lib
  implicit none

  real(dp), allocatable :: fp1(:, :, :), fp2(:, :, :), faux1(:), &
                           faux2(:)
  integer, allocatable :: loc_yft_ord(:), loc_zft_ord(:)

 contains
  subroutine mpi_ftw_alloc(n1, n2, n2_loc, n3, n3_loc)
   integer, intent(in) :: n1, n2, n2_loc, n3, n3_loc
   integer :: n1_xloc, n2_xloc, lenw
   !=======================
   ! loc_grid allready define

   n1_xloc = n1/npe_yloc
   n2_xloc = n1/npe_zloc
   lenw = n1_xloc*n2_loc*n3_loc
   allocate (faux1(lenw), faux2(lenw))
   allocate (fp1(n1_xloc, n2, n3_loc))
   allocate (fp2(n2_xloc, n2_loc, n3))

  end subroutine
  !==========================
  subroutine mpi_yzft_ord(ny_ft, nz_ft)
   integer, intent(in) :: ny_ft, nz_ft
   integer :: ip, nh

   allocate (loc_yft_ord(0:npe_yloc - 1))
   allocate (loc_zft_ord(0:npe_zloc - 1))
   nh = npe_yloc/2 - 1
   if (npe_yloc > 4) then
    do ip = 0, nh              ![ x  |   |   |   ]
     loc_yft_ord(ip) = 1
    end do
    do ip = nh + 1, nh + 3
     loc_yft_ord(ip) = loc_yft_ord(ip - 1) + ny_ft
    end do
    ![   | x  |   |   ]   ![   |   | x  |   ] ![   |   |   | x  ]
    do ip = nh + 4, npe_yloc - 1
     loc_yft_ord(ip) = loc_yft_ord(nh + 3)
    end do
   endif
   loc_zft_ord(0:npe_zloc - 1) = 1
   if (npe_zloc > 4) then
    do ip = 0, npe_zloc/2 - 1            ![ x  |   |   |   ]
     loc_zft_ord(ip) = 1
    end do
    do ip = npe_zloc/2, npe_zloc/2 + 2
     loc_zft_ord(ip) = loc_zft_ord(ip - 1) + nz_ft
    end do
    ![   | x  |   |   ]   ![   |   | x  |   ] ![   |   |   | x  ]
    do ip = npe_zloc/2 + 3, npe_zloc - 1
     loc_zft_ord(ip) = loc_zft_ord(npe_zloc/2 + 2)
    end do
   endif
  end subroutine

  subroutine mpi_ftw_dalloc

   if (allocated(faux1)) deallocate (faux1)
   if (allocated(faux2)) deallocate (faux2)
   if (allocated(fp1)) deallocate (fp1)
   if (allocated(fp2)) deallocate (fp2)

  end subroutine

  !===================
  subroutine ft_overset_grid(w_s, w_r, nft1, nft2, nft3)
   real(dp), intent(inout) :: w_s(:, :, :)
   real(dp), intent(out) :: w_r(:, :, :)
   integer, intent(in) :: nft1, nft2, nft3
   integer :: lenw, dd, nhy, nhz
   integer :: j1, j2, k1, k2, jl, jr, kl, kr
   integer :: ybd, zbd, kkr, jjr
   integer ::  tag, ip, per, pes
   !=========== gather on w_r data along y-coordinate using 3 neighbors
   ! imody(+1,+2,+3)-> imody for imody < npe_yloc/2-1
   ! imody(-1,+1,+2)-> imody for imody= npe_yloc/2-1
   ! imody(-2,-1,+1)-> imody for imody= npe_yloc/2
   ! imody(-3,-2,-1)-> imody for imody> npe_yloc/2+1
   !======================================================
   lenw = nft1*nft2*nft3
   dd = 1
   nhy = npe_yloc/2 - 1
   nhz = npe_zloc/2 - 1
   jl = min(imody, 3)
   jr = min(nhy + 3 - imody, 3)
   ybd = min(3, npe_yloc - 1 - imody)
   k1 = loc_zft_ord(imodz)
   k2 = k1 + nft3 - 1
   j1 = loc_yft_ord(imody)
   j2 = j1 + nft2 - 1
   w_r(1:nft1, j1:j2, k1:k2) = w_s(1:nft1, 1:nft2, 1:nft3)
!====================
   !imody=[1,nh+3] sends to ipe=imody-1,imody-2,imody-3  => ipe=[0,nh]
   !
!===================
   !sends to next(ip)  recieves from prev(ip)
   if (imody < nhy + 4) then
    if (imody > 0) then
     do ip = 1, jl
      tag = 200 + ip
      pes = yp_prev(ip)
      call mpi_send(w_s(1, 1, 1), lenw, mpi_sd, pes, tag, comm_col(dd), error)
     end do
    endif
    j1 = loc_yft_ord(imody)
    do ip = 1, jr
     per = yp_next(ip)
     tag = 200 + ip
     call mpi_recv(fp1(1, 1, 1), lenw, &
                   mpi_sd, per, tag, comm_col(dd), status, error)
     j1 = j1 + nft2
     j2 = j1 + nft2 - 1
     w_r(1:nft1, j1:j2, k1:k2) = fp1(1:nft1, 1:nft2, 1:nft3)
    end do
   endif
   !sends imody=nh to (nh+1,nh+2,nh+3) up to
   !imody=npe_yloc-3 to npe_yloc-2,npe_yloc-1) ybd=2
   !imody=npe_yloc-2 to npe_yloc-1)            ybd=1
   if (imody > (nhy - 1)) then
    j1 = loc_yft_ord(imody)
    jjr = imody - nhy
    jjr = min(jjr, 3)
    do ip = 1, ybd
     tag = 200 - ip
     pes = yp_next(ip)
     call mpi_send(w_s(1, 1, 1), lenw, mpi_sd, pes, tag, comm_col(dd), error)
    end do
    j1 = loc_yft_ord(imody)
    do ip = 1, jjr
     tag = 200 - ip
     per = yp_prev(ip)
     call mpi_recv(fp1(1, 1, 1), lenw, &
                   mpi_sd, per, tag, comm_col(dd), status, error)
     j1 = j1 - nft2
     j2 = j1 + nft2 - 1
     w_r(1:nft1, j1:j2, k1:k2) = fp1(1:nft1, 1:nft2, 1:nft3)
    end do
   endif
!================== end prly====================
!=============
   if (prlz) then
    zbd = min(3, npe_zloc - 1 - imodz)
    kl = min(imodz, 3)
    kr = min(nhz + 3 - imodz, 3)
    dd = 2
    j1 = loc_yft_ord(imody)
    j2 = j1 + nft2 - 1
!================= imodz sends to  imodz-1,-2,-3
    if (imodz < nhz + 4) then
     if (imodz > 0) then
      do ip = 1, kl
       tag = 100 + ip
       pes = zp_prev(ip)
       call mpi_send(w_s(1, 1, 1), lenw, mpi_sd, pes, tag, comm_col(dd), error)
      end do
     endif
     k1 = loc_zft_ord(imodz)
     do ip = 1, kr
      tag = 100 + ip
      per = zp_next(ip)
      call mpi_recv(fp1(1, 1, 1), lenw, &
                    mpi_sd, per, tag, comm_col(dd), status, error)
      k1 = k1 + nft3
      k2 = k1 + nft3 - 1
      w_r(1:nft1, j1:j2, k1:k2) = fp1(1:nft1, 1:nft2, 1:nft3)
     end do
    endif
    !sends nhz => (nhz+1,nhz+2,nhz+3)      => npe_zloc-2 > npe_zloc-1 (zbd=1)
    if (imodz > (nhz - 1)) then
     kkr = imodz - nhz
     kkr = min(kkr, 3)
     do ip = 1, zbd
      tag = 100 - ip
      pes = zp_next(ip)
      call mpi_send(w_s(1, 1, 1), lenw, mpi_sd, pes, tag, comm_col(dd), error)
     end do
     k1 = loc_zft_ord(imodz)
     do ip = 1, kkr
      per = zp_prev(ip)
      tag = 100 - ip
      call mpi_recv(fp1(1, 1, 1), lenw, &
                    mpi_sd, per, tag, comm_col(dd), status, error)
      k1 = k1 - nft3
      k2 = k1 + nft3 - 1
      w_r(1:nft1, j1:j2, k1:k2) = fp1(1:nft1, 1:nft2, 1:nft3)
     end do
    endif
   endif
  end subroutine

  subroutine swap_yx_3data(waux, wdata, n1_loc, n2, n3)

   integer, intent(in) :: n1_loc, n2, n3
   real(dp), intent(in) :: waux(:, :, :)
   real(dp), intent(out) :: wdata(:, :, :)

   integer :: pes, per, ip
   integer :: i1, j1, lenws, lenwr, tag, n2_xloc, iy, ix, iz
   integer :: kk
   !-----------------
   !From waux(1:n1,1:n2_xloc) to wdata(1:n1_loc,1:n2)
   n2_xloc = n2/npe_xloc

   do iz = 1, n3
    do iy = 1, n2_xloc
     j1 = iy + n2_xloc*imodx
     do ix = 1, n1_loc
      i1 = ix + n1_loc*imodx
      wdata(ix, j1, iz) = waux(i1, iy, iz)
     end do
    end do
   end do
   if (npe_yloc == 1) return
   lenws = n1_loc*n2_xloc*n3
   lenwr = lenws
   if (size(faux1) < lenwr) then
    deallocate (faux1, faux2)
    allocate (faux1(lenws))
    allocate (faux2(lenwr))
   end if

   do ip = 1, npe_xloc - 1
    pes = xp_next(ip)
    i1 = n1_loc*pes
    per = xp_prev(ip)
    tag = 80 + ip
    kk = 0
    do iz = 1, n3
     do iy = 1, n2_xloc
      do ix = 1, n1_loc
       kk = kk + 1
       faux1(kk) = waux(ix + i1, iy, iz)
      end do
     end do
    end do
    call mpi_sendrecv(faux1(1), lenws, mpi_sd, pes, tag, faux2(1), &
                      lenwr, mpi_sd, per, tag, comm_col(3), status, error)
    j1 = n2_xloc*per
    kk = 0
    do iz = 1, n3
     do iy = 1, n2_xloc
      do ix = 1, n1_loc
       kk = kk + 1
       wdata(ix, iy + j1, iz) = faux2(kk)
      end do
     end do
    end do
   end do

  end subroutine

  subroutine swap_xy_3data(wp1, wp2, n1_loc, n2_loc, n3_loc)

   integer, intent(in) :: n1_loc, n2_loc, n3_loc
   real(dp), intent(in) :: wp1(:, :, :)
   real(dp), intent(out) :: wp2(:, :, :)

   integer :: pes, per, ip
   integer :: i1, j1, lenws, lenwr, tag, iy, ix, iz
   integer :: kk
   !-----------------
   !From wp1(1:n1_loc,1:n2) to wp2(1:n1,1:n2_loc,ic)

   do iz = 1, n3_loc
    do iy = 1, n2_loc
     j1 = iy + n2_loc*imody
     do ix = 1, n1_loc
      i1 = ix + n1_loc*imody
      wp2(i1, iy, iz) = wp1(ix, j1, iz)
     end do
    end do
   end do
   if (npe_yloc == 1) return
   lenws = n1_loc*n2_loc*n3_loc
   lenwr = lenws
   if (size(faux1) < lenwr) then
    deallocate (faux1, faux2)
    allocate (faux1(lenws))
    allocate (faux2(lenwr))
   end if

   do ip = 1, npe_yloc - 1
    pes = yp_next(ip)
    j1 = n2_loc*pes
    per = yp_prev(ip)
    tag = 80 + ip
    kk = 0
    do iz = 1, n3_loc
     do iy = 1, n2_loc
      do ix = 1, n1_loc
       kk = kk + 1
       faux1(kk) = wp1(ix, iy + j1, iz)
      end do
     end do
    end do
    call mpi_sendrecv(faux1(1), lenws, mpi_sd, pes, tag, faux2(1), &
                      lenwr, mpi_sd, per, tag, comm_col(1), status, error)
    i1 = n1_loc*per
    kk = 0
    do iz = 1, n3_loc
     do iy = 1, n2_loc
      do ix = 1, n1_loc
       kk = kk + 1
       wp2(ix + i1, iy, iz) = faux2(kk)
      end do
     end do
    end do
   end do

  end subroutine

  subroutine swap_xz_3data(wp1, wp2, n1_loc, n2_loc, n3_loc)

   integer, intent(in) :: n1_loc, n2_loc, n3_loc
   real(dp), intent(in) :: wp1(:, :, :)
   real(dp), intent(out) :: wp2(:, :, :)

   integer :: pes, per, ip
   integer :: i1, lenw, tag, iy, ix, iz, iz1
   integer :: kk
   !-----------------
   !From wp1(1:n1_loc,1:n3) to wp2(1:n1,1:n3_loc)
   do iz = 1, n3_loc
    iz1 = iz + n3_loc*imodz
    do iy = 1, n2_loc
     do ix = 1, n1_loc
      i1 = ix + n1_loc*imodz
      wp2(i1, iy, iz) = wp1(ix, iy, iz1)
     end do
    end do
   end do
   if (npe_zloc == 1) return
   lenw = n1_loc*n2_loc*n3_loc
   if (size(faux1) < lenw) then
    deallocate (faux1, faux2)
    allocate (faux1(lenw))
    allocate (faux2(lenw))
   end if

   do ip = 1, npe_zloc - 1
    pes = zp_next(ip)
    iz1 = n3_loc*pes
    per = zp_prev(ip)
    tag = 80 + ip
    kk = 0
    do iz = 1, n3_loc
     do iy = 1, n2_loc
      do ix = 1, n1_loc
       kk = kk + 1
       faux1(kk) = wp1(ix, iy, iz + iz1)
      end do
     end do
    end do
    call mpi_sendrecv(faux1(1), lenw, mpi_sd, pes, tag, faux2(1), lenw, &
                      mpi_sd, per, tag, comm_col(2), status, error)
    i1 = n1_loc*per
    kk = 0
    do iz = 1, n3_loc
     do iy = 1, n2_loc
      do ix = 1, n1_loc
       kk = kk + 1
       wp2(ix + i1, iy, iz) = faux2(kk)
      end do
     end do
    end do
   end do

  end subroutine

  subroutine swap_yx_3data_inv(wdata, waux, n1_loc, n2, n3)

   integer, intent(in) :: n1_loc, n2, n3
   real(dp), intent(in) :: wdata(:, :, :)
   real(dp), intent(out) :: waux(:, :, :)

   integer :: pes, per, ip
   integer :: i1, j1, lenws, lenwr, tag, iy, ix, iz, n2_xloc
   integer :: kk
   !-----------------
   !enters wdata(n1_loc,n2,n3)
   !From wdata(1:n1_loc,1:n2) to waux(1:n1,1:n2_xloc)
   !n2_xloc=n2/npe_xloc

   n2_xloc = n2/npe_xloc
   kk = 0
   do iz = 1, n3
    do iy = 1, n2_xloc
     j1 = iy + n2_xloc*imodx
     do ix = 1, n1_loc
      i1 = ix + n1_loc*imodx
      kk = kk + 1
      waux(i1, iy, iz) = wdata(ix, j1, iz)
     end do
    end do
   end do
   if (npe_xloc == 1) return
   lenwr = n1_loc*n2_xloc*n3
   lenws = lenwr
   if (size(faux1) < lenwr) then
    deallocate (faux1, faux2)
    allocate (faux1(lenws))
    allocate (faux2(lenwr))
   end if
   do ip = 1, npe_xloc - 1
    pes = xp_next(ip)
    j1 = n2_xloc*pes
    per = xp_prev(ip)
    tag = 40 + ip
    kk = 0
    do iz = 1, n3
     do iy = 1, n2_xloc
      do ix = 1, n1_loc
       kk = kk + 1
       faux1(kk) = wdata(ix, iy + j1, iz)
      end do
     end do
    end do
    lenws = kk
    lenwr = lenws

    call mpi_sendrecv(faux1(1), lenws, mpi_sd, pes, tag, faux2(1), &
                      lenwr, mpi_sd, per, tag, comm_col(3), status, error)
    i1 = n1_loc*per
    kk = 0
    do iz = 1, n3
     do iy = 1, n2_xloc
      do ix = 1, n1_loc
       kk = kk + 1
       waux(ix + i1, iy, iz) = faux2(kk)
      end do
     end do
    end do
   end do
  end subroutine

  subroutine swap_xy_3data_inv(wp2, wp1, n1_loc, n2_loc, n3_loc)

   integer, intent(in) :: n1_loc, n2_loc, n3_loc
   real(dp), intent(in) :: wp2(:, :, :)
   real(dp), intent(out) :: wp1(:, :, :)

   integer :: pes, per, ip
   integer :: i1, j1, lenws, lenwr, tag, iy, ix, iz
   integer :: kk
   !-----------------
   !From (1:n1,1:n2_loc) to (1:n1_loc,1:n2)

   kk = 0
   do iz = 1, n3_loc
    do iy = 1, n2_loc
     j1 = iy + n2_loc*imody
     do ix = 1, n1_loc
      i1 = ix + n1_loc*imody
      kk = kk + 1
      wp1(ix, j1, iz) = wp2(i1, iy, iz)
     end do
    end do
   end do
   if (npe_yloc == 1) return
   lenwr = n1_loc*n2_loc*n3_loc
   lenws = lenwr
   if (size(faux1) < lenwr) then
    deallocate (faux1, faux2)
    allocate (faux1(lenws))
    allocate (faux2(lenwr))
   end if
   do ip = 1, npe_yloc - 1
    pes = yp_next(ip)
    i1 = n1_loc*pes
    per = yp_prev(ip)
    tag = 40 + ip
    kk = 0
    do iz = 1, n3_loc
     do iy = 1, n2_loc
      do ix = 1, n1_loc
       kk = kk + 1
       faux1(kk) = wp2(ix + i1, iy, iz)
      end do
     end do
    end do
    lenws = kk
    lenwr = lenws

    call mpi_sendrecv(faux1(1), lenws, mpi_sd, pes, tag, faux2(1), &
                      lenwr, mpi_sd, per, tag, comm_col(1), status, error)
    j1 = n2_loc*per
    kk = 0
    do iz = 1, n3_loc
     do iy = 1, n2_loc
      do ix = 1, n1_loc
       kk = kk + 1
       wp1(ix, iy + j1, iz) = faux2(kk)
      end do
     end do
    end do
   end do
  end subroutine
  !=====================
  subroutine swap_xz_3data_inv(wp2, wp1, n1_loc, n2_loc, n3_loc)

   integer, intent(in) :: n1_loc, n2_loc, n3_loc
   real(dp), intent(in) :: wp2(:, :, :)
   real(dp), intent(out) :: wp1(:, :, :)

   integer :: pes, per, ip
   integer :: i1, lenw, tag, iy, ix, iz, iz1
   integer :: kk, k1
   !-----------------
   !From (1:n1,1:n3_loc) to (1:n1_loc,1:n3)

   do iz = 1, n3_loc
    k1 = iz + imodz*n3_loc
    do iy = 1, n2_loc
     do ix = 1, n1_loc
      i1 = ix + n1_loc*imodz
      wp1(ix, iy, k1) = wp2(i1, iy, iz)
     end do
    end do
   end do
   if (npe_zloc == 1) return
   lenw = n1_loc*n2_loc*n3_loc
   if (size(faux1) < lenw) then
    deallocate (faux1, faux2)
    allocate (faux1(lenw))
    allocate (faux2(lenw))
   end if
   do ip = 1, npe_zloc - 1
    pes = zp_next(ip)
    i1 = n1_loc*pes
    per = zp_prev(ip)
    tag = 40 + ip
    kk = 0
    do iz = 1, n3_loc
     do iy = 1, n2_loc
      do ix = 1, n1_loc
       kk = kk + 1
       faux1(kk) = wp2(ix + i1, iy, iz)
      end do
     end do
    end do

    call mpi_sendrecv(faux1(1), lenw, mpi_sd, pes, tag, faux2(1), lenw, &
                      mpi_sd, per, tag, comm_col(2), status, error)
    k1 = n3_loc*per
    kk = 0
    do iz = 1, n3_loc
     iz1 = iz + k1
     do iy = 1, n2_loc
      do ix = 1, n1_loc
       kk = kk + 1
       wp1(ix, iy, iz1) = faux2(kk)
      end do
     end do
    end do
   end do
  end subroutine
  !====================================
  subroutine pftw2d_sc(w, n1, n2, n2_loc, n3, n3_loc, is, sym)
   real(dp), intent(inout) :: w(:, :, :)
   integer, intent(in) :: n1, n2, n2_loc, n3, n3_loc, is, sym
   integer :: n1_loc
   !performs a 2D FFT sin/cosine`on the (y,z) coordinates for a 3D data (x,y,z)
   !sym=1 for a sine transform
   !sym=2 for a cosine transform

   select case (is)
   case (-1) !
    if (n2 <= 2) return
    if (prly) then
     n1_loc = n1/npe_yloc
     call swap_xy_3data_inv(w, fp1, n1_loc, n2_loc, n3_loc)

     call ftw1d_sc(fp1, n1_loc, n2, n3_loc, is, 2, sym)
     call swap_xy_3data(fp1, w, n1_loc, n2_loc, n3_loc)
    else
     call ftw1d_sc(w, n1, n2, n3_loc, is, 2, sym)
    end if
    !=====================
    if (n3 <= 2) return
    if (npe_zloc > 1) then
     n1_loc = n1/npe_zloc
     call swap_xz_3data_inv(w, fp2, n1_loc, n2_loc, n3_loc)
     call ftw1d_sc(fp2, n1_loc, n2_loc, n3, is, 3, sym)
     call swap_xz_3data(fp2, w, n1_loc, n2_loc, n3_loc)
    else
     call ftw1d_sc(w, n1, n2_loc, n3, is, 3, sym)
    end if
    !======== exit w(loc)
   case (1)
    ! enters w(loc)
    !========================
    if (n3 > 1) then
     if (npe_zloc > 1) then
      n1_loc = n1/npe_zloc
      call swap_xz_3data_inv(w, fp2, n1_loc, n2_loc, n3_loc)
      call ftw1d_sc(fp2, n1_loc, n2_loc, n3, is, 3, sym)
      call swap_xz_3data(fp2, w, n1_loc, n2_loc, n3_loc)
     else
      call ftw1d_sc(w, n1, n2_loc, n3, is, 3, sym)
     end if
    end if
    !=================
    if (n2 > 1) then
     if (prly) then
      n1_loc = n1/npe_yloc
      call swap_xy_3data_inv(w, fp1, n1_loc, n2_loc, n3_loc)
      call ftw1d_sc(fp1, n1_loc, n2, n3_loc, is, 2, sym)
      call swap_xy_3data(fp1, w, n1_loc, n2_loc, n3_loc)
     else
      call ftw1d_sc(w, n1, n2, n3_loc, is, 2, sym)
     end if
    end if
   end select
   !===================
   !exit w(loc)
  end subroutine
  !========================
  subroutine pftw3d_sc(w, n1, n2, n2_loc, n3, n3_loc, is, sym)
   real(dp), intent(inout) :: w(:, :, :)
   integer, intent(in) :: n1, n2, n2_loc, n3, n3_loc, is, sym

   select case (is)
   case (-1)
    call ftw1d_sc(w, n1, n2_loc, n3_loc, is, 1, sym)
    call pftw2d_sc(w, n1, n2, n2_loc, n3, n3_loc, is, sym)
   case (1)
    ! enters w(loc)
    call pftw2d_sc(w, n1, n2, n2_loc, n3, n3_loc, is, sym)
    !========================
    call ftw1d_sc(w, n1, n2_loc, n3_loc, is, 1, sym)
   end select
   !===================
   !exit w(loc)
  end subroutine
  !============================
  subroutine pftw2d(w, n1, n2, n2_loc, n3, n3_loc, is)
   real(dp), intent(inout) :: w(:, :, :)
   integer, intent(in) :: n1, n2, n2_loc, n3, n3_loc, is
   integer :: n1_loc, dir

   select case (is)
   case (-1)
    if (n2 <= 2) return
    dir = 2
    if (prly) then
     n1_loc = n1/npe_yloc
     call swap_xy_3data_inv(w, fp1, n1_loc, n2_loc, n3_loc)
     call ftw1d(fp1, n1_loc, n2, n3_loc, is, dir)
     call swap_xy_3data(fp1, w, n1_loc, n2_loc, n3_loc)
    else
     call ftw1d(w, n1, n2, n3_loc, is, dir)
    end if
    !=====================
    if (n3 <= 2) return
    dir = 3
    if (npe_zloc > 1) then
     n1_loc = n1/npe_zloc
     call swap_xz_3data_inv(w, fp2, n1_loc, n2_loc, n3_loc)
     call ftw1d(fp2, n1_loc, n2_loc, n3, is, dir)
     call swap_xz_3data(fp2, w, n1_loc, n2_loc, n3_loc)
    else
     call ftw1d(w, n1, n2_loc, n3, is, dir)
    end if
    !======== exit w(loc)
   case (1)
    ! enters w(loc)
    !========================
    if (n3 > 1) then
     dir = 3
     if (npe_zloc > 1) then
      n1_loc = n1/npe_zloc
      call swap_xz_3data_inv(w, fp2, n1_loc, n2_loc, n3_loc)
      call ftw1d(fp2, n1_loc, n2_loc, n3, is, dir)
      call swap_xz_3data(fp2, w, n1_loc, n2_loc, n3_loc)
     else
      call ftw1d(w, n1, n2_loc, n3, is, dir)
     end if
    end if
    !=================
    if (n2 > 1) then
     dir = 2
     if (prly) then
      n1_loc = n1/npe_yloc
      call swap_xy_3data_inv(w, fp1, n1_loc, n2_loc, n3_loc)
      call ftw1d(fp1, n1_loc, n2, n3_loc, is, dir)
      call swap_xy_3data(fp1, w, n1_loc, n2_loc, n3_loc)
     else
      call ftw1d(w, n1, n2, n3_loc, is, dir)
     end if
    end if
   end select
   !===================
   !exit w(loc)
  end subroutine
  !====================================
  subroutine pftw3d(w, n1, n2, n2_loc, n3, n3_loc, is)
   real(dp), intent(inout) :: w(:, :, :)
   integer, intent(in) :: n1, n2, n2_loc, n3, n3_loc, is

   select case (is)
   case (-1)

    call ftw1d(w, n1, n2_loc, n3_loc, is, 1)

    call pftw2d(w, n1, n2, n2_loc, n3, n3_loc, is)

   case (1)
    call pftw2d(w, n1, n2, n2_loc, n3, n3_loc, is)

    call ftw1d(w, n1, n2_loc, n3_loc, is, 1)
   end select
   !===================
   !exit w(loc)
  end subroutine
 end module