      subroutine xmcfft(isign, n, m, scale, x, inc1x, inc2x, y,
     >     inc1y, inc2y, table, ntable, work, nwork)
c
c multitasking complex-to-complex FFT's.
c A more efficient replacement for CRAY's MCFFT, by
c Greg Hammett, hammett@princeton.edu, 6-Oct-1993
c Princeton University Plasma Physics Lab
c (with some help from Gary Kerbel, NERSC, gdk@kerbel.nersc.gov)
c
c This is plug-compatible with mcfft, so type "man mcfft" for help.
c
c define arguments:
      implicit none
      integer isign,n,m,inc1x,inc2x,inc1y,inc2y,ntable,nwork
      real x(2,inc2x,m)
      real y(2,inc2y,m)
      real work(4*n*m),scale
      integer table(ntable)  !part of table is real, and part integer

c define local variables : (space before : to fool emacs...)
      integer maxcpus
      parameter (maxcpus=16)    ! max # of parallel CPUs
      integer j,k,ipar,ncpus,nucpus,getenv,status
      integer firstcall, mstart(maxcpus), mdo(maxcpus)
      external getenv
      character*20 string
      real veclen
      save ncpus, firstcall
      data ncpus /-1/
      data firstcall /1/

c check that the inputs are reasonable:
      if(ntable .lt. (100+2*n)) then
         write(6,*) "Error in xmcfft: ntable too small!"
         stop
      endif

c initialize trig factors if requested:
      if(isign .eq. 0) then
         call cftfax(n,table(11),table(31))
         table(4)=n
         firstcall=0
         return
      endif

c reinitialize trig factors if necessary:
      if(table(4) .ne. n .or. firstcall .eq. 1) then
         call cftfax(n,table(11),table(31))
         table(4)=n
         firstcall=0
      endif

      if(nwork .lt. 4*n*m) then
         write(6,*) "Error in xmcfft: nwork too small!"
         stop
      endif

c get the NCPUS environment variable, the maximum # of CPUs to use:
      if(ncpus .le. 0) then
         string=' '
         status=getenv('NCPUS',string)
         ncpus=1
         if(status .ne. 0) then
            read(string,'(i10)') ncpus
         endif
         if(ncpus .le. 0) ncpus=1
      endif
c
c figure out NUCPUS, the # of parallel CPUs to actually try to use:
c
c We are vectorizing and parallelizing in the same dimension (the
c second dimension of the arrays), which has length m.  So
c m = veclen*nucpus, where veclen is the vector length, which
c needs to be at least 16 for efficiency.
c
      nucpus=max(m/16,1)        ! want a vector length of at least 16
      nucpus=min(nucpus,ncpus)
      veclen=float(m)/float(nucpus)     ! average vector length

      do ipar=1,nucpus
         mstart(ipar)=1+nint((ipar-1)*veclen)
      enddo
      do ipar=1,nucpus-1
         mdo(ipar)=mstart(ipar+1)-mstart(ipar)
      enddo
      mdo(nucpus)=m+1-mstart(nucpus)    ! the last CPU gets the rest
c      write(6,*) 'mdo =',mdo

c copy to the output vector, scaling if necessary:
      if(scale .ne. 1.0) then
         do j=1,n
            do k=1,m
               y(1,1+(j-1)*inc1y,k) = scale*x(1,1+(j-1)*inc1x,k)
               y(2,1+(j-1)*inc1y,k) = scale*x(2,1+(j-1)*inc1x,k)
            enddo
         enddo
      else
c don't bother copying the array x to y if they share the
c same storage space:
         if(loc(y(1,1,1)) .ne. loc(x(1,1,1)) .or.
     &      inc1y .ne. inc1x .or. inc2y .ne. inc2x) then
            do j=1,n
               do k=1,m
                  y(1,1+(j-1)*inc1y,k) = x(1,1+(j-1)*inc1x,k)
                  y(2,1+(j-1)*inc1y,k) = x(2,1+(j-1)*inc1x,k)
               enddo
            enddo
         endif
      endif

c The cncall directive below tells the CRAY preprocessor fpp that all
c subroutine calls in the loop are parallel-safe (i.e., that there
c are no conflicts from writing to the same data).
c
cfpp$ permutation (mstart)    ! irrelevant?
cfpp$ cncall
      do ipar=1,nucpus
         call cfftmlt(y(1,1,mstart(ipar)),
     >        y(2,1,mstart(ipar)),
     >        work(1+4*n*(mstart(ipar)-1)),table(31),table(11),
     >        2*inc1y,2*inc2y,n,mdo(ipar),isign)
      enddo

      return
      end
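
c ----------------------------------------------------------------------
c A minimal driver sketch showing one way to call xmcfft, assuming
c CRAY's cftfax/cfftmlt library routines are available at link time.
c The sizes follow the checks above (ntable >= 100+2*n, nwork >= 4*n*m),
c and the call sequence (initialize with isign=0, then transform with
c isign=-1) mirrors the mcfft convention.  The in-place call (y is the
c same array as x) relies on the loc() test above to skip the copy.
c The program name and data choices here are illustrative only.
c
      program tstfft
      implicit none
      integer n, m, ntable, nwork
      parameter (n=64, m=32)            ! transform length, # of FFTs
      parameter (ntable=100+2*n, nwork=4*n*m)
      real x(2,n,m)                     ! m complex vectors of length n
      real work(nwork)
      integer table(ntable)
      integer j, k

c fill x with some data (here, a unit impulse in each vector):
      do k=1,m
         do j=1,n
            x(1,j,k)=0.0
            x(2,j,k)=0.0
         enddo
         x(1,1,k)=1.0
      enddo

c isign=0 initializes the trig-factor table; no transform is done.
c inc1x=1 and inc2x=n describe contiguous complex vectors in x:
      call xmcfft(0, n, m, 1.0, x, 1, n, x, 1, n,
     >     table, ntable, work, nwork)

c forward transform, in place (x passed as both input and output):
      call xmcfft(-1, n, m, 1.0, x, 1, n, x, 1, n,
     >     table, ntable, work, nwork)

      write(6,*) 'after FFT of impulse:', x(1,1,1), x(2,1,1)
      end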