c subroutines called from the speed.f program.
c These are placed in a different file, to prevent inlining in the
c calling program (which blindly repeats the same calculation many times,
c which a smart compiler would do just once...).

c*********************************************************
        subroutine mult(c,a,b,nsize)
! c=a*b for each element of a,b, and c.
        real a(nsize),b(nsize),c(nsize)

        do i=1,nsize
!          c(i)=3.1415d00*a(i) + 55.0d00
           c(i)=a(i)*b(i)
        enddo

! unroll
!        do i=1,nsize/2,2
!!          c(i)=3.1415d00*a(i) + 55.0d00
!           c(i)=a(i)*b(i)
!           c(i+1)=a(i+1)*b(i+1)
!        enddo

!        do i=2,nsize
!           c(i)=c(i-1)*b(i)
!        enddo
!        do i=2,nsize
!           c(i)=5.0*b(i)
!        enddo
!        do i=1,nsize,2
!           c(i)=a(i)*b(i)
!           c(i+1)=a(i+1)*b(i+1)
!        enddo
        return
        end

c*********************************************************
        subroutine matmult(c,a,b,n)
! c=c+a*b matrix multiplication
        real a(n,n), b(n,n), c(n,n)

        do j=1,n
           do i=1,n
              do k=1,n
                 c(i,j) = c(i,j) + a(i,k)*b(k,j)
              enddo
           enddo
        enddo
        return
        end

!*********************************************************
        subroutine sweep(gnew,g,r,b,nx,nv)
! sweeping algorithm used for implicit convection.
        real gnew(nx,nv), g(nx,nv), r(nx,nv), b(nx,nv)

! backwards sweep:
!        do iv=1,nv
!           gnew(nx,iv) = 0.0
!           do ix=nx-1,1,-1
!              gnew(ix,iv) = gnew(ix+1,iv)*r(ix,iv) + b(ix,iv)
!           enddo
!        enddo

! forwards sweep:
        do iv=1,nv
           gnew(1,iv) = 1.0

! Lahey-Fujitsu compiler is 10-20% faster if the 2 in the next do loop
! command is replaced by 1 (though it then goes outside the gnew array and 
! would generate errors, unless ghost values for gnew were properly defined).
! Apparently it uses a slower machine coding if the initial value is 
! different than 1.  Hopefully other compilers are smarter. 
!
           do ix=2,nx
!           do ix=1,nx
              gnew(ix,iv) = gnew(ix-1,iv)*r(ix,iv) + b(ix,iv)
           enddo
        enddo

        return
        end
!*********************************************************
        subroutine sweepf(gnew,g,r,b,nx,nv)
        real gnew(nx,nv), g(nx,nv), r(nx,nv), b(nx,nv)

! slightly faster sweeping algorithm used for implicit convection.
! This eliminates recursion in the inner loop, but at the expense of 
! large stride 
        gnew(1,:)=gnew(nx,:)
        do ix=2,nx
           do iv=1,nv
              gnew(ix,iv) = gnew(ix-1,iv)*r(ix,iv) + b(ix,iv)
           enddo
        enddo

        return
        end
!*********************************************************
        subroutine sweepf2(gt,r,b,nx,nv)
        real gt(nv,nx), r(nv,nx), b(nv,nx)

! fastest sweeping algorithm used for implicit convection, about twice as
! fast as subroutine sweep above.
!
! By transposing the array, the inner loop is now over the first index 
! and so has stride one (which is fastest, as the next memory access is 
! likely to be in cache), and simultaneously eliminates recursion in the 
! inner loop.

!        gt(:,1)=gt(:,nx)
        gt(:,1)=0.0
        do ix=2,nx
           do iv=1,nv
              gt(iv,ix) = gt(iv,ix-1)*r(iv,ix) + b(iv,ix)
           enddo
        enddo

        return
        end
c*********************************************************
        subroutine div(c,a,b,nsize)
! c=a/b for each element of a,b, and c.
        real a(nsize),b(nsize),c(nsize)

        do i=1,nsize
           c(i)=a(i)/b(i)
        enddo
        return
        end
c*********************************************************
        subroutine addmult(c,a,b,d,nsize)
! c=a*b+d for each element of a,b, and c.
        real a(nsize),b(nsize),c(nsize),d(nsize)

        do i=1,nsize
           c(i)=(5.232*a(i)*b(i) + 3.14159*d(i) + 5.7)**2 + 8.3
c           c(i)=5.232*a(i) + 3.14159
c           c(i)=a(i)*b(i) + d(i)

c gwh:  Of the 3 variations above, the first is done most effeciently
c on (at least some) RISC machines, which are able to pipeline
c and parallelize parts of the calculation...
c

        enddo
        return
        end
c*********************************************************
        subroutine dotprod(c,a,b,nsize)
! evaluate dotproduct c = sum_i a(i)*b(i)
        real a(nsize),b(nsize),c

        c=0
        do i=1,nsize
           c=c+a(i)*b(i)
        enddo
! no faster, neither is hand unrolling the loop:
!        do i=1,nsize/2
!           c=c+a(i)*b(i)+a(i+nsize/2)*b(i+nsize/2)
!        enddo
        return
        end
c*********************************************************
        subroutine add(c,a,b,nsize)
! c=a+b for each element of a,b, and c.
        real a(nsize),b(nsize),c(nsize)

        do i=1,nsize
           c(i)=a(i)+b(i)
        enddo
        return
        end
c*********************************************************
        subroutine logme(c,a,nsize)
        real a(nsize),c(nsize)

        do i=1,nsize
           c(i)=log(a(i))
        enddo
        return
        end
c*********************************************************
        subroutine lpsqrt(c,a,nsize)
        real a(nsize),c(nsize)

        do i=1,nsize
           c(i)=sqrt(a(i))
        enddo
        return
        end
c*********************************************************
        subroutine move(c,a,nsize)
! c=a for each element of a and c.
        real a(nsize),c(nsize)

        do i=1,nsize
           c(i)=a(i)
        enddo
        return
        end