c subroutines called from the speed.f program. c These are placed in a different file, to prevent inlining in the c calling program (which blindly repeats the same calculation many times, c which a smart compiler would do just once...). c********************************************************* subroutine mult(c,a,b,nsize) ! c=a*b for each element of a,b, and c. real a(nsize),b(nsize),c(nsize) do i=1,nsize ! c(i)=3.1415d00*a(i) + 55.0d00 c(i)=a(i)*b(i) enddo ! unroll ! do i=1,nsize/2,2 !! c(i)=3.1415d00*a(i) + 55.0d00 ! c(i)=a(i)*b(i) ! c(i+1)=a(i+1)*b(i+1) ! enddo ! do i=2,nsize ! c(i)=c(i-1)*b(i) ! enddo ! do i=2,nsize ! c(i)=5.0*b(i) ! enddo ! do i=1,nsize,2 ! c(i)=a(i)*b(i) ! c(i+1)=a(i+1)*b(i+1) ! enddo return end c********************************************************* subroutine matmult(c,a,b,n) ! c=c+a*b matrix multiplication real a(n,n), b(n,n), c(n,n) do j=1,n do i=1,n do k=1,n c(i,j) = c(i,j) + a(i,k)*b(k,j) enddo enddo enddo return end !********************************************************* subroutine sweep(gnew,g,r,b,nx,nv) ! sweeping algorithm used for implicit convection. real gnew(nx,nv), g(nx,nv), r(nx,nv), b(nx,nv) ! backwards sweep: ! do iv=1,nv ! gnew(nx,iv) = 0.0 ! do ix=nx-1,1,-1 ! gnew(ix,iv) = gnew(ix+1,iv)*r(ix,iv) + b(ix,iv) ! enddo ! enddo ! forwards sweep: do iv=1,nv gnew(1,iv) = 1.0 ! Lahey-Fujitsu compiler is 10-20% faster if the 2 in the next do loop ! command is replaced by 1 (though it then goes outside the gnew array and ! would generate errors, unless ghost values for gnew were properly defined). ! Apparently it uses a slower machine coding if the initial value is ! different than 1. Hopefully other compilers are smarter. ! do ix=2,nx ! do ix=1,nx gnew(ix,iv) = gnew(ix-1,iv)*r(ix,iv) + b(ix,iv) enddo enddo return end !********************************************************* subroutine sweepf(gnew,g,r,b,nx,nv) real gnew(nx,nv), g(nx,nv), r(nx,nv), b(nx,nv) ! slightly faster sweeping algorithm used for implicit convection. ! This eliminates recursion in the inner loop, but at the expense of ! large stride gnew(1,:)=gnew(nx,:) do ix=2,nx do iv=1,nv gnew(ix,iv) = gnew(ix-1,iv)*r(ix,iv) + b(ix,iv) enddo enddo return end !********************************************************* subroutine sweepf2(gt,r,b,nx,nv) real gt(nv,nx), r(nv,nx), b(nv,nx) ! fastest sweeping algorithm used for implicit convection, about twice as ! fast as subroutine sweep above. ! ! By transposing the array, the inner loop is now over the first index ! and so has stride one (which is fastest, as the next memory access is ! likely to be in cache), and simultaneously eliminates recursion in the ! inner loop. ! gt(:,1)=gt(:,nx) gt(:,1)=0.0 do ix=2,nx do iv=1,nv gt(iv,ix) = gt(iv,ix-1)*r(iv,ix) + b(iv,ix) enddo enddo return end c********************************************************* subroutine div(c,a,b,nsize) ! c=a/b for each element of a,b, and c. real a(nsize),b(nsize),c(nsize) do i=1,nsize c(i)=a(i)/b(i) enddo return end c********************************************************* subroutine addmult(c,a,b,d,nsize) ! c=a*b+d for each element of a,b, and c. real a(nsize),b(nsize),c(nsize),d(nsize) do i=1,nsize c(i)=(5.232*a(i)*b(i) + 3.14159*d(i) + 5.7)**2 + 8.3 c c(i)=5.232*a(i) + 3.14159 c c(i)=a(i)*b(i) + d(i) c gwh: Of the 3 variations above, the first is done most effeciently c on (at least some) RISC machines, which are able to pipeline c and parallelize parts of the calculation... c enddo return end c********************************************************* subroutine dotprod(c,a,b,nsize) ! evaluate dotproduct c = sum_i a(i)*b(i) real a(nsize),b(nsize),c c=0 do i=1,nsize c=c+a(i)*b(i) enddo ! no faster, neither is hand unrolling the loop: ! do i=1,nsize/2 ! c=c+a(i)*b(i)+a(i+nsize/2)*b(i+nsize/2) ! enddo return end c********************************************************* subroutine add(c,a,b,nsize) ! c=a+b for each element of a,b, and c. real a(nsize),b(nsize),c(nsize) do i=1,nsize c(i)=a(i)+b(i) enddo return end c********************************************************* subroutine logme(c,a,nsize) real a(nsize),c(nsize) do i=1,nsize c(i)=log(a(i)) enddo return end c********************************************************* subroutine lpsqrt(c,a,nsize) real a(nsize),c(nsize) do i=1,nsize c(i)=sqrt(a(i)) enddo return end c********************************************************* subroutine move(c,a,nsize) ! c=a for each element of a and c. real a(nsize),c(nsize) do i=1,nsize c(i)=a(i) enddo return end