For a summary of the following, see: http://w3.pppl.gov/~hammett/comp/bench03 --------------------------------------------------------------------- lf95 --dbl -O --tpp --wide --nap --nchk --npca --nsav --ntrace --prefetch 2 \ --staticlink --info -o speed speed.f speedsub.o daxpy_g.o \ /usr/local/lff95/lib/libblas.a On petrel002, 1.7 GHz Pentium 4 dual proc 256kB Cache: /proc/cpuinfo: cpu family=15, Intel Xeon CPU 1700 MHz 400 MHz Bus Pentium 4 w/ 400 MHz system bus gives 3.2 GB/s data rate (64-bit bus). Intel releasing 800/533 MHz FSB in Q2-2003. Vector length = 100 did 80 million * ops in 0.2559 cpu secs = 312.6722 MFLOPS did 79 million sweep ops in 0.4238 cpu secs = 186.3964 MFLOPS did 79 million fast sweep ops in 0.1973 cpu secs = 400.4745 MFLOPS did 40 million / ops in 0.8984 cpu secs = 44.5218 MFLOPS did 280 million multiple *+ ops in 0.3125 cpu secs = 896.0000 MFLOPS did 80 million Dot *+ ops in 0.1367 cpu secs = 585.1418 MFLOPS did 40 million logs ops in 2.4844 cpu secs = 16.1006 MFLOPS did 40 million sqrts ops in 0.9062 cpu secs = 44.1379 MFLOPS did 320 million daxpy *+ ops in 0.4668 cpu secs = 685.5228 MFLOPS did 320 million daxpy0 *+ ops in 0.4805 cpu secs = 666.0173 MFLOPS did 320 million daxpy4 *+ ops in 0.4062 cpu secs = 787.6923 MFLOPS did 51 million MatMult ops in 0.2754 cpu secs = 185.1919 MFLOPS Vector length = 10000000 did 80 million * ops in 1.2305 cpu secs = 65.0159 MFLOPS did 79 million sweep ops in 0.4297 cpu secs = 183.8543 MFLOPS did 79 million fast sweep ops in 0.1973 cpu secs = 400.4765 MFLOPS did 40 million / ops in 0.9121 cpu secs = 43.8544 MFLOPS did 280 million multiple *+ ops in 0.7949 cpu secs = 352.2358 MFLOPS did 80 million Dot *+ ops in 0.2637 cpu secs = 303.4084 MFLOPS did 40 million logs ops in 2.5078 cpu secs = 15.9502 MFLOPS did 40 million sqrts ops in 0.9004 cpu secs = 44.4251 MFLOPS did 320 million daxpy *+ ops in 1.8574 cpu secs = 172.2819 MFLOPS did 320 million daxpy0 *+ ops in 1.8438 cpu secs = 173.5593 MFLOPS did 320 
million daxpy4 *+ ops in 1.8535 cpu secs = 172.6449 MFLOPS did 79 million MatMult ops in 0.2441 cpu secs = 323.5835 MFLOPS On petrel025, 1.6 GHz AMD Athlon MP 1900+, dual proc 256 kB cache /proc/cpuinfo cpu family 6. Dual 266 MHz bus? AMD Athlon XP 2800+ was first to get 333 MHz FSB, Oct. 2002. AMD 3000+ still at 333 MHz FSB, Feb. 2003. Vector length = 100 did 80 million * ops in 0.1445 cpu secs = 553.5145 MFLOPS did 79 million sweep ops in 0.2383 cpu secs = 331.5413 MFLOPS did 79 million fast sweep ops in 0.1367 cpu secs = 577.8275 MFLOPS did 40 million / ops in 0.4492 cpu secs = 89.0434 MFLOPS did 280 million multiple *+ ops in 0.2148 cpu secs = 1303.2773 MFLOPS did 80 million Dot *+ ops in 0.1113 cpu secs = 718.5908 MFLOPS did 40 million logs ops in 3.1113 cpu secs = 12.8562 MFLOPS did 40 million sqrts ops in 0.6074 cpu secs = 65.8521 MFLOPS did 320 million daxpy *+ ops in 0.3281 cpu secs = 975.2381 MFLOPS did 320 million daxpy0 *+ ops in 0.3164 cpu secs = 1011.3588 MFLOPS did 320 million daxpy4 *+ ops in 0.2715 cpu secs = 1178.7067 MFLOPS did 51 million MatMult ops in 0.1797 cpu secs = 283.8253 MFLOPS Vector length = 10000000 did 80 million * ops in 2.6328 cpu secs = 30.3858 MFLOPS did 79 million sweep ops in 0.2363 cpu secs = 334.2812 MFLOPS did 79 million fast sweep ops in 0.1367 cpu secs = 577.8317 MFLOPS did 40 million / ops in 1.3262 cpu secs = 30.1620 MFLOPS did 280 million multiple *+ ops in 1.6133 cpu secs = 173.5592 MFLOPS did 80 million Dot *+ ops in 0.8008 cpu secs = 99.9025 MFLOPS did 40 million logs ops in 3.1367 cpu secs = 12.7522 MFLOPS did 40 million sqrts ops in 1.1289 cpu secs = 35.4325 MFLOPS did 320 million daxpy *+ ops in 4.0781 cpu secs = 78.4674 MFLOPS did 320 million daxpy0 *+ ops in 3.7520 cpu secs = 85.2889 MFLOPS did 320 million daxpy4 *+ ops in 3.8750 cpu secs = 82.5806 MFLOPS did 79 million MatMult ops in 0.4883 cpu secs = 161.7921 MFLOPS ----------------------------------------------------------------------------- ifc -r8 -O3 -tpp7 
-axiMKW -pad -unroll -o speed speed.f speedsub.o \ daxpy_g.o /usr/local/lff95/lib/libblas.a On petrel002, 1.7 GHz Pentium 4 dual proc 256kB Cache: /proc/cpuinfo: cpu family=15, Intel Xeon CPU 1700 MHz 400 MHz Bus Pentium 4 w/ 400 MHz system bus gives 3.2 GB/s data rate (64-bit bus). Intel releasing 800/533 MHz FSB in Q2-2003. Vector length = 100 did 40 million * ops in 0.0500 cpu secs = 800.0000 MFLOPS did 79 million sweep ops in 0.3000 cpu secs = 263.3333 MFLOPS did 79 million fast sweep ops in 0.1700 cpu secs = 464.7059 MFLOPS did 40 million / ops in 0.8200 cpu secs = 48.7805 MFLOPS did 280 million multiple *+ ops in 0.1700 cpu secs = 1647.0588 MFLOPS did 80 million Dot *+ ops in 0.0500 cpu secs = 1600.0000 MFLOPS did 40 million logs ops in 1.0600 cpu secs = 37.7358 MFLOPS did 40 million sqrts ops in 0.8200 cpu secs = 48.7805 MFLOPS did 320 million daxpy *+ ops in 0.4100 cpu secs = 780.4878 MFLOPS did 320 million daxpy0 *+ ops in 0.2200 cpu secs = 1454.5455 MFLOPS Vector length = 10000000 did 40 million * ops in 0.6100 cpu secs = 65.5738 MFLOPS did 79 million sweep ops in 0.2900 cpu secs = 272.4138 MFLOPS did 79 million fast sweep ops in 0.1800 cpu secs = 438.8889 MFLOPS did 40 million / ops in 0.8700 cpu secs = 45.9770 MFLOPS did 280 million multiple *+ ops in 0.7900 cpu secs = 354.4304 MFLOPS did 80 million Dot *+ ops in 0.2600 cpu secs = 307.6923 MFLOPS did 40 million logs ops in 1.1000 cpu secs = 36.3636 MFLOPS did 40 million sqrts ops in 0.8700 cpu secs = 45.9770 MFLOPS did 320 million daxpy *+ ops in 1.8500 cpu secs = 172.9730 MFLOPS did 320 million daxpy0 *+ ops in 1.8100 cpu secs = 176.7956 MFLOPS On petrel025, 1.6 GHz AMD Athlon MP 1900+, dual proc 256 kB cache /proc/cpuinfo cpu family 6. Dual 266 MHz bus? AMD Athlon XP 2800+ was first to get 333 MHz FSB, Oct. 2002. AMD 3000+ still at 333 MHz FSB, Feb. 2003. 
Vector length = 100 did 40 million * ops in 0.0800 cpu secs = 500.0000 MFLOPS did 79 million sweep ops in 0.2200 cpu secs = 359.0909 MFLOPS did 79 million fast sweep ops in 0.1500 cpu secs = 526.6667 MFLOPS did 40 million / ops in 0.5600 cpu secs = 71.4286 MFLOPS did 280 million multiple *+ ops in 0.2000 cpu secs = 1400.0000 MFLOPS did 80 million Dot *+ ops in 0.1100 cpu secs = 727.2727 MFLOPS did 40 million logs ops in 2.7200 cpu secs = 14.7059 MFLOPS did 40 million sqrts ops in 0.8200 cpu secs = 48.7805 MFLOPS did 320 million daxpy *+ ops in 0.3200 cpu secs = 1000.0000 MFLOPS did 320 million daxpy0 *+ ops in 0.3800 cpu secs = 842.1053 MFLOPS Vector length = 10000000 did 40 million * ops in 1.2900 cpu secs = 31.0078 MFLOPS did 79 million sweep ops in 0.2200 cpu secs = 359.0909 MFLOPS did 79 million fast sweep ops in 0.1400 cpu secs = 564.2857 MFLOPS did 40 million / ops in 1.4300 cpu secs = 27.9720 MFLOPS did 280 million multiple *+ ops in 1.7400 cpu secs = 160.9195 MFLOPS did 80 million Dot *+ ops in 0.8000 cpu secs = 100.0000 MFLOPS did 40 million logs ops in 2.7800 cpu secs = 14.3885 MFLOPS did 40 million sqrts ops in 0.8900 cpu secs = 44.9438 MFLOPS did 320 million daxpy *+ ops in 4.7400 cpu secs = 67.5105 MFLOPS did 320 million daxpy0 *+ ops in 4.5600 cpu secs = 70.1754 MFLOPS ----------------------------------------------------------------------------- ifc -r8 -O2 -tpp7 -axiMKW -pad -unroll -o speed speed.f speedsub.o \ daxpy_g.o /usr/local/lff95/lib/libblas.a Using -O2 instead of -O3 actually speeds up daxpy0 with vector length 100, though it slows down the sweep. On petrel002, 1.7 GHz Pentium 4 dual proc 256kB Cache: /proc/cpuinfo: cpu family=15, Intel Xeon CPU 1700 MHz 400 MHz Bus Pentium 4 w/ 400 MHz system bus gives 3.2 GB/s data rate (64-bit bus). Intel releasing 800/533 MHz FSB in Q2-2003. 
Vector length = 100 did 80 million * ops in 0.1100 cpu secs = 727.2727 MFLOPS did 79 million sweep ops in 0.6100 cpu secs = 129.5082 MFLOPS did 79 million fast sweep ops in 0.1600 cpu secs = 493.7500 MFLOPS did 40 million / ops in 0.8300 cpu secs = 48.1928 MFLOPS did 280 million multiple *+ ops in 0.1700 cpu secs = 1647.0588 MFLOPS did 80 million Dot *+ ops in 0.0500 cpu secs = 1600.0000 MFLOPS did 40 million logs ops in 1.0600 cpu secs = 37.7358 MFLOPS did 40 million sqrts ops in 0.8300 cpu secs = 48.1928 MFLOPS did 320 million daxpy *+ ops in 0.4100 cpu secs = 780.4878 MFLOPS did 320 million daxpy0 *+ ops in 0.2100 cpu secs = 1523.8095 MFLOPS did 320 million daxpy4 *+ ops in 0.4100 cpu secs = 780.4878 MFLOPS did 51 million MatMult ops in 0.1600 cpu secs = 318.7500 MFLOPS Vector length = 10000000 did 80 million * ops in 1.2200 cpu secs = 65.5738 MFLOPS did 79 million sweep ops in 0.6100 cpu secs = 129.5082 MFLOPS did 79 million fast sweep ops in 0.1600 cpu secs = 493.7500 MFLOPS did 40 million / ops in 0.8800 cpu secs = 45.4545 MFLOPS did 280 million multiple *+ ops in 0.7900 cpu secs = 354.4304 MFLOPS did 80 million Dot *+ ops in 0.2500 cpu secs = 320.0000 MFLOPS did 40 million logs ops in 1.1000 cpu secs = 36.3636 MFLOPS did 40 million sqrts ops in 0.8700 cpu secs = 45.9770 MFLOPS did 320 million daxpy *+ ops in 1.8700 cpu secs = 171.1230 MFLOPS did 320 million daxpy0 *+ ops in 1.8200 cpu secs = 175.8242 MFLOPS did 320 million daxpy4 *+ ops in 1.8600 cpu secs = 172.0430 MFLOPS did 79 million MatMult ops in 0.5400 cpu secs = 146.2963 MFLOPS On petrel025, 1.6 GHz AMD Athlon MP 1900+, dual proc 256 kB cache /proc/cpuinfo cpu family 6. Dual 266 MHz bus? AMD Athlon XP 2800+ was first to get 333 MHz FSB, Oct. 2002. AMD 3000+ still at 333 MHz FSB, Feb. 2003. 
Vector length = 100 did 80 million * ops in 0.1200 cpu secs = 666.6667 MFLOPS did 79 million sweep ops in 0.4100 cpu secs = 192.6829 MFLOPS did 79 million fast sweep ops in 0.3800 cpu secs = 207.8947 MFLOPS did 40 million / ops in 0.5300 cpu secs = 75.4717 MFLOPS did 280 million multiple *+ ops in 0.2000 cpu secs = 1400.0000 MFLOPS did 80 million Dot *+ ops in 0.1100 cpu secs = 727.2727 MFLOPS did 40 million logs ops in 2.6000 cpu secs = 15.3846 MFLOPS did 40 million sqrts ops in 0.8200 cpu secs = 48.7805 MFLOPS did 320 million daxpy *+ ops in 0.3200 cpu secs = 1000.0000 MFLOPS did 320 million daxpy0 *+ ops in 0.3100 cpu secs = 1032.2581 MFLOPS did 320 million daxpy4 *+ ops in 0.3000 cpu secs = 1066.6667 MFLOPS did 51 million MatMult ops in 0.1300 cpu secs = 392.3077 MFLOPS Vector length = 10000000 did 80 million * ops in 2.5800 cpu secs = 31.0078 MFLOPS did 79 million sweep ops in 0.4200 cpu secs = 188.0952 MFLOPS did 79 million fast sweep ops in 0.3700 cpu secs = 213.5135 MFLOPS did 40 million / ops in 1.2900 cpu secs = 31.0078 MFLOPS did 280 million multiple *+ ops in 1.7500 cpu secs = 160.0000 MFLOPS did 80 million Dot *+ ops in 0.8200 cpu secs = 97.5610 MFLOPS did 40 million logs ops in 2.6700 cpu secs = 14.9813 MFLOPS did 40 million sqrts ops in 0.9200 cpu secs = 43.4783 MFLOPS did 320 million daxpy *+ ops in 4.7400 cpu secs = 67.5105 MFLOPS did 320 million daxpy0 *+ ops in 4.5600 cpu secs = 70.1754 MFLOPS did 320 million daxpy4 *+ ops in 4.6200 cpu secs = 69.2641 MFLOPS did 79 million MatMult ops in 0.5000 cpu secs = 158.0000 MFLOPS