#!/usr/local/bin/bash

#######################################################################
# SHELL PORTABILITY
#######################################################################
# Should run with Bash 3.0 or 2.05b, and likely other bash versions as well.
#
# ksh doesn't grok bash arrays, so "#!/usr/bin/env ksh" is a no-go
#
# I'm guessing most to all Bourne shells will barf on this, since even
# ksh doesn't like it
#######################################################################
# KNOWN BUGS
#######################################################################
# 1) If a write times out, then the corresponding read will be skipped -
#    but it may prove that the corresponding read would've been a good
#    number
# 2) No effort is made at all to ensure that the mount options requested
#    are the ones that actually end up in effect.  If something strange
#    happens on the mount, then something strange will happen on the
#    performance test
# 3) A lot of terminal emulators, including mrxvt and gnome-terminal,
#    appear to have problems with programs like reblock that output many
#    many lines.  The problem may or may not be due to the use of
#    carriage returns again and again.  konsole -might- not have the
#    problem, but it's just not as convenient as mrxvt IMO
#######################################################################
# TUNABLEs
#######################################################################
# (some of these should become command line options someday)
#
# "$remote" should be host:path on the NFS server
#
# "$transaction_maxsecs" should be the number of seconds to wait for a
# mount, or a data transfer or a umount
#
# "$RESULTDIR" should be the directory to put results of the tests in
#
# "$nummeg" should be the number of megabytes to read or write for
# performance testing.  You need to be able to read or write roughly
# "$nummeg" megabytes of data in "$transaction_maxsecs" in the good
# cases.
# If you cannot write this much that fast, then the write and
# read will be ignored.  If you cannot read this much that fast, the
# read will be ignored.
#
# "$net_interface" should be the network interface that carries the NFS
# traffic, as seen in ifconfig -a (EG "en2" on AIX 5.1).  It is handed to
# tcpdump -i by verify_network_quiesence.
#
# The "verify_mount" function can verify that your filesystem is mounted
# with the correct rsize and wsize, at least on AIX 5.1 (NFS client).
# You may want to adapt it to your *ix variant, or you can just change
# the case statement to always return true.
#
# The "verify_network_quiesence" function can tell when your NIC has
# gone (mostly) silent.  You may want to adapt it to run on your system
# (mostly just a matter of changing "$net_interface"), or as with
# verify_mount, just change the case statement to always return true.

#set -x
set -u

# with these options:
#    bg,hard,intr,rsize=8192,wsize=8192
# we got 50Mbps over NFS to ext3 from esmf04d to esmft2
# I suspect UDP may be faster; we're using tcp here.  Not sure why AIX
# isn't reporting that.

#######################################################################
# Print the usage message on stderr and exit.
# Arguments: $1 - exit status to terminate the script with
#######################################################################
function usage {
    {
        echo "Usage: $0 [-g] [-s]"
        echo '-h says to give usage help'
        echo '-g says to generate results'
        echo '-s says to summarize the results, as they are generated'
        # echo '-r says to rank the results'
        exit "$1"
    } 1>&2
}

generate=0
summarize=0
# -r (rank) is not implemented yet; the variable only takes part in the
# "exactly one mode" check below
rank=0

while [ "$#" -gt 0 ]
do
    case "$1" in
        -g)
            generate=1
            shift
            ;;
        -s)
            summarize=1
            shift
            ;;
        # -r)
        #     rank=1
        #     shift
        #     ;;
        -h)
            usage 0
            ;;
        *)
            usage 1
            ;;
    esac
done

if [ "$((generate + summarize + rank))" != 1 ]
then
    echo "$0: Must specify exactly one of -g or -s" 1>&2
    usage 1
fi

# the filesystem to mount, test, umount again and again:
remote=esmft1d:/qfs1/thestuff
#remote=esmft1d:/nfs_test
#remote=seki.nac.uci.edu:/sdb1/foo

# the interface tcpdump should watch when checking for a quiet network
net_interface=en2

transaction_maxsecs=$((60 * 15))

export PATH=/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin:$HOME/bin:/usr/local/bin

case "$(uname -n)" in
    @esmf04m)
        # pick up some 64 bit executables
        export PATH=/u/strombrg/src/fileutils/fileutils-4.1/src:$PATH
        type -all ls
        type -all rm
        ;;
esac

#######################################################################
# Verify that every named program can be found, exiting with status 1
# (after a message on stderr) as soon as one is missing.
# Arguments: the program names to look for
#######################################################################
function require_programs {
    local prog
    for prog in "$@"
    do
        # note that we're using type, because which didn't give a useful
        # exit status
        if ! type "$prog" > /dev/null 2>&1
        then
            echo Sorry, you will need "$prog" on your '$PATH' for $0 to work 1>&2
            echo properly.... 1>&2
            exit 1
        fi
    done
    echo Good, you appear to have all the required programs on your '$PATH'. 1>&2
}

#######################################################################
# Pick the first /mnt2/qfs+nfs+test<N> that does not exist yet, create
# it, and print its name on stdout (progress goes to stderr).  Meant to
# be called as "$(get_mount_point)".
# Returns: 0 on success, 1 if the directory could not be created
#######################################################################
function get_mount_point {
    local dirno candidate
    #candidate=/mnt2/double-up/qfs+nfs+test1
    dirno=1
    while :
    do
        candidate="/mnt2/qfs+nfs+test${dirno}"
        if [ -e "$candidate" ]
        then
            dirno=$((dirno + 1))
        else
            break
        fi
    done
    echo Using NTTMPDIR of "$candidate" 1>&2
    if ! mkdir -p "$candidate"
    then
        echo Error creating directory "$candidate" 1>&2
        return 1
    fi
    echo "$candidate"
}

# ${TMPDIR:-/tmp} rather than a plain "$TMPDIR" test, because set -u is in
# effect and TMPDIR is frequently unset
TMPDIR="${TMPDIR:-/tmp}"

export RESULTDIR="$TMPDIR/NFS-TEST-RESULTS"
if ! mkdir -p "$RESULTDIR"
then
    echo Sorry, mkdir failed 1>&2
    exit 1
fi

#######################################################################
# this is my first attempt at a function that will rank (6,6) higher
# than (10,2) or (2,10)
# Arguments: $1 - read time, $2 - write time (missing/empty counts as 0)
# Outputs:   avg(readtime,writetime) + abs(readtime-writetime), via bc
#######################################################################
function ranking1 {
    local readtime writetime sum average absdiff
    readtime="${1:-0}"
    writetime="${2:-0}"

    sum="$(echo "scale=2; $readtime + $writetime" | bc)"
    average="$(echo "scale=3; $sum / 2" | bc)"
    # bc has no abs(), so strip a leading minus sign instead
    absdiff="$(echo "scale=2; $readtime - $writetime" | bc | sed 's/^-//')"
    echo "scale=2; $average + $absdiff" | bc
}

#######################################################################
# Decide whether a string is something bc can be fed as a number.
# Arguments: $1 - the string to check
# Returns:   0 if $1 consists only of digits with at most one ".", and
#            has at least one digit; 1 otherwise (including empty)
#######################################################################
function numeric {
    case "${1:-}" in
        '' | *[!0-9.]* | *.*.*)
            return 1
            ;;
        *[0-9]*)
            return 0
            ;;
    esac
    return 1
}

if [ "$summarize" != 0 ]
then
    if ! cd "$RESULTDIR"
    then
        echo Sorry, failed to cd to "$RESULTDIR" 1>&2
        exit 1
    fi

    require_programs grep python clear mtee wc cut avg modtime highest egrep awk sleep bc sed sort uniq

    while :
    do
        clear
        pwd
        echo
        for i in Writing Reading
        do
            echo "======> $i in isolation (read protocol!=write protocol, read version!=write version, rsize!=wsize)"
            # /dev/null forces egrep to prefix each line with its filename,
            # which the "cut -f 4" and "highest -f 2" below rely on.  Every
            # "Sorry, ..." line written by the generator is filtered out so
            # that only real measurements are averaged.
            egrep . *"$i"* /dev/null | \
                egrep -vi "timed out|failed|did not work" | \
                mtee \
                    'echo Number of measurements: $(wc -l)' \
                    'echo Average number of seconds: $(cut -d " " -f 4 | avg -i)' \
                    'echo Average time: $(cut -d " " -f 4 | avg -i | modtime -i)' \
                    'sleep 1; echo Best time: $(cut -d " " -f 4 | highest -s $(expr 1024 \* 1024) -r -n 1 | modtime)' \
                    'sleep 2; echo Best numbers:; highest -s $(expr 1024 \* 1024) -r -f 2 -n 5'
            echo
        done

        # get all size's, versions and protocols actually measured
        # xfer-result-Writing-16384-3-udp
        # xfer-result-Reading-16384-3-tcp
        result_files="$(ls -f | egrep 'xfer-result-Reading|xfer-result-Writing')"
        found_sizes="$(echo "$result_files" | sed 's/xfer-result-[A-Za-z]*-\([0-9]*\)-.*$/\1/' | sort -n | uniq)"
        found_versions="$(echo "$result_files" | sed 's/xfer-result-[A-Za-z]*-[0-9]*-\([0-9]*\)-.*$/\1/' | sort -n | uniq)"
        found_protocols="$(echo "$result_files" | sed 's/xfer-result-[A-Za-z]*-[0-9]*-[0-9]*-\([a-zA-Z]*\).*$/\1/' | sort | uniq)"
        # echo found_sizes are $found_sizes
        # echo found_versions are $found_versions
        # echo found_protocols are $found_protocols

        # Initially, we'll try consistent versions and protocols, but not sizes - IE, we need to use the same version and protocol
        # when mounting, but we're going to try letting rsize != wsize.

        # echo 4 4 $(ranking1 4 4)
        # echo 5 5 $(ranking1 5 5)
        # echo 6 6 $(ranking1 6 6)
        # echo 10 2 $(ranking1 10 2)
        # echo 9 3 $(ranking1 9 3)
        # echo 3 9 $(ranking1 3 9)

        echo "======> Best composite of read and write (read protocol==write protocol, read version==write version, rsize!=wsize)"
        for p in $found_protocols
        do
            for v in $found_versions
            do
                # hmmm... so this is actually proportionate to the number of rsize's * the number of wsize's.  This could
                # really take a long time, if we check a lot of sizes
                for rs in $found_sizes
                do
                    for ws in $found_sizes
                    do
                        readfile="xfer-result-Reading-$rs-$v-$p"
                        writefile="xfer-result-Writing-$ws-$v-$p"
                        if [ -f "$readfile" ] && [ -r "$readfile" ] && [ -f "$writefile" ] && [ -r "$writefile" ]
                        then
                            readtime=$(awk ' { print $3 } ' < "$readfile")
                            writetime=$(awk ' { print $3 } ' < "$writefile")
                            if numeric "$readtime" && numeric "$writetime"
                            then
                                echo "$p $v rsize: $rs readtime: $readtime wsize: $ws writetime: $writetime composite: $(ranking1 "$readtime" "$writetime")"
                            fi
                        fi
                    done
                done | \
                    highest -r -n 5 -f 11 -s 999999
                echo '/\/\/\'
            done
        done
        echo

        echo "======> Best composite of read and write (read protocol==write protocol, read version==write version, rsize==wsize)"
        for p in $found_protocols
        do
            for v in $found_versions
            do
                for s in $found_sizes
                do
                    readfile="xfer-result-Reading-$s-$v-$p"
                    writefile="xfer-result-Writing-$s-$v-$p"
                    if [ -f "$readfile" ] && [ -r "$readfile" ] && [ -f "$writefile" ] && [ -r "$writefile" ]
                    then
                        readtime=$(awk ' { print $3 } ' < "$readfile")
                        writetime=$(awk ' { print $3 } ' < "$writefile")
                        if numeric "$readtime" && numeric "$writetime"
                        then
                            echo "$p $v $s both sizes: $s readtime: $readtime writetime: $writetime composite: $(ranking1 "$readtime" "$writetime")"
                        fi
                    fi
                done
            done
        done | \
            highest -r -n 5 -f 11 -s 999999
        echo

        sleep 30
    done

    # we never actually reach this since the above is an infinite loop...
    exit 0
fi

# if we don't do the summarization endless loop, then we fall through to this generation code

require_programs rm mkdir dd reblock maxtime mount umount egrep touch python bc seq sed

# Only the generator needs a mount point, so it is created here rather
# than before the summarize branch (which would leave a stray directory
# behind on every -s run).
if ! NTTMPDIR="$(get_mount_point)"
then
    exit 1
fi

# If you set $nummeg to a number over 2047, then NFSv2 likely won't work
# due to being limited to 2 gigabyte files!  It appears that NFSv2 will
# allow you to create a file > 2 gigabytes, but then you cannot rm it, >
# it, nor fopen(,"w") it.
#export nummeg=2047 # IE, 2 gigabytes, likely the largest NFSv2 safe value
#export nummeg=1536 # IE, 1.5 gigabytes
#export nummeg=$((1024*64)) # IE, 64 gigabytes - should be enough to invalidate the buffer cache...
#export nummeg=$((1024*4)) # IE, 4 gigabytes
export nummeg=$((1024 * 16)) # IE, 16 gigabytes - should be enough to invalidate the buffer cache...
#export nummeg=768 # with this setting, performance numbers were poorly correlated with NFS transfer size
#export nummeg=256
#export nummeg=16

#export datasource=/dev/urandom
export datasource=/dev/zero

# pertains to $both - number of array elements per "record", since bash
# doesn't do 2D arrays
export reclen=4

export testfn

#######################################################################
# (Re)derive everything that depends on the current mount point:
# $testfn, and the $both table of
#    (label, result-file label, source, destination)
# records.  This'll be iterated over twice.  Cannot export bash arrays!
# Globals: NTTMPDIR, datasource (read); testfn, both (written)
#######################################################################
function set_transfer_table {
    testfn="$NTTMPDIR/testfile"
    both=(Writing 'Write time' "$datasource" "$testfn" Reading 'Read time' "$testfn" /dev/null)
}

#######################################################################
# Abandon the current mount point for a freshly created one.  The
# transfer table is rebuilt too, so later transfers do not keep writing
# underneath the abandoned directory.  Exits if no directory can be made.
# Globals: NTTMPDIR, testfn, both (written)
#######################################################################
function new_mount_point {
    if ! NTTMPDIR="$(get_mount_point)"
    then
        exit 1
    fi
    set_transfer_table
}

set_transfer_table

#######################################################################
# Unmount $NTTMPDIR, giving up after $transaction_maxsecs.
# Returns: 0 on success, 1 if umount failed, 254 on a timeout or any
#          other unexpected status
#######################################################################
function umt {
    local retval
    # the cd helps ensure the filesystem won't be "busy".  Do -not- use
    # umount -f - it may fail to flush your buffers correctly for the
    # purposes of this test
    cd /
    set -x
    maxtime "$transaction_maxsecs" umount "$NTTMPDIR"
    retval="$?"
    set +x
    case "$retval" in
        0)
            return 0
            ;;
        1)
            echo umount of $remote failed 1>&2
            return 1
            ;;
        254)
            echo umount of $remote timed out 1>&2
            return 254
            ;;
        *)
            echo umount of $remote returned a weird value 1>&2
            return 254
            ;;
    esac
}

#######################################################################
# Show the mount options nfsstat reports for $NTTMPDIR and check them
# against what was asked for.
# Arguments: $1 - expected rsize, $2 - expected wsize
#            (in practice/for now these are the same value, but it'll
#            be easier to make them independent later...)
# Returns:   0 if both sizes match (or checking is disabled), else 1
#######################################################################
function verify_mount {
    local rsize wsize mount_info real_rsize real_wsize retval
    rsize="$1"
    wsize="$2"

    # one snapshot, so that what is shown is also what is checked
    mount_info="$(nfsstat -m | /usr/local/bin/grep -A 10 "$NTTMPDIR" | sed '/^$/q')"

    # first just show the user what's up in terms of mount options
    echo "$mount_info"

    # it's not the end of the world if this function does nothing but
    # report, but it can help you catch errors earlier (if at all,
    # actually).  If your nfsstat doesn't behave this way (like on AIX 5.1),
    # don't stress about it, just use case 2
    case 1 in
        1)
            # the format we expect from nfsstat - but yours may be
            # different!:
            # /mnt2/qfs+nfs+test17 from /mnt2/qfs+nfs+test17:esmft1d
            #  Flags:   vers=3,proto=tcp,auth=unix,hard,intr,link,symlink,rsize=16384,wsize=16384,retrans=5
            #  All:     srtt=0 (0ms), dev=0 (0ms), cur=0 (0ms)
            real_rsize="$(echo "$mount_info" | sed -n 's/^.*rsize=\([0-9][0-9]*\).*$/\1/p')"
            real_wsize="$(echo "$mount_info" | sed -n 's/^.*wsize=\([0-9][0-9]*\).*$/\1/p')"
            retval=0
            if [ "$real_rsize" != "$rsize" ]
            then
                echo Bummer, rsize is "$real_rsize", but should be "$rsize" 1>&2
                retval=1
            fi
            if [ "$real_wsize" != "$wsize" ]
            then
                echo Bummer, wsize is "$real_wsize", but should be "$wsize" 1>&2
                retval=1
            fi
            return "$retval"
            ;;
        2)
            # just return true
            return 0
            ;;
    esac
}

#######################################################################
# Wait for the link to the NFS server to go (mostly) silent, so other
# traffic does not pollute the benchmark.
# Globals: remote, net_interface (read)
# Returns: 0 once fewer than 3 lines of tcpdump output are seen in a 30
#          second window; 1 if ten attempts never saw it that quiet
#######################################################################
function verify_network_quiesence {
    local server attempt packet_lines
    # the host part of host:path
    server="${remote%%:*}"
    case 1 in
        1)
            # actually check the network - but you'll need to specify the
            # right network interface in $net_interface, EG "en2" on
            # AIX 5.1, eth2 on Solaris, etcetera
            for attempt in 1 2 3 4 5 6 7 8 9 10
            do
                # you could also use tethereal, which should support
                # identical options, I believe
                packet_lines=$(maxtime 30 tcpdump -c 100 -i "$net_interface" host "$server" | wc -l)
                # (( )) rather than [ -lt ], since some wc's pad with blanks
                if ((packet_lines < 3))
                then
                    # network is quiet, return true
                    return 0
                fi
                sleep 30
            done
            # network did not quiet down, return false
            return 1
            ;;
        2)
            # just return true without verifying anything
            return 0
            ;;
    esac
}

#######################################################################
# Mount $remote on $NTTMPDIR using $vers, $proto and $size (used for
# both rsize and wsize), after waiting for a quiet network.
# Returns: 0 if mounted and verified, 1 if mounted but the options did
#          not verify, 254 on timeout or any other unexpected status.
#          Exits the script if the network never quiets down or if mount
#          itself reports failure (status 1).
#######################################################################
function mt {
    local retval
    #if ! maxtime $((60*30)) mount -o vers="$vers",proto="$proto",rsize="$size",wsize="$size" $remote "$NTTMPDIR"
    set -x
    if verify_network_quiesence
    then
        echo Good, network is quiet, proceeding 1>&2
    else
        echo Sorry, the network seems kind of busy, so it will not be that good for benchmarking 1>&2
        echo Exiting prematurely 1>&2
        exit 1
    fi
    maxtime "$transaction_maxsecs" mount -o vers="$vers",proto="$proto",rsize="$size",wsize="$size" "$remote" "$NTTMPDIR"
    retval="$?"
    set +x
    case "$retval" in
        0)
            if verify_mount "$size" "$size"
            then
                echo
                echo Mount verified OK 1>&2
                return 0
            else
                echo Mount failed to verify 1>&2
                return 1
            fi
            ;;
        1)
            echo mount of $remote failed 1>&2
            exit 1
            ;;
        254)
            echo mount of $remote timed out 1>&2
            return 254
            ;;
        *)
            echo mount of $remote returned a weird value 1>&2
            return 254
            ;;
    esac
}

#######################################################################
# Copy $nummeg megabytes from one file to another through reblock,
# giving up after $transaction_maxsecs.
# Arguments: $1 - source file, $2 - destination file
# Returns:   0 on success, 1 if the destination could not be listed or
#            $NTTMPDIR could not be entered, 254 on timeout or any other
#            unexpected status.  Exits the script if the transfer
#            itself reports status 1.
#######################################################################
function xfer {
    local srcfile dstfile retval
    local -a copystatus
    srcfile="$1"
    dstfile="$2"

    # unquoted on purpose: squeezes the ls output onto one tidy line
    echo From $(ls -l "$srcfile")
    if [ -f "$dstfile" ]
    then
        rm -f "$dstfile"
    fi
    # dang 32 bit bash I guess
    > "$dstfile"
    df "$dstfile"
    #truncate "$dstfile"
    printf 'To '
    if ! ls -l "$dstfile"
    then
        echo ls "$dstfile" failed under implausible circumstances 1>&2
        return 1
    fi
    # here we're using reblock in two different ways.  The first gives a
    # running tally of the data transferred as the user watches this
    # script being run.  The second just outputs summary information, for
    # the benefit of a subsequent report for loop (typed manually by the
    # enduser :)
    if cd "$NTTMPDIR"
    then
        if [ -f "$dstfile" ]
        then
            rm -f "$dstfile"
        fi
        > "$dstfile"
        #truncate "$dstfile"
        # if it takes more than $transaction_maxsecs seconds, give up -
        # that's too long to be worth pursuing.  We have lots of results
        # that are far shorter than that already
        set -x
        maxtime "$transaction_maxsecs" dd if="$srcfile" bs=1024k count="$nummeg" 2> /dev/null | \
            reblock -e $((1024 * nummeg)) $((1024 * 1024)) 300 > "$dstfile"
        # "$?" would give the exit status from reblock, not from maxtime!
        # Element 0 of PIPESTATUS is maxtime's status (dd is its child and
        # gets no element of its own).  It has to be copied into a real
        # array - note the parentheses - before anything else runs.
        copystatus=("${PIPESTATUS[@]}")
        echo "${copystatus[@]}"
        retval="${copystatus[0]}"
        set +x
        case "$retval" in
            0)
                return 0
                ;;
            1)
                echo transfer of $remote failed 1>&2
                exit 1
                ;;
            254)
                echo transfer of $remote timed out 1>&2
                return 254
                ;;
            *)
                echo transfer of $remote returned a weird value 1>&2
                return 254
                ;;
        esac
    else
        echo "cd $NTTMPDIR failed" 1>&2
        return 1
    fi
}

#######################################################################
# Print the current time as (fractional) seconds since the epoch.
# print(...) with a single argument behaves the same under python 2 and
# python 3.
#######################################################################
function gettime {
    python -c 'import time; print(time.time())'
}

# just in case - but we won't always need this really.  Intentionally
# ignore the return value
umt

# if we iterate by one, this'll take FOREVER.  So let's try 512 :)
# the process got wedged on 16384 udp 3 from esmf04 to esmft1!
# for size in $(seq 1024 256 65536)
# for size in $(seq 512 512 65536)
# for size in $(seq 4096 512 65536)
# the size 16384 with AIX 5.1 NFS client and Solaris 9 NFS server seems
# to have a high frequency of problems.  nfsmnthelp on the AIX side gets
# stuck on mount's or umount's

# in normal operation, this should not be commented out.  Only comment
# it if an nfs-test -g run exits prematurely or gets stuck, and you want to
# restart nfs-test, adding new results to the old results
# (the :? guard refuses to expand to "/*" should RESULTDIR ever be empty)
rm -f "${RESULTDIR:?}"/*

if ! touch "$RESULTDIR"/simple-touch
then
    echo "Simple touch test failed" 1>&2
    exit 1
fi

numelem=$((${#both[@]} / reclen))
#echo "numelem is $numelem"

for size in $(seq 4096 1024 65536)
do
    for proto in tcp udp
    do
        # vers==2 likely cannot do files over 2 gigabytes!
        for vers in 3
        do
            #resultfn="$RESULTDIR/xfer-result-$vers-$proto-$size"
            echo This remove can take a while... Please wait...
            rm -f "$testfn"
            echo Remove completed...
            # lucky 7 :)
            for blank in 1 2 3 4 5 6 7
            do
                echo
            done
            #rm -f "$resultfn"
            # write, then read
            for index in $(seq 0 $((numelem - 1)))
            do
                breakout=0
                base=$((index * reclen))
                resultfn="$RESULTDIR/xfer-result-${both[base]}-$size-$vers-$proto"
                rm -f "$resultfn"
                > "$resultfn"
                #truncate "$resultfn"
                ls -l "$resultfn"
                # NOTE(review): the clock starts before mt, so the recorded
                # time also covers the network quiescence check, the mount
                # and the umount - confirm that is what is wanted.
                starttime=$(gettime)
                mt
                retval="$?"
                case "$retval" in
                    0)
                        ;;
                    1)
                        # mt only returns 1 when the mount succeeded but did
                        # not verify, so the filesystem -is- mounted: keep
                        # the mount point, so that the umt below removes it
                        echo Sorry, mount failed, skipping this one 1>&2
                        echo Sorry, mount failed, skipping this one > "$resultfn"
                        breakout=1
                        ;;
                    254)
                        echo Sorry, mount timed out, skipping this one 1>&2
                        echo Sorry, mount timed out, skipping this one > "$resultfn"
                        # it appears that if AIX 5.1 (as an NFS client) times out mounting or umounting an NFS mount, then subsequent
                        # mounts and/or umounts on the same mount point will all fail...  So get a new mount point!
                        new_mount_point
                        breakout=1
                        ;;
                    *)
                        echo Sorry, mount did not work in a strange way, skipping this one 1>&2
                        echo Sorry, mount did not work in a strange way, skipping this one > "$resultfn"
                        # same reasoning as for a timeout: get a new mount point
                        new_mount_point
                        breakout=1
                        ;;
                esac
                if [ "$breakout" = 0 ]
                then
                    # transfer $nummeg megabytes
                    #echo index is "$index", base is "$base"
                    echo
                    echo "${both[base + 0]} $nummeg megabytes: $size $vers $proto"
                    xfer "${both[base + 2]}" "${both[base + 3]}"
                    retval="$?"
                    # "breakout" logic is to ensure that an attempt is made
                    # to umount a mount with parameters that caused problems, so they aren't
                    # left in place forevermore
                    case "$retval" in
                        0)
                            ;;
                        254)
                            echo Sorry, ${both[base + 0]} timed out, skipping this one 1>&2
                            echo Sorry, ${both[base + 0]} timed out, skipping this one > "$resultfn"
                            breakout=1
                            ;;
                        *)
                            # any other failure must not be recorded as a time
                            echo Sorry, ${both[base + 0]} failed, skipping this one 1>&2
                            echo Sorry, ${both[base + 0]} failed, skipping this one > "$resultfn"
                            breakout=1
                            ;;
                    esac
                fi
                umt
                retval="$?"
                if [ "$retval" = 254 ]
                then
                    echo Sorry, umount timed out, skipping this one 1>&2
                    echo Sorry, umount timed out, skipping this one > "$resultfn"
                    # it appears that if AIX 5.1 (as an NFS client) times out mounting or umounting an NFS mount, then subsequent
                    # mounts and/or umounts on the same mount point will all fail...  So get a new mount point!
                    new_mount_point
                    break
                fi
                if [ "$breakout" != 0 ]
                then
                    break
                fi
                endtime=$(gettime)
                echo "Start time was $starttime, End time was $endtime"
                v=$(echo "$endtime - $starttime" | bc)
                echo Difference is "$v"
                echo Writing "${both[base + 1]}"': ' "$v" to "$resultfn"
                # the summarizer depends on this exact layout: the number is
                # awk field 3, and "cut -d ' ' -f 4" of egrep's file:line output
                echo "${both[base + 1]}"': ' $v > "$resultfn"
            done
            echo This remove can take a while... Please wait...
            rm -f "$testfn"
            echo Remove completed...
        done
    done
done