Codebase list ibutils / upstream/1.5.7+0.2.gbd7e502 ibmgtsim / tests / osmStress.check.tcl
upstream/1.5.7+0.2.gbd7e502

Tree @upstream/1.5.7+0.2.gbd7e502 (Download .tar.gz)

osmStress.check.tcl @upstream/1.5.7+0.2.gbd7e502raw · history · blame

# This is the checker for the semi static lid assignment feature:

# A. the sim code should generate the cache file once the simulator is up.
# it should randomize:
# 1. some guids should not have a lid
# 2. some guids should share a lid
# 3. some extra guids should be there

# B. Wait for OpenSM SUBNET UP
#
# C. The simulator code should randomly do the following (several of each)
# 1. Zero some port lids
# 2. Copy some port lids to other ports
# 3. Invent some new lids to some ports
# 4. Turn some node ports down - disconnect (all ports of the node)
#
# D. The simulator should send a trap or set a switch change bit
#
# E. Wait for heavy sweep.
#
# F. The simulator code should verify that the lids match what it expects:
#    Note that the nodes that did have a non overlapping lid in the file
#    must have it. The rest of the ports should have valid lid values.
#

##############################################################################
#
# Start up the test applications
#
# Prepares the simulated fabric through the simulator control socket
# (random legal LIDs, some ports disconnected, a randomized guid2lid cache
# file), then starts OpenSM in the background with "-d2 -l <lmc> -V" and
# attaches the log analyzer to its log file.
#
# Arguments:
#   simDir      - directory holding the simulator files; the OpenSM logs and
#                 the guid2lid cache file are placed here
#   osmPath     - OpenSM executable to run
#   osmPortGuid - port GUID handed to OpenSM through -g
#
# Side effects: sets the global lmc to 0 and sets env(OSM_CACHE_DIR).
#
# Return the process id(s) reported by exec for the background OpenSM
# (to be killed on exit)
#
proc runner {simDir osmPath osmPortGuid} {
   global simCtrlSock
   global env
   global lmc

   set osmStdOutLog [file join $simDir osm.stdout.log]
   set osmLog [file join $simDir osm.log]
   set guid2lidFile $simDir/guid2lid

   set lmc 0

   # every command sent below is answered by exactly one line, so use a
   # blocking, line buffered channel
   fconfigure $simCtrlSock -blocking 1 -buffering line

   # randomize lids
   puts $simCtrlSock "assignLegalLids \$fabric $lmc"
   puts "SIM: [gets $simCtrlSock]"

   # Disconnect ports
   puts $simCtrlSock "setPortsDisconnected  \$fabric $lmc"
   puts "SIM: [gets $simCtrlSock]"

   # randomize guid2lid file:
   set env(OSM_CACHE_DIR) $simDir/
   puts $simCtrlSock "writeGuid2LidFile $guid2lidFile $lmc"
   puts "SIM: [gets $simCtrlSock]"

   # keep a copy of the generated file. -force is needed because "file copy"
   # raises an error when the target already exists (e.g. the test is
   # re-run in the same simDir)
   file copy -force $guid2lidFile $guid2lidFile.orig

   set osmCmd "$osmPath -d2 -l $lmc -V -f $osmLog -g $osmPortGuid"
   puts "-I- Starting: $osmCmd"
   set osmPid [eval "exec $osmCmd > $osmStdOutLog &"]

   # start a tracker on the log file and process:
   startOsmLogAnalyzer $osmLog

   return $osmPid
}

##############################################################################
#
# Check for the test results: make sure we got a "SUBNET UP"
#
# Flow:
#   1. wait for OpenSM to report subnet up (or die)
#   2. update the node proc file and verify the LID values
#   3. reconnect / disconnect ports several times, wait for the sweep and
#      verify the LID values again
#   4. register for event forwarding and run the random stress flow
#   5. reconnect everything, join all HCA ports to the multicast group,
#      force a sweep and use ibdiagnet to verify that all the joined HCAs
#      are members of multicast group 0xC000
#
# Arguments:
#   simDir      - directory holding the simulator and OpenSM files
#   osmPath     - OpenSM executable (not used by the checker)
#   osmPortGuid - OpenSM port GUID (not used by the checker)
#
# Return the exit code: 0 on success, non zero on any failure (the number
# of LID check errors when the LID check is the step that failed)
#
proc checker {simDir osmPath osmPortGuid} {
   global env
   global simCtrlSock
   global lmc
   global topologyFile
   set osmLog [file join $simDir osm.log]

   puts "-I- Waiting max time of 100sec...."

   if {[osmWaitForUpOrDeadWithTimeout $osmLog 1000000]} {
      return 1
   }

   # update node proc file
   puts $simCtrlSock "updateProcFSForNode \$fabric $simDir $env(IBMGTSIM_NODE) $env(IBMGTSIM_NODE) 1"
   set res [gets $simCtrlSock]
   if {$res == 1} {return 1}
   puts "SIM: Updated H-1 proc file:$res"

   # check for lid validity:
   puts $simCtrlSock "checkLidValues \$fabric $lmc"
   set res [gets $simCtrlSock]
   puts "SIM: Number of LID check errors:$res"
   if {$res != 0} {
      return $res
   }

   # we try several iterations of changes:
   for {set i 1} {$i < 2} {incr i} {
      # connect the disconnected
      puts $simCtrlSock "connectAllDisconnected \$fabric 1"
      puts "SIM: [gets $simCtrlSock]"

      # refresh the lid database and start the POST_SUBNET_UP mode
      puts $simCtrlSock "updateAssignedLids \$fabric"
      puts "SIM: [gets $simCtrlSock]"

      for {set j 1} {$j < 10} {incr j} {
         # Disconnect ports
         puts $simCtrlSock "setPortsDisconnected \$fabric $lmc"
         puts "SIM: [gets $simCtrlSock]"
         # connect the disconnected
         puts $simCtrlSock "connectAllDisconnected \$fabric 1"
         puts "SIM: [gets $simCtrlSock]"
      }

      # wait for sweep to end or exit
      if {[osmWaitForUpOrDeadWithTimeout $osmLog 1000000]} {
         return 1
      }
      puts $simCtrlSock "updateProcFSForNode \$fabric $simDir $env(IBMGTSIM_NODE) $env(IBMGTSIM_NODE) 1"
      set res [gets $simCtrlSock]
      if {$res == 1} {return 1}
      puts "SIM: Updated H-1 proc file:$res"
      # the reply is used as the node name from here on (ibdiagnet -s below)
      set env(IBMGTSIM_NODE) $res

      # wait 3 seconds
      after 3000

      # check for lid validity:
      puts $simCtrlSock "checkLidValues \$fabric $lmc"
      set res [gets $simCtrlSock]
      puts "SIM: Number of LID check errors:$res"
      if {$res != 0} {
         return $res
      }

      # sending event forwarding notification requests...
      puts "-I- Sending event forwarding notification requests"
      puts $simCtrlSock "randomRegisterFormInformInfo fabric:1"
      set  returnVal [gets $simCtrlSock]
      puts "SIM: -I- $returnVal"

      # start Random Flow:
      set iterations 240
      puts "-I- Starting the random stress flow with $iterations..."
      puts $simCtrlSock "RunRandomStressFlow fabric:1 $iterations"
      set  returnVal [gets $simCtrlSock]
      puts "SIM: -I- $returnVal"

      # At the end, connect all the ports back
      puts "-I- Connecting all disconnected ..."
      puts $simCtrlSock "connectAllDisconnected \$fabric 1"
      set  returnVal [gets $simCtrlSock]
      puts "SIM: $returnVal"

      # wait for sweep to end or exit
      puts "-I- if we did connect some we need to wait for them"
      if {"-I- Reconnected 0 nodes" != $returnVal} {
         if {[osmWaitForUpOrDeadWithTimeout $osmLog 1000000]} {
            return 1
         }
      }

      # and yet another light sweep
      after 20000

      #At the end, join all to the multicast group
      puts "-I- Joining all Ports ..."
      set joinAllHCAs 1
      set interJoinDelay_ms 1
      puts $simCtrlSock "randomJoinAllHCAPorts fabric:1 $interJoinDelay_ms $joinAllHCAs"
      set  numHcasJoined [gets $simCtrlSock]
      puts "SIM: -I- Joined $numHcasJoined HCAs"

      # force a sweep:
      puts "-I- Forcing a sweep..."
      puts $simCtrlSock "setOneSwitchChangeBit \$fabric"
      set  returnVal [gets $simCtrlSock]
      puts "SIM: $returnVal"

      # wait for sweep to end or exit
      if {[osmWaitForUpOrDeadWithTimeout $osmLog 1000000]} {
         return 1
      }

      # wait ~1 sec per joining port - to enable the SM to complete connecting them
      # (the expression is braced so the text received from the simulator
      # is not substituted a second time by expr)
      after [expr {$numHcasJoined * 1000}]

      # use ibdiagnet instead of relying on opensm reports...
      if {0} {
         set ibdmchkLog [file join $simDir ibdmchk.log]
         set subnetFile [file join $simDir opensm-subnet.lst]
         set fdbsFile [file join $simDir opensm.fdbs]
         set mcfdbsFile [file join $simDir opensm.mcfdbs]
         set cmd "ibdmchk -s $subnetFile -f $fdbsFile -m $mcfdbsFile"

         puts "-I- Invoking $cmd "
         if {[catch {set res [eval "exec $cmd > $ibdmchkLog"]} e]} {
            # res was not assigned by the failed exec: report the
            # exit information Tcl stored in errorCode instead
            puts "-E- ibdmchk failed"
            puts "-I- Result value $::errorCode"
            puts "-I- Error: $e"
            return 1
         }
      }

      set cmd "ibdiagnet -v -r -t $topologyFile -o $simDir -s $env(IBMGTSIM_NODE)"
      set ibdiagnetLog [file join $simDir ibdiagnet.stdout.log]
      puts "-I- Invoking $cmd "
      if {[catch {set res [eval "exec $cmd >& $ibdiagnetLog"]} e]} {
         # res was not assigned by the failed exec: report the
         # exit information Tcl stored in errorCode instead
         puts "-E- ibdiagnet failed"
         puts "-I- Result value $::errorCode"
         puts "-I- Error: $e"
         return 1
      }

      # make sure all HCAs are now joined:
      # grep exits with a non zero status when nothing matches, which makes
      # exec raise an error - catch it so we fail with a clear message
      if {[catch {exec grep "Multicast Group:0xC000 has:" $ibdiagnetLog} res]} {
         puts "-E- Fail to find the Multicast registration ports in $ibdiagnetLog:$res"
         return 1
      }
      if {![regexp {Multicast Group:0xC000 has:[0-9]+ switches and:([0-9]+) HCAs} $res d1 hcas]} {
         puts "-E- Fail to parse the Multicast registration ports:$res"
         return 1
      }

      if {$numHcasJoined != $hcas} {
         puts "-E- Not all HCAs are registered. Expected:$numHcasJoined got:$hcas"
         return 1
      }
   }

   return 0
}