util/qdo

   1 #! /usr/bin/env python
   2
   3 # Copyright (c) 2004-2005 The Regents of The University of Michigan
   4 # All rights reserved.
   5 #
   6 # Redistribution and use in source and binary forms, with or without
   7 # modification, are permitted provided that the following conditions are
   8 # met: redistributions of source code must retain the above copyright
   9 # notice, this list of conditions and the following disclaimer;
  10 # redistributions in binary form must reproduce the above copyright
  11 # notice, this list of conditions and the following disclaimer in the
  12 # documentation and/or other materials provided with the distribution;
  13 # neither the name of the copyright holders nor the names of its
  14 # contributors may be used to endorse or promote products derived from
  15 # this software without specific prior written permission.
  16 #
  17 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28
  29 import sys
  30 import os
  31 import re
  32 import time
  33 import optparse
  34
  35 import pexpect
  36
  37 progname = os.path.basename(sys.argv[0])
  38
  39 usage = "%prog [options] command [command arguments]"
  40 optparser = optparse.OptionParser(usage=usage)
  41 optparser.allow_interspersed_args=False
  42 optparser.add_option('-e', dest='stderr_file',
  43                      help='command stderr output file')
  44 optparser.add_option('-o', dest='stdout_file',
  45                      help='command stdout output file')
  46 optparser.add_option('-l', dest='save_log', action='store_true',
  47                      help='save qsub output log file')
  48 optparser.add_option('-q', dest='qsub_timeout', type='int',
  49                      help='qsub queue wait timeout', default=30*60)
  50 optparser.add_option('-t', dest='cmd_timeout', type='int',
  51                      help='command execution timeout', default=600*60)
  52
  53 (options, cmd) = optparser.parse_args()
  54
  55 if cmd == []:
  56     print >>sys.stderr, "%s: missing command" % progname
  57     sys.exit(1)
  58
  59 cwd = os.getcwd()
  60
  61 # Deal with systems where /n is a symlink to /.automount
  62 if cwd.startswith('/.automount/'):
  63     cwd = cwd.replace('/.automount/', '/n/', 1)
  64
  65 if not cwd.startswith('/n/poolfs/'):
  66     print >>sys.stderr, "Error: current directory must be under /n/poolfs."
  67     sys.exit(1)
  68
  69 # The Shell class wraps pexpect.spawn with some handy functions that
  70 # assume the thing on the other end is a Bourne/bash shell.
  71 class Shell(pexpect.spawn):
  72     # Regexp to match the shell prompt.  We change the prompt to
  73     # something fixed and distinctive to make it easier to match
  74     # reliably.
  75     prompt_re = re.compile('qdo\$ ')
  76
  77     def __init__(self, cmd):
  78         # initialize base pexpect.spawn object
  79         try:
  80             pexpect.spawn.__init__(self, cmd)
  81         except pexpect.ExceptionPexpect, exc:
  82             print "%s:" % progname, exc
  83             sys.exit(1)
  84         # full_output accumulates the full output of the session
  85         self.full_output = ""
  86         self.quick_timeout = 15
  87         # wait for a prompt, then change it
  88         try:
  89             self.expect('\$ ', options.qsub_timeout)
  90         except pexpect.TIMEOUT:
  91             print >>sys.stderr, "%s: qsub timed out." % progname
  92             self.kill(15)
  93             self.close(wait=True)
  94             sys.exit(1)
  95         self.do_command('PS1="qdo$ "')
  96
  97     # version of expect that updates full_output too
  98     def expect(self, regexp, timeout = -1):
  99         pexpect.spawn.expect(self, regexp, timeout)
 100         self.full_output += self.before + self.after
 101
 102     # Just issue a command and wait for the next prompt.
 103     # Returns a string containing the output of the command.
 104     def do_bare_command(self, cmd, timeout = -1):
 105         global full_output
 106         self.sendline(cmd)
 107         # read back the echo of the command
 108         self.readline()
 109         # wait for the next prompt
 110         self.expect(self.prompt_re, timeout)
 111         output = self.before.rstrip()
 112         return output
 113
 114     # Issue a command, then query its exit status.
 115     # Returns a (string, int) tuple with the command output and the status.
 116     def do_command(self, cmd, timeout = -1):
 117         # do the command itself
 118         output = self.do_bare_command(cmd, timeout)
 119         # collect status
 120         status = int(self.do_bare_command("echo $?", self.quick_timeout))
 121         return (output, status)
 122
 123     # Check to see if the given directory exists.
 124     def dir_exists(self, dirname):
 125         (output, status) = shell.do_command('[ -d %s ]' % dirname,
 126                                             self.quick_timeout)
 127         return status == 0
 128
 129
 130 # Spawn the interactive pool job.
 131
 132 # Hack to do link on poolfs... disabled for now since
 133 # compiler/linker/library versioning problems between poolfs and
 134 # nodes.  May never work since poolfs is x86-64 and nodes are 32-bit.
 135 if False and len(cmd) > 50:
 136     shell_cmd = 'ssh -t poolfs /bin/sh -l'
 137     print "%s: running %s on poolfs" % (progname, cmd[0])
 138 else:
 139     shell_cmd = 'qsub -I -S /bin/sh'
 140
 141 shell = Shell(shell_cmd)
 142
 143 try:
 144     # chdir to cwd
 145     (output, status) = shell.do_command('cd ' + cwd)
 146
 147     if status != 0:
 148         raise OSError, "Can't chdir to %s" % cwd
 149
 150     # wacky hack: sometimes scons will create an output directory then
 151     # fork a job to generate files in that directory, and the job will
 152     # get run before the directory creation propagates through NFS.
 153     # This hack looks for a '-o' option indicating an output file and
 154     # waits for the corresponding directory to appear if necessary.
 155     try:
 156         if 'cc' in cmd[0] or 'g++' in cmd[0]:
 157             output_dir = os.path.dirname(cmd[cmd.index('-o')+1])
 158         elif 'm5' in cmd[0]:
 159             output_dir = cmd[cmd.index('-d')+1]
 160         else:
 161             output_dir = None
 162     except (ValueError, IndexError):
 163         # no big deal if there's no '-o'/'-d' or if it's the final argument
 164         output_dir = None
 165
 166     if output_dir:
 167         secs_waited = 0
 168         while not shell.dir_exists(output_dir) and secs_waited < 45:
 169             time.sleep(5)
 170             secs_waited += 5
 171         if secs_waited > 10:
 172             print "waited", secs_waited, "seconds for", output_dir
 173
 174     # run command
 175     if options.stdout_file:
 176         cmd += ['>', options.stdout_file]
 177     if options.stderr_file:
 178         cmd += ['2>', options.stderr_file]
 179     try:
 180         (output, status) = shell.do_command(' '.join(cmd), options.cmd_timeout)
 181     except pexpect.TIMEOUT:
 182             print >>sys.stderr, "%s: command timed out after %d seconds." \
 183                   % (progname, options.cmd_timeout)
 184             shell.sendline('~.') # qsub/ssh termination escape sequence
 185             shell.close(wait=True)
 186             status = 3
 187     if output:
 188         print output
 189
 190 finally:
 191     # end job
 192     if shell.isalive():
 193         shell.sendline('exit')
 194         shell.expect('qsub: job .* completed\r\n')
 195         shell.close(wait=True)
 196
 197     # if there was an error, log the output even if not requested
 198     if status != 0 or options.save_log:
 199         log = file('qdo-log.' + str(os.getpid()), 'w')
 200         log.write(shell.full_output)
 201         log.close()
 202
 203 del shell
 204
 205 sys.exit(status)