Bump up NFS wait time in qdo (since this has been causing
[gem5.git] / util / qdo
1 #! /usr/bin/env python
2
3 # Copyright (c) 2004-2005 The Regents of The University of Michigan
4 # All rights reserved.
5 #
6 # Redistribution and use in source and binary forms, with or without
7 # modification, are permitted provided that the following conditions are
8 # met: redistributions of source code must retain the above copyright
9 # notice, this list of conditions and the following disclaimer;
10 # redistributions in binary form must reproduce the above copyright
11 # notice, this list of conditions and the following disclaimer in the
12 # documentation and/or other materials provided with the distribution;
13 # neither the name of the copyright holders nor the names of its
14 # contributors may be used to endorse or promote products derived from
15 # this software without specific prior written permission.
16 #
17 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29 import sys
30 import os
31 import re
32 import time
33 import optparse
34
35 import pexpect
36
# Name this script was invoked under; used as a prefix on diagnostics.
progname = os.path.basename(sys.argv[0])

usage = "%prog [options] command [command arguments]"
optparser = optparse.OptionParser(usage=usage)
# Stop option parsing at the first positional argument so that flags
# belonging to the wrapped command are left alone.
optparser.disable_interspersed_args()

# Command-line options, registered in the order they appear in --help.
# Both timeouts default to a value written as minutes * 60.
for flag, settings in (
        ('-e', dict(dest='stderr_file',
                    help='command stderr output file')),
        ('-o', dict(dest='stdout_file',
                    help='command stdout output file')),
        ('-l', dict(dest='save_log', action='store_true',
                    help='save qsub output log file')),
        ('-N', dict(dest='job_name',
                    help='qsub job name')),
        ('-q', dict(dest='dest_queue',
                    help='qsub destination queue')),
        ('--qwait', dict(dest='qsub_timeout', type='int',
                         help='qsub queue wait timeout', default=30*60)),
        ('-t', dict(dest='cmd_timeout', type='int',
                    help='command execution timeout', default=600*60)),
):
    optparser.add_option(flag, **settings)

# 'cmd' is the list of leftover arguments: the command to run plus its
# own arguments.
options, cmd = optparser.parse_args()

if not cmd:
    sys.stderr.write("%s: missing command\n" % progname)
    sys.exit(1)

# If we want to do this, need to add check here to make sure cmd[0] is
# a valid PBS job name, else qsub will die on us.
#
#if not options.job_name:
#    options.job_name = cmd[0]

cwd = os.getcwd()

# Deal with systems where /n is a symlink to /.automount: rewrite the
# leading '/.automount/' so the path is expressed relative to /n.
if cwd.startswith('/.automount/'):
    cwd = '/n/' + cwd[len('/.automount/'):]

# The job changes into this same directory on the remote side, so only
# paths under /n/poolfs are accepted.
if not cwd.startswith('/n/poolfs/'):
    sys.stderr.write("Error: current directory must be under /n/poolfs.\n")
    sys.exit(1)
78
# The Shell class wraps pexpect.spawn with some handy functions that
# assume the thing on the other end is a Bourne/bash shell.
class Shell(pexpect.spawn):
    """Interactive Bourne-style shell session driven through pexpect.

    On construction the given command is spawned, the first shell prompt
    is awaited (for up to ``options.qsub_timeout``), and the prompt is
    then changed to the fixed string ``qdo$ `` so later commands can be
    delimited reliably.

    Attributes:
        full_output: text matched or skipped by every successful
            ``expect()`` call on this session, concatenated in order.
        quick_timeout: timeout handed to pexpect for short housekeeping
            commands such as ``echo $?``.
    """

    # Regexp to match the shell prompt.  We change the prompt to
    # something fixed and distinctive to make it easier to match
    # reliably.
    prompt_re = re.compile(r'qdo\$ ')

    def __init__(self, cmd):
        """Spawn ``cmd`` and wait for it to offer a shell prompt.

        Exits the program with status 1 if the command cannot be
        spawned or if no prompt appears within ``options.qsub_timeout``.
        """
        # initialize base pexpect.spawn object
        try:
            pexpect.spawn.__init__(self, cmd)
        except pexpect.ExceptionPexpect as exc:
            # Report on stderr, like every other diagnostic in this file.
            sys.stderr.write("%s: %s\n" % (progname, exc))
            sys.exit(1)
        # full_output accumulates the full output of the session
        self.full_output = ""
        self.quick_timeout = 15
        # wait for a prompt, then change it
        try:
            self.expect(r'\$ ', options.qsub_timeout)
        except pexpect.TIMEOUT:
            sys.stderr.write("%s: qsub timed out.\n" % progname)
            self.kill(9)
            self.close(wait=True)
            sys.exit(1)
        self.do_command('unset PROMPT_COMMAND; PS1="qdo$ "')

    def expect(self, regexp, timeout=-1):
        """Version of expect that updates full_output too.

        Returns whatever ``pexpect.spawn.expect`` returns (the index of
        the matched pattern), so the override stays a drop-in
        replacement.  If the underlying call raises, nothing is added to
        ``full_output``.
        """
        index = pexpect.spawn.expect(self, regexp, timeout)
        self.full_output += self.before + self.after
        return index

    def do_bare_command(self, cmd, timeout=-1):
        """Issue a command and wait for the next prompt.

        Returns a string containing the output of the command with
        trailing whitespace stripped.
        """
        self.sendline(cmd)
        # read back the echo of the command so it is not mistaken for
        # command output
        self.readline()
        # wait for the next prompt
        self.expect(self.prompt_re, timeout)
        return self.before.rstrip()

    def do_command(self, cmd, timeout=-1):
        """Issue a command, then query its exit status.

        Returns a (string, int) tuple with the command output and the
        status reported by ``echo $?``.
        """
        # do the command itself
        output = self.do_bare_command(cmd, timeout)
        # collect status
        status = int(self.do_bare_command("echo $?", self.quick_timeout))
        return (output, status)

    def dir_exists(self, dirname):
        """Check whether ``dirname`` exists as a directory in this shell.

        ``dirname`` is inserted into the test command without quoting.
        """
        # Run the test in *this* session rather than in whatever object
        # happens to be bound to a module-level name.
        (output, status) = self.do_command('[ -d %s ]' % dirname,
                                           self.quick_timeout)
        return status == 0
138
139
# Spawn the interactive pool job.

# Hack to do link on poolfs... disabled for now since
# compiler/linker/library versioning problems between poolfs and
# nodes.  May never work since poolfs is x86-64 and nodes are 32-bit.
if False and len(cmd) > 50:
    shell_cmd = 'ssh -t poolfs /bin/sh -l'
    sys.stdout.write("%s: running %s on poolfs\n" % (progname, cmd[0]))
else:
    shell_cmd = 'qsub -I -S /bin/sh'
    if options.job_name:
        shell_cmd += ' -N "%s"' % options.job_name
    if options.dest_queue:
        shell_cmd += ' -q ' + options.dest_queue

shell = Shell(shell_cmd)

try:
    # chdir to cwd
    (output, status) = shell.do_command('cd ' + cwd)

    if status != 0:
        raise OSError("Can't chdir to %s" % cwd)

    # wacky hack: sometimes scons will create an output directory then
    # fork a job to generate files in that directory, and the job will
    # get run before the directory creation propagates through NFS.
    # This hack looks for a '-o' option indicating an output file and
    # waits for the corresponding directory to appear if necessary.
    try:
        if 'cc' in cmd[0] or 'g++' in cmd[0]:
            output_dir = os.path.dirname(cmd[cmd.index('-o')+1])
        elif 'm5' in cmd[0]:
            output_dir = cmd[cmd.index('-d')+1]
        else:
            output_dir = None
    except (ValueError, IndexError):
        # no big deal if there's no '-o'/'-d' or if it's the final argument
        output_dir = None

    if output_dir:
        # Poll every 5 seconds, giving up after 90 seconds; the command
        # is run afterwards whether or not the directory appeared.
        secs_waited = 0
        while not shell.dir_exists(output_dir) and secs_waited < 90:
            time.sleep(5)
            secs_waited += 5
        if secs_waited > 30:
            sys.stdout.write("waited %d seconds for %s\n"
                             % (secs_waited, output_dir))

    # run command
    if options.stdout_file:
        cmd += ['>', options.stdout_file]
    if options.stderr_file:
        cmd += ['2>', options.stderr_file]
    try:
        (output, status) = shell.do_command(' '.join(cmd), options.cmd_timeout)
    except pexpect.TIMEOUT:
        sys.stderr.write("%s: command timed out after %d seconds.\n"
                         % (progname, options.cmd_timeout))
        shell.sendline('~.') # qsub/ssh termination escape sequence
        shell.close(wait=True)
        status = 3
        # 'output' still holds the result of the earlier 'cd'; do not
        # print it as though it came from the command.
        output = ''
    if output:
        sys.stdout.write(output + "\n")

finally:
    # end job
    if shell.isalive():
        shell.sendline('exit')
        try:
            shell.expect('qsub: job .* completed\r\n')
        except (pexpect.TIMEOUT, pexpect.EOF):
            # A missing completion message must not prevent the session
            # from being closed, nor replace the exit status or an
            # exception already in flight.
            sys.stderr.write("%s: warning: did not see qsub job "
                             "completion message.\n" % progname)
        shell.close(wait=True)

# if there was an error, log the output even if not requested
if status != 0 or options.save_log:
    with open('qdo-log.' + str(os.getpid()), 'w') as log:
        log.write(shell.full_output)

del shell

sys.exit(status)