pymtt
 All Classes Namespaces Files Functions Variables Groups
SLURM.py
Go to the documentation of this file.
1 # -*- coding: utf-8; tab-width: 4; indent-tabs-mode: f; python-indent: 4 -*-
2 #
3 # Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
4 # $COPYRIGHT$
5 #
6 # Additional copyrights may follow
7 #
8 # $HEADER$
9 #
10 
11 from __future__ import print_function
12 import os
13 from LauncherMTTTool import *
14 import shlex
15 
16 ## @addtogroup Tools
17 # @{
18 # @addtogroup Launcher
19 # @section SLURM
20 # Plugin for using SLURM to launch tests
21 # @param hostfile The hostfile for SLURM to use
22 # @param command Command for executing the application
23 # @param np Number of processes to run
24 # @param timeout Maximum execution time - terminate a test if it exceeds this time
25 # @param options Comma-delimited sets of command line options that shall be used on each test
26 # @param skipped Exit status of a test that declares it was skipped
27 # @param merge_stdout_stderr Merge stdout and stderr into one output stream
28 # @param stdout_save_lines Number of lines of stdout to save
29 # @param stderr_save_lines Number of lines of stderr to save
30 # @param test_dir Names of directories to be scanned for tests
31 # @param fail_tests Names of tests that are expected to fail
32 # @param fail_returncodes Expected returncodes of tests expected to fail
33 # @param fail_timeout Maximum execution time for tests expected to fail
34 # @param skip_tests Names of tests to be skipped
35 # @param max_num_tests Maximum number of tests to run
36 # @param job_name User-defined name for job
37 # @param modules Modules to load
38 # @param modules_unload Modules to unload
39 # @param test_list List of tests to run, default is all
40 # @param allocate_cmd Command to use for allocating nodes from the resource manager
41 # @param deallocate_cmd Command to use for deallocating nodes from the resource manager
42 # @}
44 
45  def __init__(self):
46  # initialise parent class
47  LauncherMTTTool.__init__(self)
48  self.options = {}
49  self.options['hostfile'] = (None, "The hostfile for SLURM to use")
50  self.options['command'] = ("srun", "Command for executing the application")
51  self.options['np'] = (None, "Number of processes to run")
52  self.options['timeout'] = (None, "Maximum execution time - terminate a test if it exceeds this time")
53  self.options['options'] = (None, "Comma-delimited sets of command line options that shall be used on each test")
54  self.options['skipped'] = ("77", "Exit status of a test that declares it was skipped")
55  self.options['merge_stdout_stderr'] = (False, "Merge stdout and stderr into one output stream")
56  self.options['stdout_save_lines'] = (-1, "Number of lines of stdout to save")
57  self.options['stderr_save_lines'] = (-1, "Number of lines of stderr to save")
58  self.options['test_dir'] = (None, "Names of directories to be scanned for tests")
59  self.options['fail_tests'] = (None, "Names of tests that are expected to fail")
60  self.options['fail_returncodes'] = (None, "Expected return code of tests expected to fail")
61  self.options['fail_timeout'] = (None, "Maximum execution time for tests expected to fail")
62  self.options['skip_tests'] = (None, "Names of tests to be skipped")
63  self.options['max_num_tests'] = (None, "Maximum number of tests to run")
64  self.options['job_name'] = (None, "User-defined name for job")
65  self.options['modules'] = (None, "Modules to load")
66  self.options['modules_unload'] = (None, "Modules to unload")
67  self.options['test_list'] = (None, "List of tests to run, default is all")
68  self.options['allocate_cmd'] = (None, "Command to use for allocating nodes from the resource manager")
69  self.options['deallocate_cmd'] = (None, "Command to use for deallocating nodes from the resource manager")
70 
71  self.allocated = False
72  self.testDef = None
73  self.cmds = None
74  return
75 
76 
77  def activate(self):
78  # use the automatic procedure from IPlugin
79  IPlugin.activate(self)
80  return
81 
82 
83  def deactivate(self):
84  IPlugin.deactivate(self)
85  if self.allocated and self.testDef and self.cmds:
86  deallocate_cmdargs = shlex.split(self.cmds['deallocate_cmd'])
87  _status,_stdout,_stderr,_time = self.testDef.execmd.execute(self.cmds, deallocate_cmdargs, self.testDef)
88  self.allocated = False
89 
90 
91  def print_name(self):
92  return "SLURM"
93 
94  def print_options(self, testDef, prefix):
95  lines = testDef.printOptions(self.options)
96  for line in lines:
97  print(prefix + line)
98  return
99 
100  def execute(self, log, keyvals, testDef):
101 
102  self.testDef = testDef
103 
104  midpath = False
105 
106  testDef.logger.verbose_print("SLURM Launcher")
107  # check the log for the title so we can
108  # see if this is setting our default behavior
109  try:
110  if log['section'] is not None:
111  if "Default" in log['section']:
112  # this section contains default settings
113  # for this launcher
114  myopts = {}
115  testDef.parseOptions(log, self.options, keyvals, myopts)
116  # transfer the findings into our local storage
117  keys = list(self.options.keys())
118  optkeys = list(myopts.keys())
119  for optkey in optkeys:
120  for key in keys:
121  if key == optkey:
122  self.options[key] = (myopts[optkey],self.options[key][1])
123 
124  # we captured the default settings, so we can
125  # now return with success
126  log['status'] = 0
127  return
128  except KeyError:
129  # error - the section should have been there
130  log['status'] = 1
131  log['stderr'] = "Section not specified"
132  return
133  # must be executing a test of some kind - the install stage
134  # must be specified so we can find the tests to be run
135  try:
136  parent = keyvals['parent']
137  if parent is not None:
138  # get the log entry as it contains the location
139  # of the built tests
140  bldlog = testDef.logger.getLog(parent)
141  try:
142  location = bldlog['location']
143  except KeyError:
144  # if it wasn't recorded, then there is nothing
145  # we can do
146  log['status'] = 1
147  log['stderr'] = "Location of built tests was not provided"
148  return
149  try:
150  if bldlog['parameters'] is not None:
151  # check for modules unloaded during the build of these tests
152  for md in bldlog['parameters']:
153  if "modules_unload" == md[0]:
154  try:
155  if keyvals['modules_unload'] is not None:
156  # append these modules to those
157  mods = md[1].split(',')
158  newmods = keyvals['modules_unload'].split(',')
159  for mdx in newmods:
160  mods.append(mdx)
161  keyvals['modules_unload'] = ','.join(mods)
162  except KeyError:
163  keyvals['modules_unload'] = md[1]
164  break
165  # check for modules used during the build of these tests
166  for md in bldlog['parameters']:
167  if "modules" == md[0]:
168  try:
169  if keyvals['modules'] is not None:
170  # append these modules to those
171  mods = md[1].split(',')
172  newmods = keyvals['modules'].split(',')
173  for mdx in newmods:
174  mods.append(mdx)
175  keyvals['modules'] = ','.join(mods)
176  except KeyError:
177  keyvals['modules'] = md[1]
178  break
179  except KeyError:
180  pass
181  # get the log of any middleware so we can get its location
182  try:
183  midlog = testDef.logger.getLog(bldlog['middleware'])
184  if midlog is not None:
185  # get the location of the middleware
186  try:
187  if midlog['location'] is not None:
188  # prepend that location to our paths
189  try:
190  oldbinpath = os.environ['PATH']
191  pieces = oldbinpath.split(':')
192  except KeyError:
193  oldbinpath = ""
194  pieces = []
195  bindir = os.path.join(midlog['location'], "bin")
196  pieces.insert(0, bindir)
197  newpath = ":".join(pieces)
198  os.environ['PATH'] = newpath
199  # prepend the loadable lib path
200  try:
201  oldldlibpath = os.environ['LD_LIBRARY_PATH']
202  pieces = oldldlibpath.split(':')
203  except KeyError:
204  oldldlibpath = ""
205  pieces = []
206  bindir = os.path.join(midlog['location'], "lib")
207  pieces.insert(0, bindir)
208  newpath = ":".join(pieces)
209  os.environ['LD_LIBRARY_PATH'] = newpath
210 
211  # mark that this was done
212  midpath = True
213  except KeyError:
214  # if it was already installed, then no location would be provided
215  pass
216  try:
217  if midlog['parameters'] is not None:
218  # check for modules unloaded by the middleware
219  for md in midlog['parameters']:
220  if "modules_unload" == md[0]:
221  try:
222  if keyvals['modules_unload'] is not None:
223  # append these modules to those
224  mods = md[1].split(',')
225  newmods = keyvals['modules_unload'].split(',')
226  for mdx in newmods:
227  mods.append(mdx)
228  keyvals['modules_unload'] = ','.join(mods)
229  except KeyError:
230  keyvals['modules_unload'] = md[1]
231  break
232  # check for modules required by the middleware
233  for md in midlog['parameters']:
234  if "modules" == md[0]:
235  try:
236  if keyvals['modules'] is not None:
237  # append these modules to those
238  mods = md[1].split(',')
239  newmods = keyvals['modules'].split(',')
240  for mdx in newmods:
241  mods.append(mdx)
242  keyvals['modules'] = ','.join(mods)
243  except KeyError:
244  keyvals['modules'] = md[1]
245  break
246  except KeyError:
247  pass
248  except KeyError:
249  pass
250  except KeyError:
251  log['status'] = 1
252  log['stderr'] = "Parent test build stage was not provided"
253  return
254  # parse any provided options - these will override the defaults
255  cmds = {}
256  testDef.parseOptions(log, self.options, keyvals, cmds)
257  self.cmds = cmds
258  # now ready to execute the test - we are pointed at the middleware
259  # and have obtained the list of any modules associated with it. We need
260  # to change to the test location and begin executing, first saving
261  # our current location so we can return when done
262  cwd = os.getcwd()
263  os.chdir(location)
264  # did they give us a list of specific directories where the desired
265  # tests to be executed reside?
266  tests = []
267  if cmds['test_list'] is None:
268  try:
269  if cmds['test_dir'] is not None:
270  # pick up the executables from the specified directories
271  dirs = cmds['test_dir'].split()
272  for dr in dirs:
273  dr = dr.strip()
274  # remove any commas and quotes
275  dr = dr.replace('\"','')
276  dr = dr.replace(',','')
277  for dirName, subdirList, fileList in os.walk(dr):
278  for fname in fileList:
279  # see if this is an executable
280  filename = os.path.abspath(os.path.join(dirName,fname))
281  if os.path.isfile(filename) and os.access(filename, os.X_OK):
282  # add this file to our list of tests to execute
283  tests.append(filename)
284  else:
285  # get the list of executables from this directory and any
286  # subdirectories beneath it
287  for dirName, subdirList, fileList in os.walk("."):
288  for fname in fileList:
289  # see if this is an executable
290  filename = os.path.abspath(os.path.join(dirName,fname))
291  if os.path.isfile(filename) and os.access(filename, os.X_OK):
292  # add this file to our list of tests to execute
293  tests.append(filename)
294  except KeyError:
295  # get the list of executables from this directory and any
296  # subdirectories beneath it
297  for dirName, subdirList, fileList in os.walk("."):
298  for fname in fileList:
299  # see if this is an executable
300  filename = os.path.abspath(os.path.join(dirName,fname))
301  if os.path.isfile(filename) and os.access(filename, os.X_OK):
302  # add this file to our list of tests to execute
303  tests.append(filename)
304  # If list of tests is provided, use list rather than grabbing all tests
305  else:
306  if cmds['test_dir'] is not None:
307  dirs = cmds['test_dir'].split()
308  else:
309  dirs = ['.']
310  for dr in dirs:
311  dr = dr.strip()
312  dr = dr.replace('\"','')
313  dr = dr.replace(',','')
314  for dirName, subdirList, fileList in os.walk(dr):
315  for fname_cmd in cmds['test_list'].split("\n"):
316  fname = fname_cmd.strip().split(" ")[0]
317  fname_args = " ".join(fname_cmd.strip().split(" ")[1:])
318  if fname not in fileList:
319  continue
320  filename = os.path.abspath(os.path.join(dirName,fname))
321  if os.path.isfile(filename) and os.access(filename, os.X_OK):
322  tests.append((filename+" "+fname_args).strip())
323  # check that we found something
324  if not tests:
325  log['status'] = 1
326  log['stderr'] = "No tests found"
327  os.chdir(cwd)
328  return
329  # get the "skip" exit status
330  skipStatus = int(cmds['skipped'])
331  # assemble the command
332  cmdargs = [cmds['command']]
333 
334  # Add support for using job_name with mpiexec
335  if (cmds['command'] == 'mpiexec' or cmds['command'] == 'mpiexec.hydra' or cmds['command'] == 'mpirun') and cmds['job_name'] is not None:
336  if cmds['options'] is None or (cmds['options'] is not None and '-bootstrap slurm' not in cmds['options']):
337  # Check if this is a negative test using fail_tests=ini_check
338  if cmds['fail_tests'] is not None and 'ini_check' in cmds['fail_tests']:
339  log['status'] = 0
340  # log the results directly since this will be marked as a pass
341  testDef.logger.verbose_print('stdout: ' + "%s used, but \"-bootstrap slurm\" not in options" % cmds['command'])
342  else:
343  log['status'] = 1
344  log['stderr'] = "%s used, but \"-bootstrap slurm\" not in options" % cmds['command']
345  os.chdir(cwd)
346  return
347  cmdargs.append("-bootstrap-exec-args")
348  cmdargs.append("--job-name=%s"%cmds['job_name'])
349  elif cmds['command'] == 'srun' and cmds['job_name'] is not None:
350  cmdargs.append("--job-name")
351  cmdargs.append(cmds['job_name'])
352 
353  if cmds['options'] is not None:
354  for op in cmds['options'].split():
355  cmdargs.append(op)
356  if (cmds['command'] == 'mpiexec' or cmds['command'] == 'mpiexec.hydra' or cmds['command'] == 'mpirun') and cmds['np'] is not None:
357  cmdargs.append("-np")
358  cmdargs.append(cmds['np'])
359  elif cmds['command'] == 'srun' and cmds['np'] is not None:
360  cmdargs.append("-n")
361  cmdargs.append(cmds['np'])
362  if cmds['hostfile'] is not None:
363  cmdargs.append("-hostfile")
364  cmdargs.append(cmds['hostfile'])
365  # cycle thru the list of tests and execute each of them
366  log['testresults'] = []
367  finalStatus = 0
368  finalError = ""
369  numTests = 0
370  numPass = 0
371  numSkip = 0
372  numFail = 0
373  if cmds['max_num_tests'] is not None:
374  maxTests = int(cmds['max_num_tests'])
375  else:
376  maxTests = 10000000
377 
378  # unload modules that were removed during the middleware or test build
379  usedModuleUnload = False
380  try:
381  if cmds['modules_unload'] is not None:
382  status,stdout,stderr = testDef.modcmd.unloadModules(cmds['modules_unload'], testDef)
383  if 0 != status:
384  log['status'] = status
385  log['stderr'] = stderr
386  os.chdir(cwd)
387  return
388  usedModuleUnload = True
389  except KeyError:
390  # not required to provide a module to unload
391  pass
392  # Load modules that were required during the middleware or test build
393  usedModule = False
394  try:
395  if cmds['modules'] is not None:
396  status,stdout,stderr = testDef.modcmd.loadModules(cmds['modules'], testDef)
397  if 0 != status:
398  log['status'] = status
399  log['stderr'] = stderr
400  os.chdir(cwd)
401  return
402  usedModule = True
403  except KeyError:
404  # not required to provide a module
405  pass
406 
407  fail_tests = cmds['fail_tests']
408  if fail_tests is not None:
409  fail_tests = [t.strip() for t in fail_tests.split("\n")]
410  else:
411  fail_tests = []
412  for i,t in enumerate(fail_tests):
413  for t2 in tests:
414  if (t2.split(" ")[0].split("/")[-1]+" "+" ".join(t2.split(" ")[1:])).strip() == t.strip():
415  fail_tests[i] = t2
416  fail_returncodes = cmds['fail_returncodes']
417  if fail_returncodes is not None:
418  fail_returncodes = [int(t.strip()) for t in fail_returncodes.split("\n")]
419 
420  if fail_tests is None:
421  expected_returncodes = {test:0 for test in tests}
422  else:
423  if fail_returncodes is None:
424  expected_returncodes = {test:(None if test in fail_tests else 0) for test in tests}
425  else:
426  fail_returncodes = {test:rtncode for test,rtncode in zip(fail_tests,fail_returncodes)}
427  expected_returncodes = {test:(fail_returncodes[test] if test in fail_returncodes else 0) for test in tests}
428 
429  # Allocate cluster
430  self.allocated = False
431  if cmds['allocate_cmd'] is not None and cmds['deallocate_cmd'] is not None:
432  self.allocated = True
433  allocate_cmdargs = shlex.split(cmds['allocate_cmd'])
434  _status,_stdout,_stderr,_time = testDef.execmd.execute(cmds, allocate_cmdargs, testDef)
435  if 0 != _status:
436  log['status'] = _status
437  log['stderr'] = _stderr
438  os.chdir(cwd)
439  return
440 
441  # Execute all tests
442  for test in tests:
443  # Skip tests that are in "skip_tests" ini input
444  if cmds['skip_tests'] is not None and (test.split(" ")[0].split('/')[-1]+" "+" ".join(test.split(" ")[1:])).strip() in [st.strip() for st in cmds['skip_tests'].split()]:
445  numTests += 1
446  numSkip += 1
447  if numTests == maxTests:
448  break
449  continue
450  testLog = {'test':test}
451  cmdargs.extend(shlex.split(test))
452  testLog['cmd'] = " ".join(["\"%s\"" % cmdarg if " " in cmdarg else cmdarg for cmdarg in cmdargs])
453 
454  harass_exec_ids = testDef.harasser.start(testDef)
455 
456  harass_check = testDef.harasser.check(harass_exec_ids, testDef)
457  if harass_check is not None:
458  testLog['stderr'] = 'Not all harasser scripts started. These failed to start: ' \
459  + ','.join([h_info[1]['start_script'] for h_info in harass_check[0]])
460  testLog['time'] = sum([r_info[3] for r_info in harass_check[1]])
461  testLog['status'] = 1
462  finalStatus = 1
463  finalError = testLog['stderr']
464  numFail = numFail + 1
465  testDef.harasser.stop(harass_exec_ids, testDef)
466  continue
467 
468  status,stdout,stderr,time = testDef.execmd.execute(cmds, cmdargs, testDef)
469 
470  testDef.harasser.stop(harass_exec_ids, testDef)
471 
472  if ((expected_returncodes[test] is None and 0 == status) or (expected_returncodes[test] is not None and expected_returncodes[test] != status)) and skipStatus != status and 0 == finalStatus:
473  if expected_returncodes[test] == 0:
474  finalStatus = status
475  else:
476  finalStatus = 1
477  finalError = stderr
478  if (expected_returncodes[test] is None and 0 != status) or (expected_returncodes[test] == status):
479  numPass = numPass + 1
480  elif skipStatus == status:
481  numSkip = numSkip + 1
482  else:
483  numFail = numFail + 1
484  if expected_returncodes[test] == 0:
485  testLog['status'] = status
486  else:
487  if status == expected_returncodes[test]:
488  testLog['status'] = 0
489  else:
490  testLog['status'] = 1
491  testLog['stdout'] = stdout
492  testLog['stderr'] = stderr
493  testLog['time'] = time
494  log['testresults'].append(testLog)
495  cmdargs = cmdargs[:-1]
496  numTests = numTests + 1
497  if numTests == maxTests:
498  break
499 
500  # Deallocate cluster
501  if cmds['allocate_cmd'] is not None and cmds['deallocate_cmd'] is not None and self.allocated:
502  deallocate_cmdargs = shlex.split(cmds['deallocate_cmd'])
503  _status,_stdout,_stderr,_time = testDef.execmd.execute(cmds, deallocate_cmdargs, testDef)
504  if 0 != _status:
505  log['status'] = _status
506  log['stderr'] = _stderr
507  os.chdir(cwd)
508  return
509  self.allocated = False
510 
511  log['status'] = finalStatus
512  log['stderr'] = finalError
513  log['numTests'] = numTests
514  log['numPass'] = numPass
515  log['numSkip'] = numSkip
516  log['numFail'] = numFail
517 
518  # handle case where srun is used instead of mpirun for number of processes (np)
519  if cmds['command'] == 'srun':
520  num_tasks = None
521  num_nodes = None
522  num_tasks_per_node = None
523 
524  if '-n ' in cmds['options']:
525  num_tasks = str(cmds['options'].split('-n ')[1].split(' ')[0])
526  if '--ntasks=' in cmds['options']:
527  num_tasks = str(cmds['options'].split('--ntasks=')[1].split(' ')[0])
528  if '-N ' in cmds['options']:
529  num_nodes = str(cmds['options'].split('-N ')[1].split(' ')[0])
530  if '--nodes=' in cmds['options']:
531  num_nodes = str(cmds['options'].split('--nodes=')[1].split(' ')[0])
532  if '-w ' in cmds['options']:
533  num_nodes = str(len(cmds['options'].split('-w ')[1].split(' ')[0].split(',')))
534  if '--nodelist=' in cmds['options']:
535  num_nodes = str(len(cmds['options'].split('--nodelist=')[1].split(' ')[0].split(',')))
536  if '--ntasks-per-node=' in cmds['options']:
537  num_tasks_per_node = str(cmds['options'].split('--ntasks-per-node=')[1].split(' ')[0])
538 
539  if num_tasks is not None:
540  log['np'] = num_tasks
541  elif num_nodes is not None and num_tasks_per_node is not None:
542  try:
543  log['np'] = str(int(num_tasks_per_node)*int(num_nodes))
544  except:
545  log['np'] = None
546  else:
547  log['np'] = None
548  elif cmds['command'] == 'mpiexec' or cmds['command'] == 'mpiexec.hydra' or cmds['command'] == 'mpirun':
549  num_tasks = None
550  num_nodes = None
551  num_tasks_per_node = None
552 
553  if '-n ' in cmds['options']:
554  num_tasks = str(cmds['options'].split('-n ')[1].split(' ')[0])
555  if '-np ' in cmds['options']:
556  num_tasks = str(cmds['options'].split('-np ')[1].split(' ')[0])
557  if '-hosts ' in cmds['options']:
558  num_nodes = str(len(cmds['options'].split('-hosts ')[1].split(' ')[0]))
559  if '-ppn ' in cmds['options']:
560  num_tasks_per_node = str(cmds['options'].split('-ppn ')[1].split(' ')[0])
561  if '-grr ' in cmds['options']:
562  num_tasks_per_node = str(cmds['options'].split('-grr ')[1].split(' ')[0])
563  if '-perhost ' in cmds['options']:
564  num_tasks_per_node = str(cmds['options'].split('-perhost ')[1].split(' ')[0])
565 
566  if num_tasks is not None:
567  log['np'] = num_tasks
568  elif num_nodes is not None and num_tasks_per_node is not None:
569  try:
570  log['np'] = str(int(num_tasks_per_node)*int(num_nodes))
571  except:
572  log['np'] = None
573  else:
574  log['np'] = None
575  else:
576  try:
577  log['np'] = cmds['np']
578  except KeyError:
579  log['np'] = None
580 
581  if usedModule:
582  # unload the modules before returning
583  status,stdout,stderr = testDef.modcmd.unloadModules(cmds['modules'], testDef)
584  if 0 != status:
585  log['status'] = status
586  log['stderr'] = stderr
587  os.chdir(cwd)
588  return
589  if usedModuleUnload:
590  status,stdout,stderr = testDef.modcmd.loadModules(cmds['modules_unload'], testDef)
591  if 0 != status:
592  log['status'] = status
593  log['stderr'] = stderr
594  os.chdir(cwd)
595  return
596  # if we added middleware to the paths, remove it
597  if midpath:
598  os.environ['PATH'] = oldbinpath
599  os.environ['LD_LIBRARY_PATH'] = oldldlibpath
600 
601  os.chdir(cwd)
602  return
def print_name
Definition: SLURM.py:91
def __init__
Definition: SLURM.py:45
def print_options
Definition: SLURM.py:94
def activate
Definition: SLURM.py:77
def deactivate
Definition: SLURM.py:83
def execute
Definition: SLURM.py:100