Commit 6051bc89 authored by David M. Rogers
Browse files

Updated srun command.

parent e103d0c1
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -49,7 +49,7 @@ def must_generate(dirname, tname, types):
        f = dirname / f
        if f.exists() and f.stat().st_mtime > mtime:
            print("File %s is newer than %s - re-running %s"%
                        (f, p, nrule.params['rulename']))
                        (f, fname, nrule.params['rulename']))
            return True
    return False

+33 −19
Original line number Diff line number Diff line
@@ -36,22 +36,20 @@ class ResourceSet:
        self.jsrun_attr = jsrun_attr
        self.srun_attr  = srun_attr

        self.jsrun = "jsrun --nrs %d -a %d -g %d -c %d" % (
                      self.nrs, self.tasks, self.gpu, self.cpu)
        if self.jsrun_attr is not None:
            self.jsrun += " %s" % self.jsrun_attr

        self.srun = "srun --exclusive --ntasks %d -c %d -G %d" % (
                     self.nrs*self.tasks, self.cpu/self.tasks, self.nrs*self.gpu)
        if self.srun_attr is not None:
            self.srun += " %s" % self.srun_attr

        self.local = ""
    def __str__(self):
        """Return the same text as repr() for this object."""
        text = repr(self)
        return text
    def __repr__(self):
        """Format time, nrs, cpu, gpu, tasks and both launcher attribute strings."""
        template = ('ResourceSet("%f, %d, %d, gpu=%d, tasks=%d, '
                    'jsrun_attr=%s, srun_attr=%s")')
        fields = (
            self.time,
            self.nrs,
            self.cpu,
            self.gpu,
            self.tasks,
            self.jsrun_attr,
            self.srun_attr,
        )
        return template % fields
    def min_nodes(self, CpN, GpN):
        """Return the minimum number of nodes needed to run this resource set.

        CpN is the CPU count per node and GpN the GPU count per node.
        Returns None when one resource set cannot fit on a node (cpu > CpN
        or gpu > GpN) or when cpu < 1 or gpu < 0.
        """
        fits_on_node = (self.cpu <= CpN and self.gpu <= GpN
                        and self.cpu >= 1 and self.gpu >= 0)
        if not fits_on_node:
            return None

        # Largest number of resource sets a single node can host.
        per_node = CpN // self.cpu
        if self.gpu > 0:
            per_node = min(per_node, GpN // self.gpu)

        # Ceiling of nrs / per_node.
        whole, leftover = divmod(self.nrs, per_node)
        return whole + 1 if leftover else whole

    def ready(self):
        self.ready_time = tick()
@@ -63,7 +61,7 @@ class ResourceSet:
def machine(time):
    # Rhea
    if 'SLURM_JOB_NUM_NODES' in os.environ:
        return Machine(time, int(os.environ['SLURM_JOB_NUM_NODES']), 0, 16, "srun") # or 32 (2-thread?)
        return Machine(time, int(os.environ['SLURM_JOB_NUM_NODES']), 0, 32, "srun")

    # Summit
    if 'LSB_MAX_NUM_PROCESSORS' in os.environ:
@@ -77,27 +75,43 @@ class Machine:
    # N = Nodes
    # GpN = GPU / Node
    # CpN = CPU / Node
    def __init__(self, time, N, GpN, CpN, launcher):
    def __init__(self, time, N, GpN, CpN, batch):
        self.end_time = tick() + time
        self.GpN = GpN
        self.CpN = CpN
        self.launcher = launcher
        self.batch = batch

        self.N = [[GpN, CpN] for i in range(N)]

    def launcher(self, R):
        """Build the launch command prefix for resource set R.

        The command depends on self.batch:
          'srun'  -- "srun --exclusive -N .. --ntasks .. -c .. -G ..", where
                     the node count comes from R.min_nodes(self.CpN, self.GpN)
                     and the -c value is R.nrs*R.cpu//nodes.
          'jsrun' -- "jsrun --nrs .. -a .. -g .. -c .." taken directly from R.

        The matching R.srun_attr / R.jsrun_attr is appended after a space
        when it is not None. Any other self.batch raises IndexError.
        """
        if self.batch == 'jsrun':
            command = "jsrun --nrs %d -a %d -g %d -c %d" % (
                R.nrs, R.tasks, R.gpu, R.cpu)
            extra = R.jsrun_attr
        elif self.batch == 'srun':
            node_count = R.min_nodes(self.CpN, self.GpN)
            # Scheduling is expected to have rejected impossible requests.
            assert node_count is not None
            total_tasks = R.nrs * R.tasks
            cpu_arg = R.nrs * R.cpu // node_count
            total_gpus = R.nrs * R.gpu
            command = "srun --exclusive -N %d --ntasks %d -c %d -G %d" % (
                node_count, total_tasks, cpu_arg, total_gpus)
            extra = R.srun_attr
        else:
            raise IndexError(self.batch)

        if extra is None:
            return command
        return command + " %s" % extra

    # Returns None if this allocation is impossible to schedule,
    # or a tuple (nodes, time) giving the max number of nodes and the avg. time taken.
    def possible(self, R):
        if R.time > self.end_time - tick():
            print("Unable to schedule due to time limitations.")
            return None
        if R.cpu > self.CpN or R.gpu > self.GpN or R.cpu < 1 or R.gpu < 0:
        nodes = R.min_nodes(self.CpN, self.GpN)
        if nodes is None:
            print("Unable to schedule due to inavailability of high gpu/cpu count nodes.")
            return None
        n = self.CpN//R.cpu
        if R.gpu > 0:
            n = min(n, self.GpN//R.gpu) # max nrs per node
        nodes = (R.nrs+n-1) // n
        if nodes > len(self.N):
            print("Unable to schedule due to inavailability of sufficient nodes.")
            return None
@@ -131,7 +145,7 @@ class Machine:
        if nrs != 0:
            return None

        cmd = getattr(R, self.launcher)
        cmd = self.launcher(R)
        if test:
            return cmd

+1 −0
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ from machine import ResourceSet
from fmatch import FMatch

jobscript = """
set -e
cd {dirname}
{setup}
%s