Loading graph.py +1 −1 Original line number Diff line number Diff line Loading @@ -49,7 +49,7 @@ def must_generate(dirname, tname, types): f = dirname / f if f.exists() and f.stat().st_mtime > mtime: print("File %s is newer than %s - re-running %s"% (f, p, nrule.params['rulename'])) (f, fname, nrule.params['rulename'])) return True return False Loading machine.py +33 −19 Original line number Diff line number Diff line Loading @@ -36,22 +36,20 @@ class ResourceSet: self.jsrun_attr = jsrun_attr self.srun_attr = srun_attr self.jsrun = "jsrun --nrs %d -a %d -g %d -c %d" % ( self.nrs, self.tasks, self.gpu, self.cpu) if self.jsrun_attr is not None: self.jsrun += " %s" % self.jsrun_attr self.srun = "srun --exclusive --ntasks %d -c %d -G %d" % ( self.nrs*self.tasks, self.cpu/self.tasks, self.nrs*self.gpu) if self.srun_attr is not None: self.srun += " %s" % self.srun_attr self.local = "" def __str__(self): return repr(self) def __repr__(self): return 'ResourceSet("%f, %d, %d, gpu=%d, tasks=%d, jsrun_attr=%s, srun_attr=%s")'%( self.time,self.nrs,self.cpu,self.gpu,self.tasks,self.jsrun_attr,self.srun_attr) # minimum number of nodes needed to run this (given CPU per node and GPU per node) def min_nodes(self, CpN, GpN): if self.cpu > CpN or self.gpu > GpN or self.cpu < 1 or self.gpu < 0: return None n = CpN // self.cpu if self.gpu > 0: n = min(n, GpN//self.gpu) # max nrs per node return (self.nrs+n-1) // n def ready(self): self.ready_time = tick() Loading @@ -63,7 +61,7 @@ class ResourceSet: def machine(time): # Rhea if 'SLURM_JOB_NUM_NODES' in os.environ: return Machine(time, int(os.environ['SLURM_JOB_NUM_NODES']), 0, 16, "srun") # or 32 (2-thread?) return Machine(time, int(os.environ['SLURM_JOB_NUM_NODES']), 0, 32, "srun") # Summit if 'LSB_MAX_NUM_PROCESSORS' in os.environ: Loading @@ -77,27 +75,43 @@ class Machine: # N = Nodes # GpN = GPU / Node # CpN = CPU / Node def __init__(self, time, N, GpN, CpN, launcher): def __init__(self, time, N, GpN, CpN, batch): self.end_time = tick() + time self.GpN = GpN self.CpN = CpN self.launcher = launcher self.batch = batch self.N = [[GpN, CpN] for i in range(N)] def launcher(self, R): if self.batch == 'srun': nodes = R.min_nodes(self.CpN, self.GpN) assert nodes is not None srun = "srun --exclusive -N %d --ntasks %d -c %d -G %d" % ( nodes, R.nrs*R.tasks, R.nrs*R.cpu//nodes, R.nrs*R.gpu) if R.srun_attr is not None: srun += " %s" % R.srun_attr return srun elif self.batch == 'jsrun': jsrun = "jsrun --nrs %d -a %d -g %d -c %d" % ( R.nrs, R.tasks, R.gpu, R.cpu) if R.jsrun_attr is not None: jsrun += " %s" % R.jsrun_attr return jsrun raise IndexError(self.batch) # Returns None if this allocation is impossible to schedule. # Or a tuple (nodes, time) indicating max #nodes and avg. time taken. def possible(self, R): if R.time > self.end_time - tick(): print("Unable to schedule due to time limitations.") return None if R.cpu > self.CpN or R.gpu > self.GpN or R.cpu < 1 or R.gpu < 0: nodes = R.min_nodes(self.CpN, self.GpN) if nodes is None: print("Unable to schedule due to inavailability of high gpu/cpu count nodes.") return None n = self.CpN//R.cpu if R.gpu > 0: n = min(n, self.GpN//R.gpu) # max nrs per node nodes = (R.nrs+n-1) // n if nodes > len(self.N): print("Unable to schedule due to inavailability of sufficient nodes.") return None Loading Loading @@ -131,7 +145,7 @@ class Machine: if nrs != 0: return None cmd = getattr(R, self.launcher) cmd = self.launcher(R) if test: return cmd Loading rules.py +1 −0 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from machine import ResourceSet from fmatch import FMatch jobscript = """ set -e cd {dirname} {setup} %s Loading Loading
graph.py +1 −1 Original line number Diff line number Diff line Loading @@ -49,7 +49,7 @@ def must_generate(dirname, tname, types): f = dirname / f if f.exists() and f.stat().st_mtime > mtime: print("File %s is newer than %s - re-running %s"% (f, p, nrule.params['rulename'])) (f, fname, nrule.params['rulename'])) return True return False Loading
machine.py +33 −19 Original line number Diff line number Diff line Loading @@ -36,22 +36,20 @@ class ResourceSet: self.jsrun_attr = jsrun_attr self.srun_attr = srun_attr self.jsrun = "jsrun --nrs %d -a %d -g %d -c %d" % ( self.nrs, self.tasks, self.gpu, self.cpu) if self.jsrun_attr is not None: self.jsrun += " %s" % self.jsrun_attr self.srun = "srun --exclusive --ntasks %d -c %d -G %d" % ( self.nrs*self.tasks, self.cpu/self.tasks, self.nrs*self.gpu) if self.srun_attr is not None: self.srun += " %s" % self.srun_attr self.local = "" def __str__(self): return repr(self) def __repr__(self): return 'ResourceSet("%f, %d, %d, gpu=%d, tasks=%d, jsrun_attr=%s, srun_attr=%s")'%( self.time,self.nrs,self.cpu,self.gpu,self.tasks,self.jsrun_attr,self.srun_attr) # minimum number of nodes needed to run this (given CPU per node and GPU per node) def min_nodes(self, CpN, GpN): if self.cpu > CpN or self.gpu > GpN or self.cpu < 1 or self.gpu < 0: return None n = CpN // self.cpu if self.gpu > 0: n = min(n, GpN//self.gpu) # max nrs per node return (self.nrs+n-1) // n def ready(self): self.ready_time = tick() Loading @@ -63,7 +61,7 @@ class ResourceSet: def machine(time): # Rhea if 'SLURM_JOB_NUM_NODES' in os.environ: return Machine(time, int(os.environ['SLURM_JOB_NUM_NODES']), 0, 16, "srun") # or 32 (2-thread?) return Machine(time, int(os.environ['SLURM_JOB_NUM_NODES']), 0, 32, "srun") # Summit if 'LSB_MAX_NUM_PROCESSORS' in os.environ: Loading @@ -77,27 +75,43 @@ class Machine: # N = Nodes # GpN = GPU / Node # CpN = CPU / Node def __init__(self, time, N, GpN, CpN, launcher): def __init__(self, time, N, GpN, CpN, batch): self.end_time = tick() + time self.GpN = GpN self.CpN = CpN self.launcher = launcher self.batch = batch self.N = [[GpN, CpN] for i in range(N)] def launcher(self, R): if self.batch == 'srun': nodes = R.min_nodes(self.CpN, self.GpN) assert nodes is not None srun = "srun --exclusive -N %d --ntasks %d -c %d -G %d" % ( nodes, R.nrs*R.tasks, R.nrs*R.cpu//nodes, R.nrs*R.gpu) if R.srun_attr is not None: srun += " %s" % R.srun_attr return srun elif self.batch == 'jsrun': jsrun = "jsrun --nrs %d -a %d -g %d -c %d" % ( R.nrs, R.tasks, R.gpu, R.cpu) if R.jsrun_attr is not None: jsrun += " %s" % R.jsrun_attr return jsrun raise IndexError(self.batch) # Returns None if this allocation is impossible to schedule. # Or a tuple (nodes, time) indicating max #nodes and avg. time taken. def possible(self, R): if R.time > self.end_time - tick(): print("Unable to schedule due to time limitations.") return None if R.cpu > self.CpN or R.gpu > self.GpN or R.cpu < 1 or R.gpu < 0: nodes = R.min_nodes(self.CpN, self.GpN) if nodes is None: print("Unable to schedule due to inavailability of high gpu/cpu count nodes.") return None n = self.CpN//R.cpu if R.gpu > 0: n = min(n, self.GpN//R.gpu) # max nrs per node nodes = (R.nrs+n-1) // n if nodes > len(self.N): print("Unable to schedule due to inavailability of sufficient nodes.") return None Loading Loading @@ -131,7 +145,7 @@ class Machine: if nrs != 0: return None cmd = getattr(R, self.launcher) cmd = self.launcher(R) if test: return cmd Loading
rules.py +1 −0 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from machine import ResourceSet from fmatch import FMatch jobscript = """ set -e cd {dirname} {setup} %s Loading