Loading examples/srun_gpu/rules.yaml 0 → 100644 +13 −0 Original line number Diff line number Diff line rungpu: resource: time: 1.0 nrs: 1 # 1 resource set cpu: 14 gpu: 2 srun_attr: "--gpu-bind=closest" inp: [] out: - rungpu.log script: | {mpirun} ../hello_jobstep/hello_jobstep >rungpu.log sleep 20 examples/srun_gpu/targets.yaml 0 → 100644 +52 −0 Original line number Diff line number Diff line # This example defines ten independent targets (run1..run10), each requesting # rungpu.log, the output declared by the rungpu rule in rules.yaml. run1: dirname: run1 out: - rungpu.log run2: dirname: run2 out: - rungpu.log run3: dirname: run3 out: - rungpu.log run4: dirname: run4 out: - rungpu.log run5: dirname: run5 out: - rungpu.log run6: dirname: run6 out: - rungpu.log run7: dirname: run7 out: - rungpu.log run8: dirname: run8 out: - rungpu.log run9: dirname: run9 out: - rungpu.log run10: dirname: run10 out: - rungpu.log examples/srun_gpu/test.sh 0 → 100644 +22 −0 Original line number Diff line number Diff line #!/bin/bash #SBATCH -A stf006 #SBATCH -t 0:10 #SBATCH -N 2 #SBATCH -J gpu_test #SBATCH -o gpu_test.%J ### Build # git clone https://code.ornl.gov/olcf/hello_jobstep.git # cd hello_jobstep # module load craype-accel-amd-gfx90a rocm # make ### Load the python env # module load cray-python # python3 -m venv venv # . 
venv/bin/activate # (cd ../../ && pip install -e .) for((i=1;i<=10;i++)); do mkdir -p run$i; done rm run[0-9]*/rungpu.log pmake rules.yaml targets.yaml 10 src/pmake/machine.py +7 −3 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ # M.free(R) from typing import Tuple import subprocess import os, re, time, logging _tick = time.time tick = lambda: _tick()/60.0 Loading Loading @@ -49,9 +50,11 @@ class ResourceSet: def min_nodes(self, CpN, GpN): if self.cpu > CpN or self.gpu > GpN or self.cpu < 1 or self.gpu < 0: return None # determine `n` = max nrs per node n = CpN // self.cpu if self.gpu > 0: n = min(n, GpN//self.gpu) # max nrs per node n = min(n, GpN//self.gpu) # nrs / max nrs per node = min nodes return (self.nrs+n-1) // n def ready(self): Loading Loading @@ -79,7 +82,7 @@ def parse_scontrol_show_node(s : str) -> Tuple[int, int]: return ncpu, ngpu def runcmd(*args): ret = subprocess.run(*args, capture_output=True, check=True, encoding="utf-8", universal_newlines=True) ret = subprocess.run(args, capture_output=True, check=True, encoding="utf-8", universal_newlines=True) return ret.stdout def machine(time): Loading Loading @@ -118,7 +121,8 @@ class Machine: if self.batch == 'srun': nodes = R.min_nodes(self.CpN, self.GpN) assert nodes is not None srun = "srun --exclusive -N %d --cpu-bind=cores --ntasks %d -c %d -G %d" % ( #srun = "srun --exclusive -N %d --cpu-bind=cores --ntasks %d -c %d -G %d" % ( srun = "srun --exclusive -N %d --ntasks %d -c %d -G %d" % ( nodes, R.nrs*R.tasks, R.nrs*R.cpu//nodes, R.nrs*R.gpu) if R.srun_attr is not None: srun += " %s" % R.srun_attr Loading Loading
examples/srun_gpu/rules.yaml 0 → 100644 +13 −0 Original line number Diff line number Diff line rungpu: resource: time: 1.0 nrs: 1 # 1 resource set cpu: 14 gpu: 2 srun_attr: "--gpu-bind=closest" inp: [] out: - rungpu.log script: | {mpirun} ../hello_jobstep/hello_jobstep >rungpu.log sleep 20
examples/srun_gpu/targets.yaml 0 → 100644 +52 −0 Original line number Diff line number Diff line # This example defines ten independent targets (run1..run10), each requesting # rungpu.log, the output declared by the rungpu rule in rules.yaml. run1: dirname: run1 out: - rungpu.log run2: dirname: run2 out: - rungpu.log run3: dirname: run3 out: - rungpu.log run4: dirname: run4 out: - rungpu.log run5: dirname: run5 out: - rungpu.log run6: dirname: run6 out: - rungpu.log run7: dirname: run7 out: - rungpu.log run8: dirname: run8 out: - rungpu.log run9: dirname: run9 out: - rungpu.log run10: dirname: run10 out: - rungpu.log
examples/srun_gpu/test.sh 0 → 100644 +22 −0 Original line number Diff line number Diff line #!/bin/bash #SBATCH -A stf006 #SBATCH -t 0:10 #SBATCH -N 2 #SBATCH -J gpu_test #SBATCH -o gpu_test.%J ### Build # git clone https://code.ornl.gov/olcf/hello_jobstep.git # cd hello_jobstep # module load craype-accel-amd-gfx90a rocm # make ### Load the python env # module load cray-python # python3 -m venv venv # . venv/bin/activate # (cd ../../ && pip install -e .) for((i=1;i<=10;i++)); do mkdir -p run$i; done rm run[0-9]*/rungpu.log pmake rules.yaml targets.yaml 10
src/pmake/machine.py +7 −3 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ # M.free(R) from typing import Tuple import subprocess import os, re, time, logging _tick = time.time tick = lambda: _tick()/60.0 Loading Loading @@ -49,9 +50,11 @@ class ResourceSet: def min_nodes(self, CpN, GpN): if self.cpu > CpN or self.gpu > GpN or self.cpu < 1 or self.gpu < 0: return None # determine `n` = max nrs per node n = CpN // self.cpu if self.gpu > 0: n = min(n, GpN//self.gpu) # max nrs per node n = min(n, GpN//self.gpu) # nrs / max nrs per node = min nodes return (self.nrs+n-1) // n def ready(self): Loading Loading @@ -79,7 +82,7 @@ def parse_scontrol_show_node(s : str) -> Tuple[int, int]: return ncpu, ngpu def runcmd(*args): ret = subprocess.run(*args, capture_output=True, check=True, encoding="utf-8", universal_newlines=True) ret = subprocess.run(args, capture_output=True, check=True, encoding="utf-8", universal_newlines=True) return ret.stdout def machine(time): Loading Loading @@ -118,7 +121,8 @@ class Machine: if self.batch == 'srun': nodes = R.min_nodes(self.CpN, self.GpN) assert nodes is not None srun = "srun --exclusive -N %d --cpu-bind=cores --ntasks %d -c %d -G %d" % ( #srun = "srun --exclusive -N %d --cpu-bind=cores --ntasks %d -c %d -G %d" % ( srun = "srun --exclusive -N %d --ntasks %d -c %d -G %d" % ( nodes, R.nrs*R.tasks, R.nrs*R.cpu//nodes, R.nrs*R.gpu) if R.srun_attr is not None: srun += " %s" % R.srun_attr Loading