Commit ed96fa77 authored by David M. Rogers's avatar David M. Rogers
Browse files

Tested new SLURM resource detection.

parent 4fa6dae5
Loading
Loading
Loading
Loading
+13 −0
Original line number Diff line number Diff line
# Rule `rungpu`: launches the hello_jobstep binary through the `{mpirun}`
# placeholder, captures its output in rungpu.log, then sleeps for 20 seconds.
rungpu:
  resource:
      time: 1.0 # NOTE(review): units are not shown here - presumably minutes; confirm
      nrs: 1 # 1 resource set
      cpu: 14 # cpus requested per resource set
      gpu: 2 # gpus requested per resource set
      srun_attr: "--gpu-bind=closest" # extra flags appended to the generated srun command
  inp: [] # no input files
  out:
    - rungpu.log
  script: |
    {mpirun} ../hello_jobstep/hello_jobstep >rungpu.log
    sleep 20
+52 −0
Original line number Diff line number Diff line
# This example defines ten targets (run1 through run10); each one
# expects a rungpu.log output file inside its own run directory.
run1:
  dirname: run1
  out:
      - rungpu.log

run2:
  dirname: run2
  out:
      - rungpu.log

run3:
  dirname: run3
  out:
      - rungpu.log

run4:
  dirname: run4
  out:
      - rungpu.log

run5:
  dirname: run5
  out:
      - rungpu.log

run6:
  dirname: run6
  out:
      - rungpu.log

run7:
  dirname: run7
  out:
      - rungpu.log

run8:
  dirname: run8
  out:
      - rungpu.log

run9:
  dirname: run9
  out:
      - rungpu.log

run10:
  dirname: run10
  out:
      - rungpu.log
+22 −0
Original line number Diff line number Diff line
#!/bin/bash
#SBATCH -A stf006
#SBATCH -t 0:10:00
#SBATCH -N 2
#SBATCH -J gpu_test
#SBATCH -o gpu_test.%J

# Batch script for the GPU resource-detection test: creates run1..run10,
# clears logs from any previous run, then hands the rules and targets to pmake.
#
# The time limit is written as hours:minutes:seconds.  SLURM reads a
# two-field value such as "0:10" as minutes:seconds (10 seconds), which
# would not match the 10 passed to pmake below (presumably a time budget
# in minutes - confirm against the pmake command-line documentation).

### Build
# git clone https://code.ornl.gov/olcf/hello_jobstep.git
# cd hello_jobstep
# module load craype-accel-amd-gfx90a rocm
# make

### Load the python env
# module load cray-python
# python3 -m venv venv
# . venv/bin/activate
# (cd ../../ && pip install -e .)

for((i=1;i<=10;i++)); do mkdir -p run$i; done
# -f: do not complain on the first run, when no logs exist yet.
rm -f run[0-9]*/rungpu.log
pmake rules.yaml targets.yaml 10
+7 −3
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@
#      M.free(R)

from typing import Tuple
import subprocess
import os, re, time, logging
_tick = time.time


def tick():
    """Return the current `time.time()` reading converted to minutes."""
    return _tick() / 60.0
@@ -49,9 +50,11 @@ class ResourceSet:
    def min_nodes(self, CpN, GpN):
        """Return the fewest nodes that can hold all `self.nrs` resource sets.

        CpN and GpN are the cpus and gpus available on one node.  Returns
        None when a single resource set does not fit on a node, or when
        the request has cpu < 1 or gpu < 0.
        """
        unsatisfiable = (self.cpu < 1 or self.cpu > CpN
                         or self.gpu < 0 or self.gpu > GpN)
        if unsatisfiable:
            return None

        # Resource sets that fit on one node: bounded by its cpus, and
        # additionally by its gpus when the resource set asks for any.
        per_node = CpN // self.cpu
        if self.gpu > 0:
            per_node = min(per_node, GpN // self.gpu)

        # Round nrs / per_node up to a whole number of nodes.
        return (self.nrs + per_node - 1) // per_node

    def ready(self):
@@ -79,7 +82,7 @@ def parse_scontrol_show_node(s : str) -> Tuple[int, int]:
    return ncpu, ngpu

def runcmd(*args):
    """Run an external command once and return its captured standard output.

    The positional arguments are the program name followed by its
    arguments; they are handed to `subprocess.run` as a single sequence,
    so no shell is involved.  Output is decoded as UTF-8.

    Raises `subprocess.CalledProcessError` when the command exits with a
    non-zero status (check=True).
    """
    # Pass `args` as one sequence.  Unpacking it (`*args`) would spread the
    # command words over subprocess.run's own positional parameters.
    ret = subprocess.run(args, capture_output=True, check=True,
                         encoding="utf-8", universal_newlines=True)
    return ret.stdout

def machine(time):
@@ -118,7 +121,8 @@ class Machine:
        if self.batch == 'srun':
            nodes = R.min_nodes(self.CpN, self.GpN)
            assert nodes is not None
            srun = "srun --exclusive -N %d --cpu-bind=cores --ntasks %d -c %d -G %d" % (
            #srun = "srun --exclusive -N %d --cpu-bind=cores --ntasks %d -c %d -G %d" % (
            srun = "srun --exclusive -N %d --ntasks %d -c %d -G %d" % (
                         nodes, R.nrs*R.tasks, R.nrs*R.cpu//nodes, R.nrs*R.gpu)
            if R.srun_attr is not None:
                srun += " %s" % R.srun_attr