Commit f70a0931 authored by Maiterth, Matthias's avatar Maiterth, Matthias
Browse files

Merge branch 'develop' into 'net-dev'

# Conflicts:
#   raps/sim_config.py
#   raps/workloads/__init__.py
parents 0c1235b0 91c2da82
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -7,3 +7,5 @@ venv
simulation_results/
models/fmu-models
.shell-completion-cache
raps-output-*
ppo_raps_logs
+8 −0
Original line number Diff line number Diff line
@@ -77,6 +77,14 @@ For MIT Supercloud
    # Reinforcement learning test case
    raps train-rl --system mit_supercloud/part-cpu -f /opt/data/mit_supercloud/202201

For Microsoft Azure - 2017 Philly Traces

    # Synthetic
    python main.py run-parts -x philly -w multitenant

    # Telemetry replay
    python main.py run-parts -x philly -f /opt/data/philly/trace-data --start 2017-10-03T00:14:56Z  --end 2017-10-04T00:00

For Lumi

    # Synthetic test for Lumi:

config/perlmutter.yaml

0 → 100644
+51 −0
Original line number Diff line number Diff line
# Simulator configuration for a single system, split into three sections:
# hardware topology (`system`), power-model constants (`power`) and
# scheduler / synthetic-workload settings (`scheduler`).
# NOTE(review): this file states no units; every unit mentioned below is an
# assumption to confirm against the code that loads this config.
system:
  # Topology counts. If the loader multiplies the first three values, the
  # machine has 36 * 3 * 128 = 13824 nodes - TODO confirm.
  num_cdus: 36
  racks_per_cdu: 3
  nodes_per_rack: 128
  # Agrees with chassis_per_rack * rectifiers_per_chassis (8 * 4 = 32) and with
  # nodes_per_rack / nodes_per_rectifier (128 / 4 = 32).
  rectifiers_per_rack: 32
  chassis_per_rack: 8
  nodes_per_blade: 2
  switches_per_chassis: 4
  nics_per_node: 4
  rectifiers_per_chassis: 4
  nodes_per_rectifier: 4
  # Explicit empty lists (not null): no racks or nodes are excluded here.
  missing_racks: []
  down_nodes: []
  # Per-node compute resources.
  cpus_per_node: 1
  gpus_per_node: 4
  # 3.58e12 and 9.7e12; presumably peak FLOP/s per device - confirm.
  cpu_peak_flops: 3580000000000.0
  gpu_peak_flops: 9700000000000.0
  # Same factor for CPU and GPU; presumably scales peak FLOPS to a
  # sustained/achievable rate - confirm.
  cpu_fp_ratio: 0.667
  gpu_fp_ratio: 0.667
power:
  # Idle/max pairs per device; presumably watts - confirm.
  power_gpu_idle: 88
  power_gpu_max: 300
  power_cpu_idle: 90
  power_cpu_max: 280
  # Fixed per-component draws; presumably watts - confirm.
  power_mem: 74.26
  power_nic: 20
  power_nvme: 30
  power_switch: 250
  power_cdu: 8473.47
  # Presumably seconds between power-model updates - confirm.
  power_update_freq: 15
  rectifier_peak_threshold: 13670
  # Power-conversion stages: each has a constant loss term and an efficiency
  # expressed as a fraction (0.98 and 0.96).
  sivoc_loss_constant: 13
  sivoc_efficiency: 0.98
  rectifier_loss_constant: 17
  rectifier_efficiency: 0.96
  # Presumably currency per kWh - confirm.
  power_cost: 0.094
scheduler:
  # Fixed seed value; presumably makes synthetic workloads reproducible.
  seed: 42
  # Time-like values below are presumably in seconds (900 = 15 min,
  # 3600 = 1 h, 43200 = 12 h) - confirm.
  job_arrival_time: 900
  # NOTE(review): unit of mtbf is not stated here - confirm.
  mtbf: 11
  trace_quanta: 10
  min_wall_time: 3600
  max_wall_time: 43200
  ui_update_freq: 900
  max_nodes_per_job: 3000
  # Weights per job end state; the five values sum to 1.00.
  job_end_probs:
    COMPLETED: 0.63
    FAILED: 0.13
    CANCELLED: 0.12
    TIMEOUT: 0.11
    NODE_FAIL: 0.01
+51 −0
Original line number Diff line number Diff line
# Simulator configuration for a single-CDU, single-rack partition with
# 2 GPUs per node (the inline comments below assume 12G P100 GPUs).
# Sections: hardware topology (`system`), power-model constants (`power`)
# and scheduler / synthetic-workload settings (`scheduler`).
# NOTE(review): this file states no units; every unit mentioned below is an
# assumption to confirm against the code that loads this config.
# NOTE(review): unlike config/perlmutter.yaml, no `rectifiers_per_rack` or
# `seed` key is set here - confirm the loader's defaults are intended.
system:
  num_cdus: 1
  racks_per_cdu: 1
  # NOTE(review): 321 is not a multiple of chassis_per_rack *
  # rectifiers_per_chassis * nodes_per_rectifier (3 * 4 * 4 = 48) - confirm
  # the power model tolerates this.
  nodes_per_rack: 321
  chassis_per_rack: 3
  nodes_per_blade: 2
  switches_per_chassis: 4
  nics_per_node: 4
  rectifiers_per_chassis: 4
  nodes_per_rectifier: 4
  # Explicit empty lists (not null): no racks or nodes are excluded here.
  missing_racks: []
  down_nodes: []
  # Per-node compute resources.
  cpus_per_node: 2
  # NOTE(review): the CPU assumed below (Xeon E5-2690 v4) is a 14-core part
  # per Intel's spec sheet - verify that 20 is intended.
  cores_per_cpu: 20
  gpus_per_node: 2
  # 1.248e12 and 9.3e12; presumably peak FLOP/s per device - confirm.
  cpu_peak_flops: 1248000000000.0 # assume Xeon E5-2690v4 CPU 64-bit
  gpu_peak_flops: 9300000000000.0 # assume 12G P100 32-bit
  # Same factor for CPU and GPU; presumably scales peak FLOPS to a
  # sustained/achievable rate - confirm.
  cpu_fp_ratio: 0.667
  gpu_fp_ratio: 0.667
power:
  # Idle/max pairs per device; presumably watts - confirm.
  power_gpu_idle: 30
  power_gpu_max: 250
  power_cpu_idle: 90
  power_cpu_max: 270
  # Fixed per-component draws; presumably watts - confirm.
  power_mem: 74.26
  power_nvme: 30
  power_nic: 20
  power_cdu: 8473.47
  power_switch: 250
  # Presumably seconds between power-model updates - confirm.
  power_update_freq: 15
  rectifier_peak_threshold: 13670
  # Power-conversion stages: each has a constant loss term and an efficiency
  # expressed as a fraction (0.98 and 0.96).
  sivoc_loss_constant: 13
  sivoc_efficiency: 0.98
  rectifier_loss_constant: 17
  rectifier_efficiency: 0.96
  # Presumably currency per kWh - confirm.
  power_cost: 0.094
scheduler:
  # Boolean flag; presumably lets several jobs share one node - confirm.
  multitenant: true
  # Time-like values below are presumably in seconds (900 = 15 min,
  # 3600 = 1 h, 43200 = 12 h) - confirm.
  job_arrival_time: 900
  # NOTE(review): unit of mtbf is not stated here - confirm.
  mtbf: 11
  trace_quanta: 20
  min_wall_time: 3600
  max_wall_time: 43200
  ui_update_freq: 900
  # 192 is below nodes_per_rack (321), so no single job spans the partition.
  max_nodes_per_job: 192
  # Weights per job end state; the five values sum to 1.00.
  job_end_probs:
    COMPLETED: 0.63
    FAILED: 0.13
    CANCELLED: 0.12
    TIMEOUT: 0.11
    NODE_FAIL: 0.01
+51 −0
Original line number Diff line number Diff line
# Simulator configuration for a single-CDU, single-rack partition with
# 8 GPUs per node (the inline comments below assume 24G P40 GPUs).
# Sections: hardware topology (`system`), power-model constants (`power`)
# and scheduler / synthetic-workload settings (`scheduler`).
# NOTE(review): this file states no units; every unit mentioned below is an
# assumption to confirm against the code that loads this config.
# NOTE(review): unlike config/perlmutter.yaml, no `rectifiers_per_rack` or
# `seed` key is set here - confirm the loader's defaults are intended.
system:
  num_cdus: 1
  racks_per_cdu: 1
  # NOTE(review): 231 is not a multiple of chassis_per_rack *
  # rectifiers_per_chassis * nodes_per_rectifier (3 * 4 * 4 = 48) - confirm
  # the power model tolerates this.
  nodes_per_rack: 231
  chassis_per_rack: 3
  nodes_per_blade: 2
  switches_per_chassis: 4
  nics_per_node: 4
  rectifiers_per_chassis: 4
  nodes_per_rectifier: 4
  # Explicit empty lists (not null): no racks or nodes are excluded here.
  missing_racks: []
  down_nodes: []
  # Per-node compute resources.
  cpus_per_node: 2
  # NOTE(review): the CPU assumed below (Xeon E5-2690 v4) is a 14-core part
  # per Intel's spec sheet - verify that 20 is intended.
  cores_per_cpu: 20
  gpus_per_node: 8
  # 1.248e12 and 1.2e13; presumably peak FLOP/s per device - confirm.
  cpu_peak_flops: 1248000000000.0  # assume Xeon E5-2690v4 CPU 64-bit
  gpu_peak_flops: 12000000000000.0 # assume 24G P40 32-bit
  # Same factor for CPU and GPU; presumably scales peak FLOPS to a
  # sustained/achievable rate - confirm.
  cpu_fp_ratio: 0.667
  gpu_fp_ratio: 0.667
power:
  # Idle/max pairs per device; presumably watts - confirm.
  power_gpu_idle: 50
  power_gpu_max: 250
  power_cpu_idle: 90
  power_cpu_max: 270
  # Fixed per-component draws; presumably watts - confirm.
  power_mem: 74.26
  power_nvme: 30
  power_nic: 20
  power_cdu: 8473.47
  power_switch: 250
  # Presumably seconds between power-model updates - confirm.
  power_update_freq: 15
  rectifier_peak_threshold: 13670
  # Power-conversion stages: each has a constant loss term and an efficiency
  # expressed as a fraction (0.98 and 0.96).
  sivoc_loss_constant: 13
  sivoc_efficiency: 0.98
  rectifier_loss_constant: 17
  rectifier_efficiency: 0.96
  # Presumably currency per kWh - confirm.
  power_cost: 0.094
scheduler:
  # Boolean flag; presumably lets several jobs share one node - confirm.
  multitenant: true
  # Time-like values below are presumably in seconds (900 = 15 min,
  # 3600 = 1 h, 43200 = 12 h) - confirm.
  job_arrival_time: 900
  # NOTE(review): unit of mtbf is not stated here - confirm.
  mtbf: 11
  trace_quanta: 20
  min_wall_time: 3600
  max_wall_time: 43200
  ui_update_freq: 900
  # 192 is below nodes_per_rack (231), so no single job spans the partition.
  max_nodes_per_job: 192
  # Weights per job end state; the five values sum to 1.00.
  job_end_probs:
    COMPLETED: 0.63
    FAILED: 0.13
    CANCELLED: 0.12
    TIMEOUT: 0.11
    NODE_FAIL: 0.01
Loading