Commit 133ff957 authored by Maiterth, Matthias's avatar Maiterth, Matthias
Browse files

Merge branch 'refactor-args' into 'develop'

Refactor args/simulation config

See merge request !103
parents 34ea2f79 235c1fba
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
[flake8]
exclude = .git, __pycache__, venv*, simulation_results, third_party
exclude = .git, __pycache__, venv*, simulation_results, third_party, models
max-line-length = 120
+8 −8
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ Instructions for setup and usage are given below. An online documentation of Exa

## Setup environment

Note: Requires python3.11 or greater.
Note: Requires python3.12 or greater.

    pip install -e .

@@ -30,7 +30,7 @@ Note: Requires python3.11 or greater.
    # Frontier
    DATEDIR="date=2024-01-18"
    DPATH=~/data/frontier-sample-2024-01-18
    python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR
    python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR

## Open Telemetry dataset

@@ -46,7 +46,7 @@ For Adastra MI250 supercomputer, download 'AdastaJobsMI250_15days.parquet' from

For Google cluster trace v2

    python main.py --system gcloudv2 -f ~/data/gcloud/v2/google_cluster_data_2011_sample -ff 600
    python main.py --system gcloudv2 -f ~/data/gcloud/v2/google_cluster_data_2011_sample --ff 600

    # analyze dataset
    python -m raps.telemetry --system gcloudv2 -f ~/data/gcloud/v2/google_cluster_data_2011_sample -v
@@ -83,7 +83,7 @@ For Lumi
Lassen is one of the few datasets that has networking data. See `raps/dataloaders/lassen.py` for how to
get the datasets. To run a network simulation, use the following command:

    python main.py -f ~/data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit -ff 365d -t 12h --arrival poisson -net
    python main.py -f ~/data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit --ff 365d -t 12h --arrival poisson --net

## Snapshot of extracted workload data

@@ -140,10 +140,10 @@ This will dump a .npz file with a randomized name, e.g. ac23db.npz. Let's rename
There are three ways to modify replaying of telemetry data:

1. `--arrival`. Changing the arrival time distribution - replay cases will default to `--arrival prescribed`, where the jobs will be submitted exactly as they were submitted on the physical machine. This can be changed to `--arrival poisson` to change when the jobs arrive, which is especially useful in cases where there may be gaps in time, e.g., when the system goes down for several days, or the system is underutilized.
python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR --arrival poisson
python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --arrival poisson
2. `--policy`. Changing the way the jobs are scheduled. The `--policy` flag will be set by default to `replay` in cases where a telemetry file is provided, in which case the jobs will be scheduled according to the start times provided. Changing the `--policy` to `fcfs` or `backfill` will use the internal scheduler.

python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h
python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h

3. `--scale`. Changing the scale of each job in the telemetry data. The `--scale` flag will specify the maximum number of nodes for each job (generally set this to the max number of nodes of the smallest partition), and randomly select the number of nodes for each job from one to max nodes. This flag is useful when replaying telemetry from a larger system onto a smaller system.

@@ -151,11 +151,11 @@ python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR --pol

## Job-level power output example for replay of single job

    python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR --jid 1234567 -o
    python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --jid 1234567 -o

## Compute stats on telemetry data, e.g., average job arrival time

    python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR
    python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR

## Build and run Docker container

+2 −2
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@ import math
#
from raps.helpers import check_python_version
#
from raps.config import get_system_config
from raps.system_config import get_system_config
from raps.constants import OUTPUT_PATH, SEED
from raps.cooling import ThermoFluidsModel
from raps.ui import LayoutManager
@@ -45,7 +45,7 @@ from raps.stats import (
    print_formatted_report
)

from raps.args import args, args_dict
from raps.sim_config import args, args_dict

check_python_version()

+8 −20
Original line number Diff line number Diff line
@@ -8,18 +8,16 @@ stats for heterogeneous systems (e.g., LUMI, Setonix, Adastra).

from tqdm import tqdm
from mpi4py import MPI
from raps.utils import convert_to_seconds, next_arrival
from raps.utils import next_arrival
from raps.workload import Workload
from raps.telemetry import Telemetry
from raps.power import PowerManager, compute_node_power
from raps.flops import FLOPSManager
from raps.engine import Engine
from raps.ui import LayoutManager
from raps.config import get_system_config, CONFIG_PATH
from args import args
from raps.system_config import get_partition_configs
from raps.sim_config import args
import random
import os
import glob
from raps.helpers import check_python_version
check_python_version()

@@ -29,20 +27,10 @@ def main():
    rank = comm.Get_rank()
    size = comm.Get_size()

    # 1) Expand “partitions” (on rank 0) if the user used a glob:
    if rank == 0:
        partition_names = args.partitions
        if '*' in partition_names[0]:
            paths = glob.glob(os.path.join(CONFIG_PATH, partition_names[0]))
            partition_names = [os.path.join(*p.split(os.sep)[-2:]) for p in paths]
    else:
        partition_names = None

    # 2) Broadcast the final list of partition_names to everyone
    partition_names = comm.bcast(partition_names, root=0)

    # 3) Load configs for every partition (all ranks do this)
    configs = [get_system_config(p).get_legacy() for p in partition_names]
    multi_config = get_partition_configs(args.partitions)
    partition_names = multi_config.partition_names
    configs = [c.get_legacy() for c in multi_config.partitions]
    args_dicts = [{**vars(args), 'config': cfg} for cfg in configs]

    # 4) Each rank decides which partition‐indices it owns (round-robin):
@@ -122,12 +110,12 @@ def main():

    # 9) Compute timestep_start / timestep_end (all ranks agree):
    if args.fastforward:
        fastforward = convert_to_seconds(args.fastforward)
        fastforward = args.fastforward
    else:
        fastforward = 0

    if args.time:
        timesteps = convert_to_seconds(args.time)
        timesteps = args.time
    else:
        timesteps = 88200   # default; NOTE(review): 88200 s ≈ 24.5 h — if 24 hours was intended, this should be 86400

+10 −16
Original line number Diff line number Diff line
@@ -9,33 +9,27 @@ statistics for systems such as MIT Supercloud, Setonix, Adastra, and LUMI.

from tqdm import tqdm
from raps.stats import get_engine_stats, get_job_stats, get_scheduler_stats, get_network_stats
from raps.utils import convert_to_seconds, next_arrival
from raps.utils import next_arrival
from raps.workload import Workload
from raps.telemetry import Telemetry
from raps.power import PowerManager, compute_node_power
from raps.flops import FLOPSManager
from raps.engine import Engine
from raps.ui import LayoutManager
from raps.config import get_system_config, CONFIG_PATH
from raps.args import args
from raps.system_config import get_partition_configs
from raps.sim_config import args
import random
import os
import glob
from raps.helpers import check_python_version
check_python_version()


# Load configurations for each partition
partition_names = args.partitions
multi_config = get_partition_configs(args.partitions)
partition_names = multi_config.partition_names
configs = [c.get_legacy() for c in multi_config.partitions]
args.system = multi_config.system_name

print(args.partitions)
if '*' in args.partitions[0]:
    paths = glob.glob(os.path.join(CONFIG_PATH, args.partitions[0].replace("'", "")))
    partition_names = [os.path.join(*p.split(os.sep)[-2:]) for p in paths]

    args.system = partition_names[0].split(os.sep)[0]

configs = [get_system_config(partition).get_legacy() for partition in partition_names]
args_dicts = [
    {**vars(args), 'config': config, 'partition': partition_names[i]}
    for i, config in enumerate(configs)
@@ -123,11 +117,11 @@ for i, (config, ad) in enumerate(zip(configs, args_dicts)):

# Set simulation timesteps
if args.fastforward:
    fastfoward = convert_to_seconds(args.fastforward)
    fastforward = args.fastforward
else:
    fastforward = 0
if args.time:
    timesteps = convert_to_seconds(args.time)
    timesteps = args.time
else:
    timesteps = 88200  # Default; NOTE(review): 88200 s ≈ 24.5 h — if 24 hours was intended, this should be 86400

@@ -135,7 +129,7 @@ timestep_start = fastforward
timestep_end = timestep_start + timesteps

if args.time_delta:
    time_delta = convert_to_seconds(args.time_delta)
    time_delta = args.time_delta
else:
    time_delta = config['TRACE_QUANTA']

Loading