diff --git a/experiments/mit-replay-24hrs.yaml b/experiments/mit-replay-24hrs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1357886f9d02cdb61e915e1f56b5711a4bf0fde0 --- /dev/null +++ b/experiments/mit-replay-24hrs.yaml @@ -0,0 +1,6 @@ +# python main.py run-multi-part experiments/mit-replay-24hrs.yaml +partitions: ["mit_supercloud/part-cpu", "mit_supercloud/part-gpu"] +replay: + - /opt/data/mit_supercloud/202201 +start: 2021-05-21T00:00 +end: 2021-05-22T00:00 diff --git a/experiments/mit-synthetic.yaml b/experiments/mit-synthetic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f68cd18030cc1a4ba1b3918b880083f9c275f94 --- /dev/null +++ b/experiments/mit-synthetic.yaml @@ -0,0 +1,3 @@ +# python main.py run-multi-part experiments/mit-synthetic.yaml +partitions: ["mit_supercloud/part-cpu", "mit_supercloud/part-gpu"] +workload: multitenant diff --git a/experiments/mit.yaml b/experiments/mit.yaml deleted file mode 100644 index 77815d4e246353a00c675a951ed327fbb0426264..0000000000000000000000000000000000000000 --- a/experiments/mit.yaml +++ /dev/null @@ -1,6 +0,0 @@ -system: mit_supercloud -partitions: ["mit_supercloud/part-cpu", "mit_supercloud/part-gpu"] -replay: - - /opt/data/mit_supercloud -start: 2021-05-21T13:00 -end: 2021-05-21T14:00 diff --git a/raps/dataloaders/mit_supercloud/loader.py b/raps/dataloaders/mit_supercloud/loader.py index e3103ba93bb35b8d3ec0a47bd462382340c54dc8..2c8dbc18a7669f41abb3d2b63b89068407f80783 100644 --- a/raps/dataloaders/mit_supercloud/loader.py +++ b/raps/dataloaders/mit_supercloud/loader.py @@ -117,8 +117,9 @@ from tqdm import tqdm from typing import Dict, Union, Optional from collections import Counter from datetime import datetime, timezone + from raps.job import job_dict, Job -from raps.utils import summarize_ranges, WorkloadData +from raps.utils import summarize_ranges, next_arrival, WorkloadData from .utils import proc_cpu_series, proc_gpu_series, to_epoch from .utils import DEFAULT_START, DEFAULT_END @@ -209,6 +210,8 @@ def load_data(local_dataset_path, **kwargs): jobs_list, sim_start_time, sim_end_time """ debug = kwargs.get("debug") + config = kwargs.get("config") + arrival = kwargs.get("arrival") NL_PATH = os.path.dirname(__file__) skip_counts = Counter() @@ -300,6 +303,17 @@ def load_data(local_dataset_path, **kwargs): cpu_only = (part == "part-cpu") mixed = (part == "part-gpu") + # handle single-partition configs (e.g., mit_supercloud.yaml) + if not cpu_only and not mixed: + gpus_per_node = config.get("GPUS_PER_NODE") + + if gpus_per_node == 0: + cpu_only = True + part = "part-cpu" + else: + mixed = True + part = "part-gpu" + # create nodelist mapping if cpu_only: with open(os.path.join(NL_PATH, "cpu_nodes.txt")) as f: @@ -516,7 +530,6 @@ def load_data(local_dataset_path, **kwargs): jobs_list = [] # Get CPUS_PER_NODE and GPUS_PER_NODE from config - config = kwargs.get('config', {}) cpus_per_node = config.get('CPUS_PER_NODE') cores_per_cpu = config.get('CORES_PER_CPU') # gpus_per_node = config.get('GPUS_PER_NODE') # Unused @@ -573,7 +586,10 @@ def load_data(local_dataset_path, **kwargs): cpu_peak = cpu_cores_req / cores_per_cpu / cpus_per_node # Is this per CPU? cpu_tr = [min(x/cores_per_cpu/cpus_per_node, cpu_peak) for x in cpu_tr] - submit_time = rec.get("time_submit", t0) - start_ts + start_time = t0 - start_ts + end_time = t1 - start_ts + submit_time = rec.get("time_submit") - start_ts + scheduled_nodes = rec.get("scheduled_nodes") current_job_dict = job_dict( nodes_required=nr, @@ -587,12 +603,12 @@ def load_data(local_dataset_path, **kwargs): nrx_trace=[], end_state=rec.get("state_end", "unknown"), id=jid, - scheduled_nodes=rec.get("scheduled_nodes"), + scheduled_nodes=scheduled_nodes, priority=rec.get("priority", 0), submit_time=submit_time, - time_limit=rec.get("time_limit", 0), - start_time=t0 - start_ts, - end_time=t1 - start_ts, + time_limit=rec.get("timelimit") * 60, + start_time=start_time, + end_time=end_time, expected_run_time=max(0, t1-t0), trace_time=len(cpu_tr)*quanta, trace_start_time=0, @@ -602,6 +618,16 @@ def load_data(local_dataset_path, **kwargs): job = Job(current_job_dict) jobs_list.append(job) + # Calculate min_overall_utime and max_overall_utime + # min_overall_utime = int(sl.time_submit.min()) + # max_overall_utime = int(sl.time_submit.max()) + + # args_namespace = SimpleNamespace( + # fastforward=min_overall_utime, + # system='mit_supercloud', + # time=max_overall_utime + # ) + print("\nSkipped jobs summary:") for reason, count in skip_counts.items(): print(f"- {reason}: {count}") diff --git a/raps/multi_part_engine.py b/raps/multi_part_engine.py index f211b8544d02d9cecabace278ee9483c34df2792..944ced9a31758c843e0d168a4c62a4144a0dbdac 100644 --- a/raps/multi_part_engine.py +++ b/raps/multi_part_engine.py @@ -21,7 +21,7 @@ class MultiPartEngine: workloads_by_partition: dict[str, WorkloadData] = {} engines: dict[str, Engine] = {} - timestep_start, timestep_end, time_delta = 0, 0, 0 + time_delta = 0 for partition in sim_config.system_configs: name = partition.system_name engine, workload_data, time_delta = Engine.from_sim_config( @@ -31,6 +31,9 @@ class MultiPartEngine: job.partition = name workloads_by_partition[name] = workload_data engines[name] = engine + timestep_start = min(w.telemetry_start for w in workloads_by_partition.values()) + timestep_end = min(w.telemetry_end for w in workloads_by_partition.values()) + total_initial_jobs = sum(len(j.jobs) for j in workloads_by_partition.values()) for engine in engines.values(): engine.total_initial_jobs = total_initial_jobs diff --git a/raps/run_sim.py b/raps/run_sim.py index ceb80a9a428f2850b3eb42a48fb86ebb5733e2b9..402aceaff3c7877647f31c7bf8ce8783174aa1f6 100644 --- a/raps/run_sim.py +++ b/raps/run_sim.py @@ -245,12 +245,6 @@ def run_multi_part_sim(sim_config: SimConfig): multi_engine, workload_results, timestep_start, timestep_end, time_delta = \ MultiPartEngine.from_sim_config(sim_config) - # TODO: The mit_supercloud dataloader seems to be outputting the wrong timesteps? mit_supercloud - # is the only multi-partition system with replay, so just manually overriding the timesteps here - # to fix it for now. The original multi-part-sim.py always started from timestep 0 as well. - timestep_end = timestep_end - timestep_start - timestep_start = 0 - if sim_config.output: for part, engine in multi_engine.engines.items(): engine.telemetry.save_snapshot(