Loading raps/dataloaders/mit_supercloud/loader.py +11 −17 Original line number Diff line number Diff line Loading @@ -116,9 +116,10 @@ import re from tqdm import tqdm from typing import Dict, Union, Optional from collections import Counter from datetime import datetime, timezone from raps.job import job_dict, Job from raps.utils import summarize_ranges, next_arrival from raps.utils import summarize_ranges, next_arrival, WorkloadData from .utils import proc_cpu_series, proc_gpu_series, to_epoch from .utils import DEFAULT_START, DEFAULT_END Loading Loading @@ -585,21 +586,10 @@ def load_data(local_dataset_path, **kwargs): cpu_peak = cpu_cores_req / cores_per_cpu / cpus_per_node # Is this per CPU? cpu_tr = [min(x/cores_per_cpu/cpus_per_node, cpu_peak) for x in cpu_tr] if arrival == "poisson": job_arrival_time = config.get("JOB_ARRIVAL_TIME") submit_time = next_arrival(1 / job_arrival_time) start_time = submit_time end_time = None scheduled_nodes = None telemetry_start = 0 telemetry_end = 86640 else: # replay start_time = t0 - start_ts end_time = t1 - start_ts submit_time = rec.get("time_submit") - start_ts scheduled_nodes = rec.get("scheduled_nodes") telemetry_start = int(sl.time_start.min()) telemetry_end = int(sl.time_end.max()) current_job_dict = job_dict( nodes_required=nr, Loading Loading @@ -642,4 +632,8 @@ def load_data(local_dataset_path, **kwargs): for reason, count in skip_counts.items(): print(f"- {reason}: {count}") return jobs_list, telemetry_start, telemetry_end # min_overall_utime, max_overall_utime, args_namespace return WorkloadData( jobs=jobs_list, telemetry_start=0, telemetry_end=int(end_ts - start_ts), start_date=datetime.fromtimestamp(start_ts, timezone.utc), ) raps/multi_part_engine.py +4 −1 Original line number Diff line number Diff line Loading @@ -21,7 +21,7 @@ class MultiPartEngine: workloads_by_partition: dict[str, WorkloadData] = {} engines: dict[str, Engine] = {} timestep_start, timestep_end, time_delta = 0, 0, 0 time_delta = 0 for partition in sim_config.system_configs: name = partition.system_name engine, workload_data, time_delta = Engine.from_sim_config( Loading @@ -31,6 +31,9 @@ class MultiPartEngine: job.partition = name workloads_by_partition[name] = workload_data engines[name] = engine timestep_start = min(w.telemetry_start for w in workloads_by_partition.values()) timestep_end = min(w.telemetry_end for w in workloads_by_partition.values()) total_initial_jobs = sum(len(j.jobs) for j in workloads_by_partition.values()) for engine in engines.values(): engine.total_initial_jobs = total_initial_jobs Loading raps/run_sim.py +0 −6 Original line number Diff line number Diff line Loading @@ -245,12 +245,6 @@ def run_multi_part_sim(sim_config: SimConfig): multi_engine, workload_results, timestep_start, timestep_end, time_delta = \ MultiPartEngine.from_sim_config(sim_config) # TODO: The mit_supercloud dataloader seems to be outputting the wrong timesteps? mit_supercloud # is the only multi-partition system with replay, so just manually overriding the timesteps here # to fix it for now. The original multi-part-sim.py always started from timestep 0 as well. timestep_end = timestep_end - timestep_start timestep_start = 0 if sim_config.output: for part, engine in multi_engine.engines.items(): engine.telemetry.save_snapshot( Loading Loading
raps/dataloaders/mit_supercloud/loader.py +11 −17 Original line number Diff line number Diff line Loading @@ -116,9 +116,10 @@ import re from tqdm import tqdm from typing import Dict, Union, Optional from collections import Counter from datetime import datetime, timezone from raps.job import job_dict, Job from raps.utils import summarize_ranges, next_arrival from raps.utils import summarize_ranges, next_arrival, WorkloadData from .utils import proc_cpu_series, proc_gpu_series, to_epoch from .utils import DEFAULT_START, DEFAULT_END Loading Loading @@ -585,21 +586,10 @@ def load_data(local_dataset_path, **kwargs): cpu_peak = cpu_cores_req / cores_per_cpu / cpus_per_node # Is this per CPU? cpu_tr = [min(x/cores_per_cpu/cpus_per_node, cpu_peak) for x in cpu_tr] if arrival == "poisson": job_arrival_time = config.get("JOB_ARRIVAL_TIME") submit_time = next_arrival(1 / job_arrival_time) start_time = submit_time end_time = None scheduled_nodes = None telemetry_start = 0 telemetry_end = 86640 else: # replay start_time = t0 - start_ts end_time = t1 - start_ts submit_time = rec.get("time_submit") - start_ts scheduled_nodes = rec.get("scheduled_nodes") telemetry_start = int(sl.time_start.min()) telemetry_end = int(sl.time_end.max()) current_job_dict = job_dict( nodes_required=nr, Loading Loading @@ -642,4 +632,8 @@ def load_data(local_dataset_path, **kwargs): for reason, count in skip_counts.items(): print(f"- {reason}: {count}") return jobs_list, telemetry_start, telemetry_end # min_overall_utime, max_overall_utime, args_namespace return WorkloadData( jobs=jobs_list, telemetry_start=0, telemetry_end=int(end_ts - start_ts), start_date=datetime.fromtimestamp(start_ts, timezone.utc), )
raps/multi_part_engine.py +4 −1 Original line number Diff line number Diff line Loading @@ -21,7 +21,7 @@ class MultiPartEngine: workloads_by_partition: dict[str, WorkloadData] = {} engines: dict[str, Engine] = {} timestep_start, timestep_end, time_delta = 0, 0, 0 time_delta = 0 for partition in sim_config.system_configs: name = partition.system_name engine, workload_data, time_delta = Engine.from_sim_config( Loading @@ -31,6 +31,9 @@ class MultiPartEngine: job.partition = name workloads_by_partition[name] = workload_data engines[name] = engine timestep_start = min(w.telemetry_start for w in workloads_by_partition.values()) timestep_end = min(w.telemetry_end for w in workloads_by_partition.values()) total_initial_jobs = sum(len(j.jobs) for j in workloads_by_partition.values()) for engine in engines.values(): engine.total_initial_jobs = total_initial_jobs Loading
raps/run_sim.py +0 −6 Original line number Diff line number Diff line Loading @@ -245,12 +245,6 @@ def run_multi_part_sim(sim_config: SimConfig): multi_engine, workload_results, timestep_start, timestep_end, time_delta = \ MultiPartEngine.from_sim_config(sim_config) # TODO: The mit_supercloud dataloader seems to be outputting the wrong timesteps? mit_supercloud # is the only multi-partition system with replay, so just manually overriding the timesteps here # to fix it for now. The original multi-part-sim.py always started from timestep 0 as well. timestep_end = timestep_end - timestep_start timestep_start = 0 if sim_config.output: for part, engine in multi_engine.engines.items(): engine.telemetry.save_snapshot( Loading