Commit c6d23273 authored by Brewer, Wes's avatar Brewer, Wes
Browse files

Merge branch 'fix-running-time' into 'hpl'

Fix running time

See merge request exadigit/raps!131
parents fea99646 81070941
Loading
Loading
Loading
Loading
+6 −6
Original line number | Diff line number | Diff line
@@ -499,7 +499,7 @@ class Engine:
        # update Running time
        for job in self.running:
            if job.current_state == JobState.RUNNING:
                job.running_time = self.current_timestep - job.start_time
                job.current_run_time = self.current_timestep - job.start_time

        # Stop the simulation if no more jobs are running or in the queue or in the job list.
        if autoshutdown and \
@@ -552,7 +552,7 @@ class Engine:

        for job in self.running:

            job.running_time = self.current_timestep - job.start_time
            job.current_run_time = self.current_timestep - job.start_time

            if job.current_state != JobState.RUNNING:
                raise ValueError(
@@ -561,15 +561,15 @@ class Engine:
                )
            else:  # if job.state == JobState.RUNNING:
                # Error checks
                if not replay and job.running_time > job.time_limit and job.end_time is not None:
                if not replay and job.current_run_time > job.time_limit and job.end_time is not None:
                    raise Exception(f"Job exceded time limit! "
                                    f"{job.running_time} > {job.time_limit}"
                                    f"{job.current_run_time} > {job.time_limit}"
                                    f"\n{job}"
                                    f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)"
                                    )
                if replay and job.running_time > job.expected_run_time:
                if replay and job.current_run_time > job.expected_run_time:
                    raise Exception(f"Job should have ended in replay! "
                                    f" {job.running_time} > {job.expected_run_time}"
                                    f" {job.current_run_time} > {job.expected_run_time}"
                                    f"\n{job}"
                                    f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)"
                                    )
+3 −3
Original line number | Diff line number | Diff line
@@ -180,7 +180,7 @@ class Job:
        self.trace_start_time = None  # Relative start time of the trace (to running time)
        self.trace_end_time = None    # Relative end time of the trace
        self.trace_quanta = None  # Trace quanta associated with the job # None means single value!
        self.running_time = 0     # Current running time updated when simulating
        self.current_run_time = 0     # Current running time updated when simulating

        # If a job dict was given, override the values from the job_dict:
        for key, value in job_dict.items():
@@ -232,7 +232,7 @@ class Job:
                f"trace_start_time={self.trace_start_time}, "
                f"trace_end_time={self.trace_end_time}, "
                f"trace_quanta={self.trace_quanta}, "
                f"running_time={self.running_time}, "
                f"current_run_time={self.current_run_time}, "
                f"power={self.power}, "
                f"power_history={self.power_history})")

@@ -296,7 +296,7 @@ class JobStatistics:
        self.account = job.account
        self.num_nodes = len(job.scheduled_nodes)
        self.scheduled_nodes = job.scheduled_nodes
        self.run_time = job.running_time
        self.run_time = job.current_run_time
        self.submit_time = job.submit_time
        self.start_time = job.start_time
        self.end_time = job.end_time
+11 −7
Original line number | Diff line number | Diff line
@@ -4,6 +4,7 @@ from raps.utils import get_current_utilization
from raps.network.fat_tree import node_id_to_host_name
from raps.network.torus3d import link_loads_for_job_torus, torus_host_from_real_index


def debug_print_trace(job, label: str = ""):
    """Print either the length (if iterable) or the value of job.gpu_trace."""
    if hasattr(job.gpu_trace, "__len__"):
@@ -138,6 +139,7 @@ def worst_link_util(loads, throughput):
            max_util = util
    return max_util


def get_link_util_stats(loads, throughput, top_n=10):
    """
    Calculates a distribution of link utilization stats.
@@ -164,11 +166,13 @@ def get_link_util_stats(loads, throughput, top_n=10):

    return stats


def max_throughput_per_tick(legacy_cfg: dict, trace_quanta: int) -> float:
    """Return bytes-per-tick throughput of a single link."""
    bw = legacy_cfg.get("NETWORK_MAX_BW") or 12.5e9
    return float(bw) * trace_quanta


def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False):
    """
    Simulates network congestion from a list of concurrently running jobs.
@@ -181,8 +185,8 @@ def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False):
    trace_quanta = jobs[0].trace_quanta if jobs else 0

    for job in jobs:
        # Assuming job.running_time is 0 for this static simulation
        job.running_time = 0
        # Assuming job.current_run_time is 0 for this static simulation
        job.current_run_time = 0
        job.trace_start_time = 0
        net_tx = get_current_utilization(job.ntx_trace, job)

+1 −2
Original line number | Diff line number | Diff line
@@ -55,7 +55,7 @@ def compute_node_power(cpu_util, gpu_util, net_util, config):
    power_gpu = gpu_util * config['POWER_GPU_MAX'] + \
        (config['GPUS_PER_NODE'] - gpu_util) * config['POWER_GPU_IDLE']

    if config.get("POWER_NIC_IDLE") != None and config.get("POWER_NIC_MAX") != None:
    if config.get("POWER_NIC_IDLE") is not None and config.get("POWER_NIC_MAX") is not None:
        power_nic = config['POWER_NIC_IDLE'] + \
            (config['POWER_NIC_MAX'] - config['POWER_NIC_IDLE']) * net_util
    else:
@@ -432,7 +432,6 @@ class PowerManager:
        jobs_power = self.update_power_state(scheduled_nodes, cpu_utils, gpu_utils, net_utils)

        for i, job in enumerate(running_jobs):
            # if job.running_time % self.config['TRACE_QUANTA'] == 0:
            job.power_history.append(jobs_power[i] * len(job.scheduled_nodes))

        # Update the power array UI component
+5 −5
Original line number | Diff line number | Diff line
@@ -191,10 +191,10 @@ class LayoutManager:
                nodes_display = col_nodelist

            if self.engine.downscale != 1:
                running_time_str = convert_seconds_to_hhmmss(job.running_time // self.engine.downscale) + \
                    f" +{job.running_time % self.engine.downscale}/{self.engine.downscale}s"
                running_time_str = convert_seconds_to_hhmmss(job.current_run_time // self.engine.downscale) + \
                    f" +{job.current_run_time % self.engine.downscale}/{self.engine.downscale}s"
            else:
                running_time_str = convert_seconds_to_hhmm(job.running_time)
                running_time_str = convert_seconds_to_hhmm(job.current_run_time)

            row = [
                str(job.id).zfill(5),
@@ -269,13 +269,13 @@ class LayoutManager:
        # Add data row with white values
        time_in_s = time // self.engine.downscale
        if (time_in_s < 946684800):  # Introducing Y2K into our codebase! Kek
            time_str = convert_seconds_to_hhmm(time_in_s)
            time_str = convert_seconds_to_hhmmss(time_in_s)
        else:
            # For the curious: If the simulation time in seconds is large than
            # unix timestamp for Jan 2000 this is a unix timestamp,
            time_str = f"{datetime.fromtimestamp(time_in_s).strftime('%Y-%m-%d %H:%M')}"
        if timestep_start != 0:  # append time simulated
            time_str += f"\nSim: {convert_seconds_to_hhmm(time_in_s - timestep_start)}"
            time_str += f"\nSim: {convert_seconds_to_hhmmss(time_in_s - timestep_start)}"

        row.append(time_str)
        row.append(str(nrun))
Loading