Loading raps/dataloaders/fugaku.py +3 −2 Original line number Diff line number Diff line Loading @@ -10,7 +10,7 @@ The --reschedule will compute submit times from Poisson distribution, instead of using the submit times given in F-Data. python main.py --system fugaku -f /path/to/21_04.parquet --reschedule --validate --reschedule python main.py --system fugaku -f /path/to/21_04.parquet --reschedule --validate """ import pandas as pd Loading Loading @@ -53,6 +53,7 @@ def load_data_from_df(df, **kwargs): reschedule = kwargs.get('reschedule') validate = kwargs.get('validate') jid = kwargs.get('jid', '*') config = kwargs.get('config') if fastforward: print(f"fast-forwarding {fastforward} seconds") Loading Loading @@ -81,7 +82,7 @@ def load_data_from_df(df, **kwargs): scheduled_nodes = None submit_time = row['adt'] if 'adt' in df.columns else earliest_submit_time if reschedule: # Let the scheduler reschedule the jobs time_offset = next_arrival(1/JOB_ARRIVAL_TIME) time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) else: time_offset = (submit_time - earliest_submit_time).total_seconds() # Compute time offset in seconds Loading raps/dataloaders/lassen.py +6 −7 Original line number Diff line number Diff line Loading @@ -16,7 +16,7 @@ Usage Instructions: # to analyze dataset python -m raps.telemetry -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen -v # to simulate the dataset # to simulate the dataset as submitted python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen # to reschedule Loading Loading @@ -55,7 +55,6 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): Loads data from pandas DataFrames and returns the extracted job info. """ config = kwargs.get('config') globals().update(config) jid = kwargs.get('jid', '*') reschedule = kwargs.get('reschedule') fastforward = kwargs.get('fastforward') Loading Loading @@ -85,7 +84,7 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): nodes_required = row['num_nodes'] wall_time = compute_wall_time(row['begin_time'], row['end_time']) samples = math.ceil(wall_time / TRACE_QUANTA) samples = math.ceil(wall_time / config['TRACE_QUANTA']) # Compute GPU power gpu_energy = node_data['gpu_energy'].sum() # Joules Loading @@ -95,8 +94,8 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): #gpu_power = gpu_energy / wall_time gpu_power_array = np.array([gpu_power] * samples) gpu_min_power = nodes_required * POWER_GPU_IDLE gpu_max_power = nodes_required * POWER_GPU_MAX gpu_min_power = nodes_required * config['POWER_GPU_IDLE'] gpu_max_power = nodes_required * config['POWER_GPU_MAX'] gpu_util = power_to_utilization(gpu_power_array, gpu_min_power, gpu_max_power) # GPU power can be 0: # Utilization is defined in the range of [0 to GPUS_PER_NODE]. Loading @@ -106,7 +105,7 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): # Compute CPU power from CPU usage time # CPU usage is reported per core, while we need it in the range [0 to CPUS_PER_NODE] cpu_usage = node_data['cpu_usage'].sum() / 1E9 / nodes_required / CORES_PER_CPU # seconds cpu_usage = node_data['cpu_usage'].sum() / 1E9 / nodes_required / config['CORES_PER_CPU'] # seconds cpu_usage_array = np.array([cpu_usage] * samples) cpu_util = cpu_usage_array / wall_time cpu_trace = cpu_util # * CPUS_PER_NODE Loading @@ -122,7 +121,7 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): if reschedule: # Let the scheduler reschedule the jobs scheduled_nodes = None time_offset = next_arrival(1/JOB_ARRIVAL_TIME) time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) else: scheduled_nodes = get_scheduled_nodes(row['allocation_id'], node_df) time_offset = compute_time_offset(row['begin_time'], earliest_begin_time) Loading raps/dataloaders/marconi100.py +11 −12 Original line number Diff line number Diff line Loading @@ -57,7 +57,6 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): The list of parsed jobs. """ config = kwargs.get('config') globals().update(config) min_time = kwargs.get('min_time', None) reschedule = kwargs.get('reschedule') fastforward = kwargs.get('fastforward') Loading Loading @@ -104,10 +103,10 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): else: cpu_power = jobs_df.loc[jidx, 'cpu_power_consumption'] cpu_power_array = cpu_power.tolist() cpu_min_power = nodes_required * POWER_CPU_IDLE * CPUS_PER_NODE cpu_max_power = nodes_required * POWER_CPU_MAX * CPUS_PER_NODE cpu_min_power = nodes_required * config['POWER_CPU_IDLE'] * config['CPUS_PER_NODE'] cpu_max_power = nodes_required * config['POWER_CPU_MAX'] * config['CPUS_PER_NODE'] cpu_util = power_to_utilization(cpu_power_array, cpu_min_power, cpu_max_power) cpu_trace = cpu_util * CPUS_PER_NODE cpu_trace = cpu_util * config['CPUS_PER_NODE'] node_power = (jobs_df.loc[jidx, 'node_power_consumption']).tolist() mem_power = (jobs_df.loc[jidx, 'mem_power_consumption']).tolist() Loading @@ -119,18 +118,18 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): mem_power = mem_power[:min_length] gpu_power = (node_power - cpu_power - mem_power - ([nodes_required * NICS_PER_NODE * POWER_NIC] * len(node_power)) - ([nodes_required * POWER_NVME] * len(node_power))) - ([nodes_required * config['NICS_PER_NODE'] * config['POWER_NIC']] * len(node_power)) - ([nodes_required * config['POWER_NVME']] * len(node_power))) gpu_power_array = gpu_power.tolist() gpu_min_power = nodes_required * POWER_GPU_IDLE * GPUS_PER_NODE gpu_max_power = nodes_required * POWER_GPU_MAX * GPUS_PER_NODE gpu_min_power = nodes_required * config['POWER_GPU_IDLE'] * config['GPUS_PER_NODE'] gpu_max_power = nodes_required * config['POWER_GPU_MAX'] * config['GPUS_PER_NODE'] gpu_util = power_to_utilization(gpu_power_array, gpu_min_power, gpu_max_power) gpu_trace = gpu_util * GPUS_PER_NODE gpu_trace = gpu_util * config['GPUS_PER_NODE'] priority = int(jobs_df.loc[jidx, 'priority']) # wall_time = jobs_df.loc[i, 'run_time'] wall_time = gpu_trace.size * TRACE_QUANTA # seconds wall_time = gpu_trace.size * config['TRACE_QUANTA'] # seconds end_state = jobs_df.loc[jidx, 'job_state'] time_start = jobs_df.loc[jidx+1, 'start_time'] diff = time_start - time_zero Loading @@ -139,13 +138,13 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): time_offset = max(diff.total_seconds(), 0) else: # When extracting out a single job, run one iteration past the end of the job time_offset = UI_UPDATE_FREQ time_offset = config['UI_UPDATE_FREQ'] if fastforward: time_offset -= fastforward if reschedule: # Let the scheduler reschedule the jobs scheduled_nodes = None time_offset = next_arrival(1/JOB_ARRIVAL_TIME) time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) else: # Prescribed replay scheduled_nodes = (jobs_df.loc[jidx, 'nodes']).tolist() Loading Loading
raps/dataloaders/fugaku.py +3 −2 Original line number Diff line number Diff line Loading @@ -10,7 +10,7 @@ The --reschedule will compute submit times from Poisson distribution, instead of using the submit times given in F-Data. python main.py --system fugaku -f /path/to/21_04.parquet --reschedule --validate --reschedule python main.py --system fugaku -f /path/to/21_04.parquet --reschedule --validate """ import pandas as pd Loading Loading @@ -53,6 +53,7 @@ def load_data_from_df(df, **kwargs): reschedule = kwargs.get('reschedule') validate = kwargs.get('validate') jid = kwargs.get('jid', '*') config = kwargs.get('config') if fastforward: print(f"fast-forwarding {fastforward} seconds") Loading Loading @@ -81,7 +82,7 @@ def load_data_from_df(df, **kwargs): scheduled_nodes = None submit_time = row['adt'] if 'adt' in df.columns else earliest_submit_time if reschedule: # Let the scheduler reschedule the jobs time_offset = next_arrival(1/JOB_ARRIVAL_TIME) time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) else: time_offset = (submit_time - earliest_submit_time).total_seconds() # Compute time offset in seconds Loading
raps/dataloaders/lassen.py +6 −7 Original line number Diff line number Diff line Loading @@ -16,7 +16,7 @@ Usage Instructions: # to analyze dataset python -m raps.telemetry -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen -v # to simulate the dataset # to simulate the dataset as submitted python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen # to reschedule Loading Loading @@ -55,7 +55,6 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): Loads data from pandas DataFrames and returns the extracted job info. """ config = kwargs.get('config') globals().update(config) jid = kwargs.get('jid', '*') reschedule = kwargs.get('reschedule') fastforward = kwargs.get('fastforward') Loading Loading @@ -85,7 +84,7 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): nodes_required = row['num_nodes'] wall_time = compute_wall_time(row['begin_time'], row['end_time']) samples = math.ceil(wall_time / TRACE_QUANTA) samples = math.ceil(wall_time / config['TRACE_QUANTA']) # Compute GPU power gpu_energy = node_data['gpu_energy'].sum() # Joules Loading @@ -95,8 +94,8 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): #gpu_power = gpu_energy / wall_time gpu_power_array = np.array([gpu_power] * samples) gpu_min_power = nodes_required * POWER_GPU_IDLE gpu_max_power = nodes_required * POWER_GPU_MAX gpu_min_power = nodes_required * config['POWER_GPU_IDLE'] gpu_max_power = nodes_required * config['POWER_GPU_MAX'] gpu_util = power_to_utilization(gpu_power_array, gpu_min_power, gpu_max_power) # GPU power can be 0: # Utilization is defined in the range of [0 to GPUS_PER_NODE]. Loading @@ -106,7 +105,7 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): # Compute CPU power from CPU usage time # CPU usage is reported per core, while we need it in the range [0 to CPUS_PER_NODE] cpu_usage = node_data['cpu_usage'].sum() / 1E9 / nodes_required / CORES_PER_CPU # seconds cpu_usage = node_data['cpu_usage'].sum() / 1E9 / nodes_required / config['CORES_PER_CPU'] # seconds cpu_usage_array = np.array([cpu_usage] * samples) cpu_util = cpu_usage_array / wall_time cpu_trace = cpu_util # * CPUS_PER_NODE Loading @@ -122,7 +121,7 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): if reschedule: # Let the scheduler reschedule the jobs scheduled_nodes = None time_offset = next_arrival(1/JOB_ARRIVAL_TIME) time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) else: scheduled_nodes = get_scheduled_nodes(row['allocation_id'], node_df) time_offset = compute_time_offset(row['begin_time'], earliest_begin_time) Loading
raps/dataloaders/marconi100.py +11 −12 Original line number Diff line number Diff line Loading @@ -57,7 +57,6 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): The list of parsed jobs. """ config = kwargs.get('config') globals().update(config) min_time = kwargs.get('min_time', None) reschedule = kwargs.get('reschedule') fastforward = kwargs.get('fastforward') Loading Loading @@ -104,10 +103,10 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): else: cpu_power = jobs_df.loc[jidx, 'cpu_power_consumption'] cpu_power_array = cpu_power.tolist() cpu_min_power = nodes_required * POWER_CPU_IDLE * CPUS_PER_NODE cpu_max_power = nodes_required * POWER_CPU_MAX * CPUS_PER_NODE cpu_min_power = nodes_required * config['POWER_CPU_IDLE'] * config['CPUS_PER_NODE'] cpu_max_power = nodes_required * config['POWER_CPU_MAX'] * config['CPUS_PER_NODE'] cpu_util = power_to_utilization(cpu_power_array, cpu_min_power, cpu_max_power) cpu_trace = cpu_util * CPUS_PER_NODE cpu_trace = cpu_util * config['CPUS_PER_NODE'] node_power = (jobs_df.loc[jidx, 'node_power_consumption']).tolist() mem_power = (jobs_df.loc[jidx, 'mem_power_consumption']).tolist() Loading @@ -119,18 +118,18 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): mem_power = mem_power[:min_length] gpu_power = (node_power - cpu_power - mem_power - ([nodes_required * NICS_PER_NODE * POWER_NIC] * len(node_power)) - ([nodes_required * POWER_NVME] * len(node_power))) - ([nodes_required * config['NICS_PER_NODE'] * config['POWER_NIC']] * len(node_power)) - ([nodes_required * config['POWER_NVME']] * len(node_power))) gpu_power_array = gpu_power.tolist() gpu_min_power = nodes_required * POWER_GPU_IDLE * GPUS_PER_NODE gpu_max_power = nodes_required * POWER_GPU_MAX * GPUS_PER_NODE gpu_min_power = nodes_required * config['POWER_GPU_IDLE'] * config['GPUS_PER_NODE'] gpu_max_power = nodes_required * config['POWER_GPU_MAX'] * config['GPUS_PER_NODE'] gpu_util = power_to_utilization(gpu_power_array, gpu_min_power, gpu_max_power) gpu_trace = gpu_util * GPUS_PER_NODE gpu_trace = gpu_util * config['GPUS_PER_NODE'] priority = int(jobs_df.loc[jidx, 'priority']) # wall_time = jobs_df.loc[i, 'run_time'] wall_time = gpu_trace.size * TRACE_QUANTA # seconds wall_time = gpu_trace.size * config['TRACE_QUANTA'] # seconds end_state = jobs_df.loc[jidx, 'job_state'] time_start = jobs_df.loc[jidx+1, 'start_time'] diff = time_start - time_zero Loading @@ -139,13 +138,13 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): time_offset = max(diff.total_seconds(), 0) else: # When extracting out a single job, run one iteration past the end of the job time_offset = UI_UPDATE_FREQ time_offset = config['UI_UPDATE_FREQ'] if fastforward: time_offset -= fastforward if reschedule: # Let the scheduler reschedule the jobs scheduled_nodes = None time_offset = next_arrival(1/JOB_ARRIVAL_TIME) time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) else: # Prescribed replay scheduled_nodes = (jobs_df.loc[jidx, 'nodes']).tolist() Loading