Commit bfadf8c7 authored by Brewer, Wes's avatar Brewer, Wes
Browse files

Merge branch '65-adastra-dataloader-and-specific-raps-modifications' into 'main'

Resolve "Adastra dataloader and specific raps modifications"

Closes #65

See merge request !66
parents 57e3fc80 1962797f
Loading
Loading
Loading
Loading
+6 −1
Original line number Diff line number Diff line
@@ -34,11 +34,16 @@ Note: Requires python3.9 or greater.

## Open Telemetry dataset

Download `job_table.parquet` from https://zenodo.org/records/10127767
For the Marconi100 supercomputer, download `job_table.parquet` from https://zenodo.org/records/10127767

    # Marconi100
    python main.py --system marconi100 -f ~/data/marconi100/job_table.parquet 

For the Adastra MI250 supercomputer, download `AdastaJobsMI250_15days.parquet` from https://zenodo.org/records/14007065

    # Adastra MI250
    python main.py --system adastraMI250 -f AdastaJobsMI250_15days.parquet 


## Snapshot of extracted workload data

To reduce the expense of extracting the needed data from the telemetry parquet files,
+25 −0
Original line number Diff line number Diff line
{
    "COOLING_EFFICIENCY": 0.945,
    "WET_BULB_TEMP": 290.0,
    "ZIP_CODE": 37831,
    "COUNTRY_CODE": "US",
    "FMU_PATH": "models/Simulator_olcf5_base.fmu",
    "FMU_COLUMN_MAPPING": {
        "T_sec_r_C": "Rack Return Temperature (\u00b0C)",
        "T_sec_s_C": "Rack Supply Temperature (\u00b0C)",
        "p_sec_r_psig": "Rack Return Pressure (psig)",
        "p_sec_s_psig": "Rack Supply Pressure (psig)",
        "V_flow_sec_GPM": "Rack Flowrate (gpm)",
        "T_prim_r_C": "Facility Return Temperature (\u00b0C)",
        "T_prim_s_C": "Facility Supply Temperature (\u00b0C)",
        "p_prim_s_psig": "Facility Supply Pressure (psig)",
        "p_prim_r_psig": "Facility Return Pressure (psig)",
        "V_flow_prim_GPM": "Facility Flowrate (gpm)",
        "W_flow_CDUP_kW": "Work Done By CDUP (kW)"
    },
    "TEMPERATURE_KEY": "simulator_1_centralEnergyPlant_1_coolingTowerLoop_1_sources_Towb",
    "W_HTWPs_KEY": "simulator[1].centralEnergyPlant[1].hotWaterLoop[1].summary.W_flow_HTWP_kW",
    "W_CTWPs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CTWP_kW",
    "W_CTs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW"

}
+18 −0
Original line number Diff line number Diff line
{
    "POWER_GPU_IDLE": 44,
    "POWER_GPU_MAX": 238,
    "POWER_CPU_IDLE": 90,
    "POWER_CPU_MAX": 280,
    "POWER_MEM": 37.13,
    "POWER_NIC": 20,
    "POWER_NVME": 0,
    "POWER_SWITCH": 250,
    "POWER_CDU": 8473.47,
    "POWER_UPDATE_FREQ": 15,
    "RECTIFIER_PEAK_THRESHOLD": 13670,
    "SIVOC_LOSS_CONSTANT": 13,
    "SIVOC_EFFICIENCY": 0.98,
    "RECTIFIER_LOSS_CONSTANT": 17,
    "RECTIFIER_EFFICIENCY": 0.96,
    "POWER_COST": 0.094
}
+17 −0
Original line number Diff line number Diff line
{
    "SEED": 42,
    "JOB_ARRIVAL_TIME": 900,
    "MTBF": 11,
    "TRACE_QUANTA": 20,
    "MIN_WALL_TIME": 60,
    "MAX_WALL_TIME": 43200,
    "UI_UPDATE_FREQ": 900,
    "MAX_NODES_PER_JOB": 324,
    "JOB_END_PROBS": {
        "COMPLETED": 0.63,
        "FAILED": 0.13,
        "CANCELLED": 0.12,
        "TIMEOUT": 0.11,
        "NODE_FAIL": 0.01
    }
}
+20 −0
Original line number Diff line number Diff line
{
    "NUM_CDUS": 1,
    "RACKS_PER_CDU": 3,
    "NODES_PER_RACK": 128,
    "RECTIFIERS_PER_RACK": 32,
    "CHASSIS_PER_RACK": 8,
    "NODES_PER_BLADE": 2,
    "SWITCHES_PER_CHASSIS": 4,
    "NICS_PER_NODE": 4,
    "RECTIFIERS_PER_CHASSIS": 4,
    "NODES_PER_RECTIFIER": 4,
    "MISSING_RACKS": [],
    "DOWN_NODES": [356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383],
    "CPUS_PER_NODE": 1,
    "GPUS_PER_NODE": 8,
    "CPU_PEAK_FLOPS": 2048E9,
    "GPU_PEAK_FLOPS": 21.120000E12,
    "CPU_FP_RATIO": 0.667,
    "GPU_FP_RATIO": 0.667
}
Loading