Commit 61d5c605 authored by Brewer, Wes
Browse files

Merge branch 'lassen' into 'main'

Add support for Lassen supercomputer and telemetry

See merge request !56
parents 5327aed8 23d40acc
Loading
Loading
Loading
Loading
+8 −10
Original line number Diff line number Diff line
@@ -37,7 +37,7 @@ Note: Requires python3.9 or greater.
Download `job_table.parquet` from https://zenodo.org/records/10127767

    # Marconi100
-    python main.py --system marconi100 -f ~/data/job_table.parquet
+    python main.py --system marconi100 -f ~/data/marconi100/job_table.parquet

## Snapshot of extracted workload data

@@ -55,15 +55,6 @@ given instead of the parquet files for more quickly running subsequent simulatio

    python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR

## OpenStreetMap Attribution

Map data used in this project is provided by [OpenStreetMap](https://www.openstreetmap.org/copyright) and is available under the Open Database License (ODbL). © OpenStreetMap contributors.

## Open-Meteo API Attribution

Weather data used in this project is provided by the [Open-Meteo API](https://open-meteo.com/en/docs). Open-Meteo offers free weather forecast data for various applications, and their API provides easy access to weather information without requiring user authentication.


## Build and run Docker container

    make docker_build && make docker_run
@@ -90,3 +81,10 @@ All new contributions must be made under both the MIT and Apache-2.0 licenses.
See LICENSE-MIT, LICENSE-APACHE, COPYRIGHT, NOTICE, and CONTRIBUTORS.txt for details.  

SPDX-License-Identifier: (Apache-2.0 OR MIT)  

## Attributions

Map data used in this project is provided by [OpenStreetMap](https://www.openstreetmap.org/copyright) and is available under the Open Database License (ODbL). © OpenStreetMap contributors.

Weather data used in this project is provided by the [Open-Meteo API](https://open-meteo.com/en/docs). Open-Meteo offers free weather forecast data for various applications, and their API provides easy access to weather information without requiring user authentication.
+18 −0
Original line number Diff line number Diff line
{
    "POWER_GPU_IDLE": 75,
    "POWER_GPU_MAX": 300,
    "POWER_CPU_IDLE": 47.25,
    "POWER_CPU_MAX": 252,
    "POWER_MEM": 74.26,
    "POWER_NIC": 21,
    "POWER_NVME": 45,
    "POWER_SWITCH": 250,
    "POWER_CDU": 0,
    "POWER_UPDATE_FREQ": 20,
    "RECTIFIER_PEAK_THRESHOLD": 13670,
    "SIVOC_LOSS_CONSTANT": 0,
    "SIVOC_EFFICIENCY": 1,
    "RECTIFIER_LOSS_CONSTANT": 0,
    "RECTIFIER_EFFICIENCY": 1,
    "POWER_COST": 0.094
}
+18 −0
Original line number Diff line number Diff line
{
    "SEED": 42,
    "JOB_ARRIVAL_TIME": 20,
    "MTBF": 11,
    "MAX_TIME": 88200,
    "TRACE_QUANTA": 20,
    "MIN_WALL_TIME": 3600,
    "MAX_WALL_TIME": 43200,
    "UI_UPDATE_FREQ": 3600,
    "MAX_NODES_PER_JOB": 3000,
    "JOB_END_PROBS": {
        "COMPLETED": 0.63,
        "FAILED": 0.13,
        "CANCELLED": 0.12,
        "TIMEOUT": 0.11,
        "NODE_FAIL": 0.01
    }
}
+20 −0
Original line number Diff line number Diff line
{
    "NUM_CDUS": 15,
    "RACKS_PER_CDU": 3,
    "NODES_PER_RACK": 18,
    "RECTIFIERS_PER_RACK": 5,
    "CHASSIS_PER_RACK": 1,
    "NODES_PER_BLADE": 1,
    "SWITCHES_PER_CHASSIS": 5,
    "NICS_PER_NODE": 2, 
    "RECTIFIERS_PER_CHASSIS": 5,
    "NODES_PER_RECTIFIER": 4,
    "MISSING_RACKS": [44],
    "DOWN_NODES": [],
    "CPUS_PER_NODE": 2,
    "GPUS_PER_NODE": 4,
    "CPU_PEAK_FLOPS": 396.8E9,
    "GPU_PEAK_FLOPS": 7.8E12,
    "CPU_FP_RATIO": 0.69,
    "GPU_FP_RATIO": 0.69
}
+5 −1
Original line number Diff line number Diff line
@@ -25,6 +25,7 @@ parser.add_argument('-d', '--debug', action='store_true', help='Enable debug mod
parser.add_argument('-e', '--encrypt', action='store_true', help='Encrypt any sensitive data in telemetry')
parser.add_argument('-n', '--numjobs', type=int, default=1000, help='Number of jobs to schedule')
parser.add_argument('-t', '--time', type=str, default=None, help='Length of time to simulate, e.g., 123, 123s, 27m, 3h, 7d')
parser.add_argument('-ff', '--fastforward', type=str, default=None, help='Fast-forward by time amount (uses same units as -t)')
parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose output')
parser.add_argument('--seed', action='store_true', help='Set random number seed for deterministic simulation')
parser.add_argument('-f', '--replay', nargs='+', type=str, help='Either: path/to/joblive path/to/jobprofile' + \
@@ -107,6 +108,9 @@ sc = Scheduler(TOTAL_NODES, DOWN_NODES, power_manager, flops_manager, layout_man
               cooling_model, **args_dict)

if args.replay:

    if args.fastforward: args.fastforward = convert_to_seconds(args.fastforward)

    td = Telemetry(**args_dict)

    # Try to extract date from given name to use as case directory
@@ -133,7 +137,7 @@ if args.replay:
    else:
        timesteps = int(max(job['wall_time'] + job['submit_time'] for job in jobs)) + 1

-    print(f'Running simulation for {timesteps} seconds')
+    print(f'Simulating {len(jobs)} jobs for {timesteps} seconds')
    time.sleep(1)

else:
Loading