Commit 1da4a195 authored by ArunavoDey's avatar ArunavoDey
Browse files

adding new files to branch arunavo

parent b328eb6d
Loading
Loading
Loading
Loading
+29 −2
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ the FMU cooling model by providing CDU-level power inputs to the cooling model,
and reporting the statistics back to the user. RAPS also has built-in plotting
capabilities to generate plots of power and cooling at the end of simulation runs.
An optional RAPS dashboard is also provided, which requires also running the RAPS server.
Instructions for setup and usage are given below. 
Instructions for setup and usage are given below. Online documentation for ExaDigiT, including a section on RAPS, is also available [here](https://exadigit.readthedocs.io/en/latest/).

## Setup environment

@@ -34,11 +34,16 @@ Note: Requires python3.9 or greater.

## Open Telemetry dataset

Download `job_table.parquet` from https://zenodo.org/records/10127767
For the Marconi100 supercomputer, download `job_table.parquet` from https://zenodo.org/records/10127767

    # Marconi100
    python main.py --system marconi100 -f ~/data/marconi100/job_table.parquet 

For the Adastra MI250 supercomputer, download `AdastaJobsMI250_15days.parquet` from https://zenodo.org/records/14007065

    # Adastra MI250
    python main.py --system adastraMI250 -f AdastaJobsMI250_15days.parquet 

## Snapshot of extracted workload data

To reduce the expense of extracting the needed data from the telemetry parquet files,
@@ -47,6 +52,28 @@ given instead of the parquet files for more quickly running subsequent simulatio

    python main.py -f jobs_2024-02-20_12-20-39.npz

## Support for multiple system partitions

Multi-partition systems are supported by running the `multi-part-sim.py` script, where a list of configurations can be specified using the `-x` flag as follows:

    python multi-part-sim.py -x setonix/part-cpu setonix/part-gpu

or simply:

    python multi-part-sim.py -x setonix/* # bash

    python multi-part-sim.py -x 'setonix/*' # zsh

This will simulate synthetic workloads on two partitions as defined in `config/setonix-cpu` and `config/setonix-gpu`. To replay telemetry workloads from another system, e.g., Marconi100's PM100 dataset, first create a .npz snapshot of the telemetry data, e.g., 

    python main.py --system marconi100 -f /path/to/marconi100/job_table.parquet

This will dump a .npz file with a randomized name, e.g., `ac23db.npz`. Rename this file to `pm100.npz` for clarity. Note: you can press Ctrl-C once the simulation starts. The `pm100.npz` file can then be used with `multi-part-sim.py` as follows:

    python multi-part-sim.py -x setonix/* -f pm100.npz --reschedule poisson --scale 192

The `--reschedule` flag will use the internal scheduler to determine what nodes to schedule for each job, and the `--scale` flag will specify the maximum number of nodes for each job (generally set this to the max number of nodes of the smallest partition). 

## Job-level power output example for replay of single job

    python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR --jid 1234567 -o

api_client/README.md

0 → 100644
+6 −0
Original line number Diff line number Diff line
API documentation available at: https://exadigit.github.io/SimulationServer/

export BASE_URL="https://myurl.com"
python get_api_token.py
python api_client.py list
python api_client.py details --id 5rkkb222xnge7c4ba4oxshqeha
+131 −0
Original line number Diff line number Diff line
import os
import argparse
import requests
import pandas as pd
from dotenv import load_dotenv

# Load environment variables (load_dotenv may populate BASE_URL from a .env file)
load_dotenv()
URL = os.getenv("BASE_URL")
if not URL:
    # Fail with an actionable message instead of the opaque TypeError that
    # joining None to a path would raise.
    raise RuntimeError("BASE_URL is not set; export it or define it in a .env file.")
# URLs always use "/", so build the API root explicitly: os.path.join would
# insert "\" on Windows. A trailing "/" on BASE_URL is tolerated.
RAPS_URL = f"{URL.rstrip('/')}/exadigit/api"

def read_token():
    """Return the saved API token with surrounding whitespace removed.

    The token is read from the ``.api-token`` file in the current working
    directory.
    """
    with open('.api-token', 'r') as handle:
        raw = handle.read()
    return raw.strip()

def call_api(endpoint, method="GET", params=None, data=None, timeout=60):
    """Send an authenticated request to the RAPS API and return the decoded JSON.

    Args:
        endpoint: Path appended to ``RAPS_URL``, e.g. ``'/simulation/list'``.
        method: HTTP method name passed to ``requests.request``.
        params: Optional query-string parameters.
        data: Optional payload, sent as the JSON request body.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON body when the server answers HTTP 200; ``None`` (after
        printing an error message) for any other status, a failed request, or
        a body that is not valid JSON.
    """
    token = read_token()
    url = f"{RAPS_URL}{endpoint}"
    headers = {"Authorization": f"Bearer {token}"}

    try:
        # requests applies no timeout by default; without one a stalled
        # server would block this call forever.
        response = requests.request(
            method, url, headers=headers, params=params, json=data, timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"Error: request to {url} failed - {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    try:
        return response.json()
    except ValueError:
        # A 200 whose body is not JSON is reported like any other failure
        # rather than surfacing as a traceback.
        print(f"Error: {response.status_code} - response is not valid JSON: {response.text}")
        return None

def handle_run(args):
    """Submit a simulation built from the ``run`` sub-command options and print the reply."""
    payload = {
        "system": args.system,
        "policy": args.policy,
        "parameters": args.parameters,
    }
    print(call_api('/simulation/run', method="POST", data=payload))

def handle_list(args):
    """Print the ``results`` list of the simulation listing as a DataFrame.

    Nothing is printed when the API call yields a falsy response.
    """
    payload = call_api('/simulation/list')
    if not payload:
        return
    # Tip: pandas display options (display.max_columns, display.max_colwidth,
    # display.width) can be set to None here to avoid truncated output.
    table = pd.DataFrame(payload.get('results', []))
    print(table)

def handle_simulation_details(args):
    """Fetch the simulation identified by ``args.id`` and print the API response."""
    endpoint = f'/simulation/{args.id}'
    print(call_api(endpoint))

def handle_cooling_cdu(args):
    """Fetch the cooling CDU data of simulation ``args.id`` and print the API response."""
    endpoint = f'/simulation/{args.id}/cooling/cdu'
    print(call_api(endpoint))

def handle_cooling_cep(args):
    """Fetch the cooling CEP data of simulation ``args.id`` and print the API response."""
    endpoint = f'/simulation/{args.id}/cooling/cep'
    print(call_api(endpoint))

def handle_scheduler_jobs(args):
    """Fetch the scheduler jobs of simulation ``args.id`` and print the API response."""
    endpoint = f'/simulation/{args.id}/scheduler/jobs'
    print(call_api(endpoint))

def handle_power_history(args):
    """Fetch the power history of job ``args.job_id`` in simulation ``args.id`` and print it."""
    endpoint = f'/simulation/{args.id}/scheduler/jobs/{args.job_id}/power-history'
    print(call_api(endpoint))

def handle_scheduler_system(args):
    """Fetch the scheduler system data of simulation ``args.id`` and print the API response."""
    endpoint = f'/simulation/{args.id}/scheduler/system'
    print(call_api(endpoint))

def handle_system_info(args):
    """Fetch the information for system ``args.system`` and print the API response."""
    endpoint = f'/system-info/{args.system}'
    print(call_api(endpoint))

def _parse_parameters(text):
    """Parse the ``--parameters`` command-line value as a JSON object.

    Replaces ``type=dict``, which cannot work with argparse: ``dict("...")``
    raises for every non-empty string, so the option could never be supplied.

    Args:
        text: Raw option value from the command line.

    Returns:
        The decoded dict; an empty or blank string yields ``{}`` (the only
        input the previous ``type=dict`` accepted).

    Raises:
        argparse.ArgumentTypeError: If ``text`` is not valid JSON or does not
            decode to a JSON object.
    """
    import json  # local import: this module does not import json at file level

    if not text.strip():
        return {}
    try:
        value = json.loads(text)
    except ValueError as exc:
        raise argparse.ArgumentTypeError(f"invalid JSON: {exc}") from exc
    if not isinstance(value, dict):
        raise argparse.ArgumentTypeError("parameters must be a JSON object, e.g. '{\"key\": 1}'")
    return value


def main():
    """Parse the command line and dispatch to the handler of the chosen sub-command.

    Each sub-command registers its handler through ``set_defaults(func=...)``.
    When no sub-command is given, the top-level help is printed instead.
    """
    parser = argparse.ArgumentParser(description="Interact with the SimulationServer REST API.")
    subparsers = parser.add_subparsers(title="commands", dest="command")

    # Run simulation
    run_parser = subparsers.add_parser("run", help="Run a simulation.")
    run_parser.add_argument("--system", required=True, help="System to run the simulation on.")
    run_parser.add_argument("--policy", required=True, help="Policy to use.")
    run_parser.add_argument(
        "--parameters",
        type=_parse_parameters,
        default={},
        help="Simulation parameters as a JSON object, e.g. '{\"key\": 1}'.",
    )
    run_parser.set_defaults(func=handle_run)

    # List simulations
    list_parser = subparsers.add_parser("list", help="List all simulations.")
    list_parser.set_defaults(func=handle_list)

    # Get simulation details
    details_parser = subparsers.add_parser("details", help="Get details of a simulation.")
    details_parser.add_argument("--id", required=True, help="Simulation ID.")
    details_parser.set_defaults(func=handle_simulation_details)

    # Cooling CDU
    cdu_parser = subparsers.add_parser("cooling-cdu", help="Get cooling CDU data for a simulation.")
    cdu_parser.add_argument("--id", required=True, help="Simulation ID.")
    cdu_parser.set_defaults(func=handle_cooling_cdu)

    # Cooling CEP
    cep_parser = subparsers.add_parser("cooling-cep", help="Get cooling CEP data for a simulation.")
    cep_parser.add_argument("--id", required=True, help="Simulation ID.")
    cep_parser.set_defaults(func=handle_cooling_cep)

    # Scheduler jobs
    jobs_parser = subparsers.add_parser("scheduler-jobs", help="Get scheduler jobs for a simulation.")
    jobs_parser.add_argument("--id", required=True, help="Simulation ID.")
    jobs_parser.set_defaults(func=handle_scheduler_jobs)

    # Power history
    power_parser = subparsers.add_parser("power-history", help="Get power history for a specific job in a simulation.")
    power_parser.add_argument("--id", required=True, help="Simulation ID.")
    power_parser.add_argument("--job-id", required=True, help="Job ID.")
    power_parser.set_defaults(func=handle_power_history)

    # Scheduler system
    scheduler_parser = subparsers.add_parser("scheduler-system", help="Get scheduler system data for a simulation.")
    scheduler_parser.add_argument("--id", required=True, help="Simulation ID.")
    scheduler_parser.set_defaults(func=handle_scheduler_system)

    # System info
    system_info_parser = subparsers.add_parser("system-info", help="Get system information.")
    system_info_parser.add_argument("--system", required=True, help="System name.")
    system_info_parser.set_defaults(func=handle_system_info)

    # Parse and execute
    args = parser.parse_args()
    if args.command:
        args.func(args)
    else:
        parser.print_help()

# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
+28 −0
Original line number Diff line number Diff line
import requests, getpass, argparse
from pathlib import Path
import os

BASE_URL = os.getenv("BASE_URL")


if __name__ == "__main__":
    # Request an API access token with a username/passcode and save it to a file.
    parser = argparse.ArgumentParser()
    parser.add_argument("--dest", type = Path, default = "./.api-token")
    parser.add_argument("--user")
    args = parser.parse_args()

    if not BASE_URL:
        # Otherwise the request below would target the literal URL "None/token".
        raise SystemExit("BASE_URL is not set; export it before running this script.")

    user = args.user
    if not user:
        user = input('USERNAME: ')
    password = getpass.getpass("PASSCODE: ")

    try:
        # requests applies no timeout by default; set one so a stalled server
        # cannot hang the script.
        response = requests.post(
            f"{BASE_URL.rstrip('/')}/token",
            data = {"username": user, "password": password},
            timeout = 30,
        )
    except requests.RequestException as exc:
        raise SystemExit(f"Request failed: {exc}")

    if not response.ok:
        # Error bodies are not guaranteed to be a JSON object; fall back to raw text.
        try:
            message = response.json().get('message', response.text)
        except (ValueError, AttributeError):
            message = response.text
        print(message)
        # Signal the failure to calling shells/scripts.
        raise SystemExit(1)

    token = response.json()['access_token']
    args.dest.parent.mkdir(exist_ok = True, parents = True)
    # The token is a credential: make the file owner-only before writing to it.
    args.dest.touch(mode = 0o600)
    args.dest.chmod(0o600)
    args.dest.write_text(token)
    print(f"Success! Token saved to {args.dest}")
+2 −0
Original line number Diff line number Diff line
pandas==1.5.3
requests==2.28.1
Loading