Commit b951ad92 authored by Hines, Jesse's avatar Hines, Jesse
Browse files

Add fugaku downloads

parent 6dc3ffa2
Loading
Loading
Loading
Loading
+39 −10
Original line number Diff line number Diff line
"""
Download parquet files for the Fugaku F-Data dataset published at
https://zenodo.org/records/11467483.

Note that F-Data doesn't give a list of nodes used, so we set 'scheduled_nodes' to None
which triggers the scheduler to schedule the nodes itself.
@@ -16,6 +16,11 @@
"""
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo
import urllib.request
import requests
from ..job import job_dict, Job
from ..utils import WorkloadData

@@ -180,3 +185,27 @@ def cdu_index_to_name(index: int, config: dict):
def cdu_pos(index: int, config: dict) -> tuple[int, int]:
    """ Return (row, col) tuple for a cdu index """
    # Placeholder layout: every CDU lives in row 0 and its column is just
    # its index. TODO: derive the real (row, col) position from config.
    row = 0
    col = index
    return (row, col)


def download(dest: Path, start: datetime | None, end: datetime | None):
    """ Download the F-Data parquet files from Zenodo into dest.

    Files on Zenodo are named one-per-month (strftime "%y_%m.parquet", Japan
    time), so the optional start/end bounds are applied by comparing file
    names lexicographically.

    Args:
        dest: Directory to download into; created if it doesn't exist.
        start: If given, skip files for months before this timestamp.
        end: If given, skip files for months after this timestamp.
    """
    tz = ZoneInfo("Asia/Tokyo")

    files = requests.get("https://zenodo.org/api/records/11467483").json()["files"]
    files = [f for f in files if f['key'].endswith(".parquet")]
    files = sorted(files, key = lambda f: f['key'])

    # TODO: I think fugaku data is indexed by submission time not start time, so filtering by
    # filename will probably miss some jobs that ran over start -> end
    if start:
        start_file = start.astimezone(tz).strftime("%y_%m.parquet")
        files = [f for f in files if f['key'] >= start_file]
    if end:
        end_file = end.astimezone(tz).strftime("%y_%m.parquet")
        files = [f for f in files if f['key'] <= end_file]

    # exist_ok=True so re-running (e.g. to resume after a partial download)
    # doesn't crash with FileExistsError
    dest.mkdir(parents = True, exist_ok = True)
    for file in files:
        print(f"Downloading {file['key']}")
        urllib.request.urlretrieve(file['links']['self'], dest / file['key'])

    print("Done!")