pyproject.toml  +2 −2

```diff
@@ -25,8 +25,8 @@ dependencies = [
     "uvicorn==0.35.0",
     "sqlparse==0.5.3",
     "kubernetes==33.1.0",
-    "elasticsearch==7.13.4",
-    "elasticsearch-dbapi==0.2.11",
+    "tenacity==9.1.2",
+    "elasticsearch==7.17.12",
     "requests==2.32.5",
     "orjson==3.11.3",
    "confluent_kafka==2.11.1",
```
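Context for the swap: the elasticsearch-dbapi/SQLAlchemy route is dropped, so the pinned 7.17.x client is used directly and tenacity takes over retry duty. A minimal connectivity check against that client, reusing the same env vars and constructor arguments es.py now passes (the snippet itself is not part of this PR):

```python
# Hedged sketch: smoke-test the pinned elasticsearch 7.17.x client with the
# same arguments get_nccs_cadence_es() uses; requires the NCCS_CADENCE_* env vars.
import os
from elasticsearch import Elasticsearch

es = Elasticsearch(
    os.environ["NCCS_CADENCE_URL"],
    http_auth=(os.environ["NCCS_CADENCE_USER"], os.environ["NCCS_CADENCE_PASSWORD"]),
    use_ssl=False,          # matches the TODO in es.py about self-signed certs
    ssl_show_warn=False,
    verify_certs=False,
)
print(es.ping())  # True if the cluster answers with these credentials
```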
simulation_server/simulation/dataloaders/frontier.py  (new file, +67 −0)

```python
from ...util.druid import get_druid_engine, get_table, to_timestamp
from ...util.dataloader import query_time_range
from ...util.es import get_nccs_cadence_es, es_sql_query
from ...models.sim import ServerSimConfig
from .. import SimException
import sqlalchemy as sqla
import pandas as pd
# Re-use these from the raps dataloader
from raps.dataloaders.frontier import load_data_from_df, node_index_to_name, cdu_index_to_name, cdu_pos


def load_data(_paths, **kwargs):
    # TODO: Should consider using LVA API instead of directly querying the DB for this
    druid_engine = get_druid_engine()
    es = get_nccs_cadence_es()
    sim_config: ServerSimConfig = kwargs['sim_config']
    start, end = sim_config.start, sim_config.end

    job_query = """
        SELECT
            "allocation_id", "job_id", "slurm_version", "account", "group", "user", "name",
            "time_limit", "time_submission", "time_eligible", "time_start", "time_end",
            "time_elapsed", "node_count", xnames_str AS "xnames",
            "state_current", "state_reason", "time_snapshot"
        FROM "stf218.frontier.job-summary"
        WHERE
            (time_end IS NULL OR time_end > CONVERT(?, TIMESTAMP)) AND
            (time_start IS NOT NULL AND time_start <= CONVERT(?, TIMESTAMP))
    """
    job_query_params = [start.isoformat(), end.isoformat()]
    job_data = es_sql_query(es, job_query, job_query_params, fetch_size=500)

    job_df = pd.DataFrame(job_data)
    job_df['time_snapshot'] = pd.to_datetime(job_df['time_snapshot'])
    job_df["time_submission"] = pd.to_datetime(job_df["time_submission"])
    job_df["time_eligible"] = pd.to_datetime(job_df["time_eligible"])
    job_df["time_start"] = pd.to_datetime(job_df["time_start"])
    job_df["time_end"] = pd.to_datetime(job_df["time_end"])
    job_df['xnames'] = job_df['xnames'].map(lambda x: x.split(",") if x else [])

    job_profile_tbl = get_table("pub-ts-frontier-job-profile", druid_engine)
    job_profile_query = (
        sqla.select(
            job_profile_tbl.c['__time'].label("timestamp"),
            job_profile_tbl.c.allocation_id,
            job_profile_tbl.c.sum_cpu0_power,
            job_profile_tbl.c.sum_gpu_power,
        )
        .where(
            to_timestamp(start) <= job_profile_tbl.c['__time'],
            job_profile_tbl.c['__time'] < to_timestamp(end),
        )
    )
    job_profile_df = pd.read_sql(job_profile_query, druid_engine, parse_dates=[
        "timestamp",
    ])

    from loguru import logger
    logger.info(f"job_df {job_df}")
    logger.info(f"job_profile_df {job_profile_df}")

    if (job_df.empty or job_profile_df.empty):
        raise SimException(f"No telemetry data for {start.isoformat()} -> {end.isoformat()}")

    return load_data_from_df(job_df, job_profile_df, **kwargs)
```
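For anyone trying the new loader locally, a rough invocation sketch. How ServerSimConfig is actually constructed isn't shown in this diff, so the keyword constructor below is an assumption, and the Druid/Cadence ES environment variables must be set:

```python
# Hypothetical invocation of the new Frontier dataloader. Only .start/.end of
# ServerSimConfig appear in this diff, so the constructor call is assumed.
from datetime import datetime, timedelta, timezone

from simulation_server.models.sim import ServerSimConfig
from simulation_server.simulation.dataloaders.frontier import load_data

end = datetime.now(timezone.utc)
start = end - timedelta(hours=1)
sim_config = ServerSimConfig(start=start, end=end)  # assumed constructor

# _paths is unused by this loader; all other kwargs, including sim_config,
# are forwarded on to raps.dataloaders.frontier.load_data_from_df
data = load_data(None, sim_config=sim_config)
```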
simulation_server/util/es.py  +45 −37

```diff
 """ Connection to Cadence ES """
-import os, json
-import urllib.parse
-from datetime import datetime
-import sqlalchemy as sqla
-from sqlalchemy.engine import Engine, create_engine
+import os
+from elasticsearch import Elasticsearch
-from es.elastic.sqlalchemy import ESDialect
+import tenacity
 
-def get_nccs_cadence_engine(**kwargs) -> Engine:
-    import sqlalchemy.types as types
-    from sqlalchemy.ext.compiler import compiles
-    # For some reason sqla/pydruid renders `cast(col, sqla.TIMESTAMP)` to `CAST(col AS LONG)`. This
-    # is a manual override to make sqla render them properly.
-    cast_fixes = {
-        types.TIMESTAMP: "TIMESTAMP",
-    }
-    for (sqla_type, override) in cast_fixes.items():
-        compiles(sqla_type, "elasticsearch")(lambda type_, compiler, override=override, **kw: override)
-
-    # We need to set retry_on_status to work around intermittent 401 errors from Cadence ES.
-    # The query params will get passed to the Elasticsearch client, but only some specific
-    # ones get parsed and the rest are left as strings. This monkey patch hacks elasticsearch-dbapi
-    # to parse retry_on_status. We can remove this if the AM team fixes the auth errors
-    import es.basesqlalchemy
-    es.basesqlalchemy.BaseESDialect._map_parse_connection_parameters['retry_on_status'] = json.loads
-
-    URL = urllib.parse.urlparse(os.environ["NCCS_CADENCE_URL"])
-    HOST, PORT = URL.netloc.split(":")
+def get_nccs_cadence_es():
+    URL = os.environ["NCCS_CADENCE_URL"]
     USER = os.environ["NCCS_CADENCE_USER"]
     PASSWORD = os.environ["NCCS_CADENCE_PASSWORD"]
-    # These get passed through to the internal Elasticsearch instance
-    QUERY_PARAMS = 'use_ssl=false&ssl_show_warn=false&verify_certs=false&retry_on_status=[502,503,504,401]'
-    engine = create_engine(f'elasticsearch+{URL.scheme}://{USER}:{PASSWORD}@{HOST}:{PORT}{URL.path}?{QUERY_PARAMS}', **kwargs)
-    return engine
+    return Elasticsearch(
+        URL,
+        http_auth=(USER, PASSWORD),
+        # TODO: we need to fix the self-signed certs on ES
+        use_ssl=False,
+        ssl_show_warn=False,
+        verify_certs=False,
+    )
 
-def to_timestamp(val: datetime):
-    return sqla.func.convert(val.isoformat(), sqla.literal_column('TIMESTAMP'))
+def es_sql_query(client: Elasticsearch, query: str, params: list = [], fetch_size = 100):
+    """ Runs an SQL query against ES. Use `?` format for SQL params. """
+    # Cadence ES is a bit flaky with intermittent 401 errors
+    @tenacity.retry(
+        stop = tenacity.stop_after_attempt(5),
+        wait = tenacity.wait_exponential(multiplier=0.5, min=1, max=30),
+        reraise = True,
+    )
+    def _retry_query(query, params, cursor = None):
+        body = {
+            "query": query,
+            "params": params,
+            "fetch_size": fetch_size,
+        }
+        if cursor:
+            body["cursor"] = cursor
+        return client.sql.query(format = 'json', body = body)
+
+    response = _retry_query(query, params)
+    rows = response['rows']
+    cursor = response.get("cursor")
+    columns = [c['name'] for c in response['columns']]
+    while cursor:
+        response = _retry_query(query, params, cursor)
+        rows.extend(response['rows'])
+        cursor = response.get("cursor")
+
+    rows = [dict(zip(columns, row)) for row in rows]
+    return rows
```
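A usage sketch for the new helper pair, with an illustrative time bound (not part of this PR). es_sql_query handles both the tenacity retries and the ES SQL cursor pagination internally, so callers just get the fully drained result set:

```python
# Illustrative only: query the job summary index through the new helpers.
from simulation_server.util.es import get_nccs_cadence_es, es_sql_query

es = get_nccs_cadence_es()
rows = es_sql_query(
    es,
    'SELECT "job_id", "node_count" FROM "stf218.frontier.job-summary" '
    'WHERE time_start > CONVERT(?, TIMESTAMP)',
    params=["2025-01-01T00:00:00"],  # placeholder timestamp
    fetch_size=500,
)
# Results come back as dicts keyed by column name; any cursor pages were
# already drained by es_sql_query's while-loop.
for row in rows[:5]:
    print(row["job_id"], row["node_count"])
```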