Commit 9b6a484f authored by IamTao's avatar IamTao

minor.

parent 3d693ed4
This diff is collapsed.
# Docker Compose setup for the "lin" training environment: a single
# GPU worker container that idles until work is exec'd into it.
version: '2.3'
services:
  worker:
    container_name: lin-worker
    image: itamtao/pytorch-mpi:cuda9
    # Large shared memory: PyTorch DataLoader workers communicate via /dev/shm.
    shm_size: 8G
    volumes:
      - /mlo-container-scratch/tlin/:/mlodata1/tlin
    # Keep the container alive indefinitely; jobs are started via `docker exec`.
    command: ["sleep", "infinity"]
    # NVIDIA runtime exposes the host GPUs inside the container.
    runtime: nvidia
    environment:
      JOBMONITOR_TIMESERIES_HOST: lin-timeseries
      JOBMONITOR_METADATA_HOST: lin-metadata
    networks:
      - net
  # Jupyter notebook service, currently disabled.
  # notebook:
  #   container_name: lin-notebook
  #   image: itamtao/notebook
  #   user: root
  #   working_dir: /mlodata1/tlin
  #   volumes:
  #     - /mlo-container-scratch/tlin/:/mlodata1/tlin
  #   command: ["start-notebook.sh", "--NotebookApp.token=''"]
  #   runtime: nvidia
  #   environment:
  #     NB_UID: 144057
  #     NB_GID: 11169
  #     JOBMONITOR_TIMESERIES_HOST: lin-timeseries
  #     JOBMONITOR_METADATA_HOST: lin-metadata
  #     CHOWN_HOME: 'yes'
  #     CHOWN_HOME_OPTS: -R
  #   networks:
  #     - net
  #   ports:
  #     - 18888:8888
networks:
  net:
localhost slots=32
\ No newline at end of file
# -*- coding: utf-8 -*-
import os
import re
import argparse
import subprocess
def str2bool(v):
    """Parse a string as a boolean flag for argparse.

    Accepts common truthy/falsy spellings (case-insensitive); raises
    ``argparse.ArgumentTypeError`` for anything else.
    """
    normalized = v.lower()
    if normalized in ("yes", "true", "t", "y", "1"):
        return True
    if normalized in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
def get_args():
# feed them to the parser.
parser = argparse.ArgumentParser(description="Extract results for plots")
# add arguments.
parser.add_argument("--use_cuda", type=str2bool, default=True)
parser.add_argument("--num_cpus", type=str, default="4")
# parse args.
args = parser.parse_args()
return args
def write_txt(data, out_path, type="w"):
    """Write ``data`` to the text file at ``out_path``.

    ``type`` is the file mode (default "w", i.e. overwrite); note the
    parameter name shadows the builtin but is kept for caller compatibility.
    """
    with open(out_path, type) as handle:
        handle.write(data)
def run_cmd(args):
    """Run a command and return its stdout as a list of lines (stripped)."""
    raw = subprocess.check_output(args)
    return raw.decode("utf-8").strip().split("\n")
def get_existing_pod_names():
    """Return names of running lin-master/lin-worker pods.

    Parses the table printed by ``kubectl get pods``; the first row is
    the column header and is skipped. Column 2 is the pod STATUS.
    """
    rows = [re.split(r"\s+", line) for line in run_cmd(["kubectl", "get", "pods"])[1:]]
    return [
        row[0]
        for row in rows
        if ("lin-master" in row[0] or "lin-worker" in row[0]) and "Running" in row[2]
    ]
def get_existing_pod_info(existing_pod_name):
    """Return a dict with name, namespace, IP and GPU count for one pod.

    Parses the plain-text output of ``kubectl describe pod <name>``
    line by line with regexes; raises IndexError if a field is missing.
    """

    def get(pattern, lines):
        # First capture group of the first line matching `pattern`.
        # (Matching is per-line, so re.DOTALL was unnecessary and dropped.)
        got_items = [re.findall(pattern, line) for line in lines]
        return [item for item in got_items if len(item) != 0][0][0]

    info = {}
    raw = run_cmd(["kubectl", "describe", "pod", existing_pod_name])
    info["name"] = get(r"^Name:\s+([\w-]+)", raw)
    print(" processing {}".format(info["name"]))
    info["namespace"] = get(r"^Namespace:\s+([\w-]+)", raw)
    info["ip"] = get(r"^IP:\s+([\d.]+)", raw)
    # BUGFIX: match multi-digit GPU counts (was r"(\d)", which would
    # truncate e.g. "nvidia.com/gpu: 16" to "1").
    info["num_gpu"] = get(r"nvidia.com/gpu:\s+(\d+)", raw)
    return info
def get_existing_pods_info(existing_pod_names):
    """Map each pod name to its parsed info dict (see get_existing_pod_info)."""
    return {name: get_existing_pod_info(name) for name in existing_pod_names}
def save_hostfile(args, all_info, out_path="hostfile"):
    """Write an MPI hostfile: one ``<ip> slots=<n>`` line per pod.

    Slots come from the pod's GPU count when ``args.use_cuda`` is set,
    otherwise from ``args.num_cpus``. The output path was hard-coded to
    "hostfile"; it is now a parameter with the same default, so existing
    callers are unaffected. Returns the hostfile contents as a string.
    """
    lines = [
        "{} slots={}".format(
            info["ip"], info["num_gpu"] if args.use_cuda else args.num_cpus
        )
        for info in all_info.values()
    ]
    ips = "\n".join(lines)
    write_txt(ips, out_path)
    return ips
def main(args):
    """Discover running pods, gather their info, and write the MPI hostfile."""
    print(" get pod names.")
    pod_names = get_existing_pod_names()
    if not pod_names:
        print(" does not exist pods.")
        return
    print(" get pod info.")
    pods_info = get_existing_pods_info(pod_names)
    print(" get IPs and save them to path.")
    save_hostfile(args, pods_info)
# Script entry point: parse the CLI flags, then build the hostfile.
if __name__ == "__main__":
    args = get_args()
    main(args)
"""
Convenience functions for dealing with Kubernetes at MLO
- Listing user's pods or jobs
- Inspecting
- Deleting
- Cleanup of finished items
"""
import re
import subprocess
import os
from pprint import pprint
from typing import Any, Dict, Generator, Union
from kubernetes import client, config
from kubernetes.client import V1Job, V1Pod, V1Status
# Authenticate against the cluster using the local kubeconfig
# (~/.kube/config); assumes this module is used from a developer machine,
# not from inside the cluster.
config.load_kube_config()

# Pods/jobs are filtered by the `user=$USER` label by default.
USER = os.getenv("USER")
NAMESPACE = "mlo"
def pods(running=None, all_users=False) -> Generator[V1Pod, None, None]:
    """Yield pods in the namespace, optionally filtered by run state.

    :param running: if boolean (not None), filter by running yes/no
    :param all_users: when True, do not restrict to the current user's pods
    """
    api = client.CoreV1Api()
    selector = f"user={USER}" if (USER is not None and not all_users) else ""
    for pod in api.list_namespaced_pod(NAMESPACE, label_selector=selector).items:
        if running is None:
            # Don't care-mode: yield everything.
            yield pod
            continue
        state = status(pod)
        if running and state == "running":
            yield pod
        elif not running and state in ("succeeded", "failed"):
            yield pod
def jobs(running=None, all_users=False) -> Generator[V1Job, None, None]:
    """Yield jobs in the namespace, optionally filtered by run state.

    :param running: if boolean (not None), filter by running yes/no
    :param all_users: when True, do not restrict to the current user's jobs
        (added for consistency with pods(); defaults to prior behavior)
    """
    v1 = client.BatchV1Api()
    if USER is not None and not all_users:
        label_selector = f"user={USER}"
    else:
        label_selector = ""
    for job in v1.list_namespaced_job(NAMESPACE, label_selector=label_selector).items:
        if (
            running is None  # Don't care-mode
            or (running and status(job) == "running")
            # BUGFIX: status() reports finished jobs as "completed"; the
            # original checked "finished", which never matched, so completed
            # jobs were never yielded here (and never removed by cleanup()).
            or (not running and status(job) in ["completed", "failed"])
        ):
            yield job
def status(entry: Union[V1Pod, V1Job]) -> str:
    """Return a lowercase status string for a Pod or a Job.

    Jobs map to "running" / "completed" / "failed" based on their
    completion counters; Pods report their phase (e.g. "running").
    """
    if isinstance(entry, V1Pod):
        return entry.status.phase.lower()
    if isinstance(entry, V1Job):
        # Counters may be None while the job is young; treat as zero.
        wanted = entry.spec.completions if entry.spec.completions else 0
        done = entry.status.succeeded if entry.status.succeeded else 0
        lost = entry.status.failed if entry.status.failed else 0
        if done + lost < wanted:
            return "running"
        return "completed" if done == wanted else "failed"
    raise ValueError("Unknown object type")
def gpus(pod: V1Pod) -> int:
    """Total number of NVIDIA GPUs in the pod's container resource limits."""
    total = 0
    for container in pod.spec.containers:
        limits = container.resources.limits
        if limits is not None:
            # Absent key means no GPU requested by this container.
            total += int(limits.get("nvidia.com/gpu", 0))
    return total
def describe(entry: Union[V1Pod, V1Job]) -> Dict[str, Any]:
    """Print (via pprint) and return a summary dict for a Pod or Job."""
    if not isinstance(entry, (V1Pod, V1Job)):
        raise ValueError("Unknown object")
    is_pod = isinstance(entry, V1Pod)
    info = {
        "kind": "Pod" if is_pod else "Job",
        "metadata.name": entry.metadata.name,
        "metadata.labels": entry.metadata.labels,
        "metadata.creation_timestamp": entry.metadata.creation_timestamp,
        "[status]": status(entry),
    }
    if is_pod:
        # GPU counts only make sense for pods.
        info["[gpus]"] = gpus(entry)
    info["status.start_time"] = entry.status.start_time
    pprint(info)
    return info
def delete(entry: Union[V1Pod, V1Job]) -> V1Status:
    """Delete/kill a Pod or Job with a foreground cascading delete."""
    options = client.V1DeleteOptions(propagation_policy="Foreground")
    if isinstance(entry, V1Pod):
        return client.CoreV1Api().delete_namespaced_pod(
            entry.metadata.name,
            namespace=NAMESPACE,
            body=options,
        )
    if isinstance(entry, V1Job):
        return client.BatchV1Api().delete_namespaced_job(
            entry.metadata.name,
            namespace=NAMESPACE,
            body=options,
        )
    raise ValueError("Unknown object")
def cleanup():
    """Delete all non-running jobs and pods belonging to the current user."""
    # Jobs first, then pods — same order as before.
    for finished_job in jobs(running=False):
        delete(finished_job)
    for finished_pod in pods(running=False):
        delete(finished_pod)
def get_status(all_users=False):
    """Print GPU utilisation per running GPU pod by exec'ing nvidia-smi."""
    for pod in pods(running=True, all_users=all_users):
        if gpus(pod) <= 0:
            continue  # CPU-only pod: nothing to report
        raw = subprocess.check_output(
            ["kubectl", "exec", pod.metadata.name, "nvidia-smi"]
        ).decode("utf-8")
        # Pull every "NN%" figure out of the nvidia-smi table.
        usage = re.findall(r"""\d+\%""", raw)
        print(f"{pod.metadata.name:30s}", usage)
# StatefulSet of 4 GPU worker pods (lin-worker-0..3) with stable network
# identities provided by the `lin-worker` service.
# FIX: apps/v1beta2 was removed in Kubernetes 1.16; apps/v1 is the stable
# StatefulSet API and requires the selector, which is already present.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: lin-worker
  labels:
    name: lin-worker
    user: lin
spec:
  selector:
    matchLabels:
      name: lin-worker
  serviceName: lin-worker
  replicas: 4
  template:
    metadata:
      labels:
        name: lin-worker
        user: lin
    spec:
      containers:
        - name: lin-worker
          image: ic-registry.epfl.ch/mlo/lin-pytorch
          imagePullPolicy: Always
          command:
            - "/bin/bash"
            - "-c"
            - "--"
          # Run both entrypoints, then idle so jobs can be exec'd in.
          # (FIX: removed the stray space before the colon in `args :`.)
          args:
            - '/entrypoint.sh; /usr/local/bin/entrypoint.sh; sleep infinity'
          ports:
            - containerPort: 22
              name: ssh
          resources:
            limits:
              nvidia.com/gpu: 2
              # cpu: 32
              # memory: 85Gi
          env:
            - name: ROLE
              value: worker
            - name: JOBMONITOR_TIMESERIES_HOST
              value: lin-timeseries
            - name: JOBMONITOR_METADATA_HOST
              value: lin-metadata
          volumeMounts:
            - mountPath: /mlodata1
              name: mlodata
            - mountPath: /dev/shm
              name: dshm
      volumes:
        - name: mlodata
          persistentVolumeClaim:
            claimName: pv-mloraw1
        # Memory-backed emptyDir gives PyTorch a large /dev/shm.
        - name: dshm
          emptyDir:
            medium: Memory
# One-off training Job: runs exps/finetune-lstm/final_3.sh on 2 GPUs.
apiVersion: batch/v1
kind: Job
metadata:
  name: lin-job-0704-1
  labels:
    app: lin-worker
    user: lin
spec:
  template:
    metadata:
      labels:
        app: lin-worker
        user: lin
    spec:
      # Never restart in place: a failed run should surface on the Job.
      restartPolicy: Never
      containers:
        - name: lin-worker
          image: ic-registry.epfl.ch/mlo/lin-pytorch
          workingDir: /mlodata1/tlin/decentralized_code_debug
          imagePullPolicy: Always
          env:
            - name: ROLE
              value: worker
            - name: JOBMONITOR_TIMESERIES_HOST
              value: lin-timeseries
            - name: JOBMONITOR_METADATA_HOST
              value: lin-metadata
          volumeMounts:
            - mountPath: /mlodata1
              name: mlodata
            - mountPath: /dev/shm
              name: dshm
          resources:
            limits:
              nvidia.com/gpu: 2
          command: [
            "/entrypoint.sh", "bash",
            "exps/finetune-lstm/final_3.sh"
          ]
      volumes:
        - name: mlodata
          persistentVolumeClaim:
            claimName: pv-mloraw1
        - name: dshm
          emptyDir:
            medium: Memory
  # Retry a failed pod up to 4 times before marking the Job failed.
  backoffLimit: 4
# Interactive master pod (notebook/tensorboard/ssh) plus a LoadBalancer
# Service exposing those three ports.
apiVersion: v1
kind: Pod
metadata:
  name: lin-master
  labels:
    name: lin-master
    role: master
    user: lin
spec:
  containers:
    - name: lin-master
      image: ic-registry.epfl.ch/mlo/lin-pytorch
      imagePullPolicy: Always
      command:
        - "/bin/bash"
        - "-c"
        - "--"
      # Run both entrypoints, then idle so work can be exec'd in.
      # (FIX: removed the stray space before the colon in `args :`.)
      args:
        - '/entrypoint.sh; /usr/local/bin/entrypoint.sh; sleep infinity'
      ports:
        - containerPort: 8888
          name: notebook
        - containerPort: 6006
          name: tensorboard
        - containerPort: 22
          name: ssh
      env:
        - name: ROLE
          value: master
        - name: JOBMONITOR_TIMESERIES_HOST
          value: lin-timeseries
        - name: JOBMONITOR_METADATA_HOST
          value: lin-metadata
      resources:
        # requests == limits: guaranteed QoS for the master pod.
        requests:
          nvidia.com/gpu: 2
          cpu: 32
          memory: 85Gi
        limits:
          nvidia.com/gpu: 2
          cpu: 32
          memory: 85Gi
      volumeMounts:
        - mountPath: /mlodata1
          name: mlodata
        - mountPath: /dev/shm
          name: dshm
  volumes:
    - name: mlodata
      persistentVolumeClaim:
        claimName: mlo-scratch
    - name: dshm
      emptyDir:
        medium: Memory
---
apiVersion: v1
kind: Service
metadata:
  name: lin-master
  labels:
    name: lin-master
spec:
  type: LoadBalancer
  ports:
    - port: 8888
      name: notebook
    - port: 6006
      name: tensorboard
    - port: 22
      name: ssh
  selector:
    name: lin-master
# MongoDB pod backing the job monitor's metadata store, plus a NodePort
# Service so other pods can reach it at lin-metadata:27017.
apiVersion: v1
kind: Pod
metadata:
  name: lin-metadata
  labels:
    name: lin-metadata
    user: lin
spec:
  containers:
    - name: mongo
      image: mongo:4.0
      ports:
        - containerPort: 27017
          name: api
      volumeMounts:
        # Persist the database under the user's scratch volume.
        - mountPath: /data/db
          name: mlodata
          subPath: tlin/databases/metadata-mongodb
  volumes:
    - name: mlodata
      persistentVolumeClaim:
        claimName: mlo-scratch
---
apiVersion: v1
kind: Service
metadata:
  name: lin-metadata
  labels:
    user: lin
spec:
  type: NodePort
  ports:
    - name: http
      port: 27017
      protocol: TCP
  selector:
    name: lin-metadata
# Jupyter notebook pod (no GPUs) plus a NodePort Service pinned to 32688.
apiVersion: v1
kind: Pod
metadata:
  name: lin-notebook
  labels:
    name: lin-notebook
    user: lin
spec:
  containers:
    - name: lin-notebook
      # Run as root so the start script can chown the home directory,
      # then drop to NB_UID/NB_GID (jupyter-docker-stacks convention).
      securityContext:
        runAsUser: 0
      image: ic-registry.epfl.ch/mlo/lin-notebook
      workingDir: /mlodata1/tlin
      ports:
        - containerPort: 8888
          name: notebook
      resources:
        limits:
          nvidia.com/gpu: 0
      env:
        - name: NB_UID
          value: "144057"
        - name: NB_GID
          value: "11169"
        - name: CHOWN_HOME
          value: 'yes'
        - name: CHOWN_HOME_OPTS
          value: "-R"
      # Empty token: no auth on the notebook (only reachable in-cluster
      # or via the NodePort below).
      command: ["start-notebook.sh"]
      args: ["--NotebookApp.token=''"]
      volumeMounts:
        - mountPath: /mlodata1
          name: mlodata
        - mountPath: /dev/shm
          name: dshm
  volumes:
    - name: mlodata
      persistentVolumeClaim:
        claimName: pv-mloraw1
    - name: dshm
      emptyDir:
        medium: Memory
---
apiVersion: v1
kind: Service
metadata:
  name: lin-notebook
  labels:
    user: lin
spec:
  type: NodePort
  ports:
    - name: notebook
      nodePort: 32688
      port: 8888
      targetPort: 8888
      protocol: TCP
  selector:
    name: lin-notebook
apiVersion: v1
kind: Pod
metadata:
name: lin-timeseries
labels:
name: lin-timeseries
user: lin
spec:
containers:
- name: influxdb
image: influxdb:1.7
ports:
- containerPort: 8086
name: api
volumeMounts: