Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
ARM
k8s
Kured
Commits
63c8c9d4
Commit
63c8c9d4
authored
Mar 15, 2020
by
Price, Zach
Browse files
Setup Gitlab CI pipeline
parent
834e7d19
Changes
3
Hide whitespace changes
Inline
Side-by-side
.gitlab-ci.yml
0 → 100644
View file @
63c8c9d4
# Pipeline-wide variables.
variables:
  # Registry host that build_image logs in to and pushes the image to.
  IMAGE_REPO: camden.ornl.gov
  # NOTE(review): presumably read by the GitLab Kubernetes runner to pick its
  # helper image; nothing in this file references it directly - confirm.
  KUBERNETES_HELPER_IMAGE: gitlab/gitlab-runner:latest
# Stage order: the binary is compiled first, then packaged into an image.
stages:
  - binary
  - image
# Compiles the kured binary into build/ and stages the Dockerfile next to it,
# so that the whole build/ directory is handed on as a job artifact.
build_bin:
  image: circleci/golang:1.12.5
  tags:
    - arm-k8s
  stage: binary
  # Restricted to the `ornl` ref.
  only:
    - ornl
  artifacts:
    paths:
      - build/*
  # .cache doubles as GOPATH (see the export in script), so it is the
  # directory that is cached.
  cache:
    paths:
      - .cache
  # Build settings passed to `go build` through the environment.
  variables:
    CGO_ENABLED: 0
    GOOS: linux
    GOARCH: amd64
  script:
    - mkdir -p build
    - export GOPATH=${CI_PROJECT_DIR}/.cache
    # Stamps main.version with "ornl-<short sha>" at link time.
    - go build -ldflags "-X main.version=ornl-$CI_COMMIT_SHORT_SHA" -o build/kured cmd/kured/*.go
    - cp cmd/kured/Dockerfile build/
# Builds the container image from build/Dockerfile and pushes it to
# $IMAGE_REPO using the $REPO_ROBOT / $REPO_TOKEN credentials.
build_image:
  image: buildah/buildah
  tags:
    - arm-k8s
  stage: image
  # Restricted to the `ornl` ref.
  only:
    - ornl
  script:
    # No build context argument is given; only the Dockerfile path is passed.
    - podman build --format docker -t arm/kured:latest -t arm/kured:ornl-$CI_COMMIT_SHORT_SHA -f ./build/Dockerfile
    - podman login -u "$REPO_ROBOT" -p "$REPO_TOKEN" "$IMAGE_REPO"
    # NOTE(review): two tags are built above (latest and ornl-<short sha>) but
    # this push names no tag - confirm whether the ornl-<short sha> tag is
    # meant to be pushed as well.
    - podman push localhost/arm/kured docker://$IMAGE_REPO/arm/kured
cmd/kured/Dockerfile
View file @
63c8c9d4
...
...
@@ -3,5 +3,5 @@ RUN apk update && apk add ca-certificates tzdata && rm -rf /var/cache/apk/*
# NB: you may need to update RBAC permissions when upgrading kubectl - see kured-rbac.yaml for details
ADD
https://storage.googleapis.com/kubernetes-release/release/v1.15.10/bin/linux/amd64/kubectl /usr/bin/kubectl
RUN
chmod
0755 /usr/bin/kubectl
COPY
./
kured /usr/bin/kured
COPY
kured /usr/bin/kured
ENTRYPOINT
["/usr/bin/kured"]
cmd/kured/main.go
deleted
100644 → 0
View file @
834e7d19
package
main
import
(
"fmt"
"math/rand"
"net/http"
"os"
"os/exec"
"regexp"
"time"
log
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
metav1
"k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/weaveworks/kured/pkg/alerts"
"github.com/weaveworks/kured/pkg/daemonsetlock"
"github.com/weaveworks/kured/pkg/delaytick"
"github.com/weaveworks/kured/pkg/notifications/slack"
"github.com/weaveworks/kured/pkg/timewindow"
)
var (
	// version is logged at start-up by root. It defaults to "unreleased".
	version = "unreleased"

	// Command line flags

	// period is the reboot check period (--period).
	period time.Duration
	// dsNamespace is the namespace containing the daemonset on which to place
	// the lock (--ds-namespace).
	dsNamespace string
	// dsName is the name of the daemonset on which to place the lock (--ds-name).
	dsName string
	// lockAnnotation is the annotation in which the locking node is recorded
	// (--lock-annotation).
	lockAnnotation string
	// prometheusURL is the Prometheus instance probed for active alerts
	// (--prometheus-url); an empty value skips the alert check in rebootBlocked.
	prometheusURL string
	// alertFilter matches alert names to ignore when checking for active alerts
	// (--alert-filter-regexp).
	alertFilter *regexp.Regexp
	// rebootSentinel is the path to the file whose existence signals the need
	// to reboot (--reboot-sentinel).
	rebootSentinel string
	// slackHookURL is the slack hook URL for reboot notifications
	// (--slack-hook-url); an empty value disables the notifications.
	slackHookURL string
	// slackUsername is the slack username for reboot notifications
	// (--slack-username).
	slackUsername string
	// slackChannel is the slack channel for reboot notifications
	// (--slack-channel).
	slackChannel string
	// podSelectors holds the label selectors identifying pods whose presence
	// should prevent reboots (--blocking-pod-selector).
	podSelectors []string

	// rebootDays lists the days on which reboots may be scheduled (--reboot-days).
	rebootDays []string
	// rebootStart is the time of day after which reboots may be scheduled
	// (--start-time).
	rebootStart string
	// rebootEnd is the time of day before which reboots may be scheduled
	// (--end-time).
	rebootEnd string
	// timezone is the timezone used for the schedule inputs (--time-zone).
	timezone string

	// Metrics

	// rebootRequiredGauge is the kured "reboot_required" gauge, labelled by
	// node; maintainRebootRequiredMetric sets it to 1 or 0.
	rebootRequiredGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Subsystem: "kured",
		Name:      "reboot_required",
		Help:      "OS requires reboot due to software updates.",
	}, []string{"node"})
)
// init registers rebootRequiredGauge with Prometheus when the package is
// initialised, before main runs.
func init() {
	prometheus.MustRegister(rebootRequiredGauge)
}
func
main
()
{
rootCmd
:=
&
cobra
.
Command
{
Use
:
"kured"
,
Short
:
"Kubernetes Reboot Daemon"
,
Run
:
root
}
rootCmd
.
PersistentFlags
()
.
DurationVar
(
&
period
,
"period"
,
time
.
Minute
*
60
,
"reboot check period"
)
rootCmd
.
PersistentFlags
()
.
StringVar
(
&
dsNamespace
,
"ds-namespace"
,
"kube-system"
,
"namespace containing daemonset on which to place lock"
)
rootCmd
.
PersistentFlags
()
.
StringVar
(
&
dsName
,
"ds-name"
,
"kured"
,
"name of daemonset on which to place lock"
)
rootCmd
.
PersistentFlags
()
.
StringVar
(
&
lockAnnotation
,
"lock-annotation"
,
"weave.works/kured-node-lock"
,
"annotation in which to record locking node"
)
rootCmd
.
PersistentFlags
()
.
StringVar
(
&
prometheusURL
,
"prometheus-url"
,
""
,
"Prometheus instance to probe for active alerts"
)
rootCmd
.
PersistentFlags
()
.
Var
(
&
regexpValue
{
&
alertFilter
},
"alert-filter-regexp"
,
"alert names to ignore when checking for active alerts"
)
rootCmd
.
PersistentFlags
()
.
StringVar
(
&
rebootSentinel
,
"reboot-sentinel"
,
"/var/run/reboot-required"
,
"path to file whose existence signals need to reboot"
)
rootCmd
.
PersistentFlags
()
.
StringVar
(
&
slackHookURL
,
"slack-hook-url"
,
""
,
"slack hook URL for reboot notfications"
)
rootCmd
.
PersistentFlags
()
.
StringVar
(
&
slackUsername
,
"slack-username"
,
"kured"
,
"slack username for reboot notfications"
)
rootCmd
.
PersistentFlags
()
.
StringVar
(
&
slackChannel
,
"slack-channel"
,
""
,
"slack channel for reboot notfications"
)
rootCmd
.
PersistentFlags
()
.
StringArrayVar
(
&
podSelectors
,
"blocking-pod-selector"
,
nil
,
"label selector identifying pods whose presence should prevent reboots"
)
rootCmd
.
PersistentFlags
()
.
StringSliceVar
(
&
rebootDays
,
"reboot-days"
,
timewindow
.
EveryDay
,
"schedule reboot on these days"
)
rootCmd
.
PersistentFlags
()
.
StringVar
(
&
rebootStart
,
"start-time"
,
"0:00"
,
"schedule reboot only after this time of day"
)
rootCmd
.
PersistentFlags
()
.
StringVar
(
&
rebootEnd
,
"end-time"
,
"23:59:59"
,
"schedule reboot only before this time of day"
)
rootCmd
.
PersistentFlags
()
.
StringVar
(
&
timezone
,
"time-zone"
,
"UTC"
,
"use this timezone for schedule inputs"
)
if
err
:=
rootCmd
.
Execute
();
err
!=
nil
{
log
.
Fatal
(
err
)
}
}
// newCommand creates a new Command with stdout/stderr wired to our standard logger
func
newCommand
(
name
string
,
arg
...
string
)
*
exec
.
Cmd
{
cmd
:=
exec
.
Command
(
name
,
arg
...
)
cmd
.
Stdout
=
log
.
NewEntry
(
log
.
StandardLogger
())
.
WithField
(
"cmd"
,
cmd
.
Args
[
0
])
.
WithField
(
"std"
,
"out"
)
.
WriterLevel
(
log
.
InfoLevel
)
cmd
.
Stderr
=
log
.
NewEntry
(
log
.
StandardLogger
())
.
WithField
(
"cmd"
,
cmd
.
Args
[
0
])
.
WithField
(
"std"
,
"err"
)
.
WriterLevel
(
log
.
WarnLevel
)
return
cmd
}
func
sentinelExists
()
bool
{
// Relies on hostPID:true and privileged:true to enter host mount space
sentinelCmd
:=
newCommand
(
"/usr/bin/nsenter"
,
"-m/proc/1/ns/mnt"
,
"--"
,
"/usr/bin/test"
,
"-f"
,
rebootSentinel
)
if
err
:=
sentinelCmd
.
Run
();
err
!=
nil
{
switch
err
:=
err
.
(
type
)
{
case
*
exec
.
ExitError
:
// We assume a non-zero exit code means 'reboot not required', but of course
// the user could have misconfigured the sentinel command or something else
// went wrong during its execution. In that case, not entering a reboot loop
// is the right thing to do, and we are logging stdout/stderr of the command
// so it should be obvious what is wrong.
return
false
default
:
// Something was grossly misconfigured, such as the command path being wrong.
log
.
Fatalf
(
"Error invoking sentinel command: %v"
,
err
)
}
}
return
true
}
func
rebootRequired
()
bool
{
if
sentinelExists
()
{
log
.
Infof
(
"Reboot required"
)
return
true
}
else
{
log
.
Infof
(
"Reboot not required"
)
return
false
}
}
func
rebootBlocked
(
client
*
kubernetes
.
Clientset
,
nodeID
string
)
bool
{
if
prometheusURL
!=
""
{
alertNames
,
err
:=
alerts
.
PrometheusActiveAlerts
(
prometheusURL
,
alertFilter
)
if
err
!=
nil
{
log
.
Warnf
(
"Reboot blocked: prometheus query error: %v"
,
err
)
return
true
}
count
:=
len
(
alertNames
)
if
count
>
10
{
alertNames
=
append
(
alertNames
[
:
10
],
"..."
)
}
if
count
>
0
{
log
.
Warnf
(
"Reboot blocked: %d active alerts: %v"
,
count
,
alertNames
)
return
true
}
}
fieldSelector
:=
fmt
.
Sprintf
(
"spec.nodeName=%s"
,
nodeID
)
for
_
,
labelSelector
:=
range
podSelectors
{
podList
,
err
:=
client
.
CoreV1
()
.
Pods
(
""
)
.
List
(
metav1
.
ListOptions
{
LabelSelector
:
labelSelector
,
FieldSelector
:
fieldSelector
,
Limit
:
10
})
if
err
!=
nil
{
log
.
Warnf
(
"Reboot blocked: pod query error: %v"
,
err
)
return
true
}
if
len
(
podList
.
Items
)
>
0
{
podNames
:=
make
([]
string
,
0
,
len
(
podList
.
Items
))
for
_
,
pod
:=
range
podList
.
Items
{
podNames
=
append
(
podNames
,
pod
.
Name
)
}
if
len
(
podList
.
Continue
)
>
0
{
podNames
=
append
(
podNames
,
"..."
)
}
log
.
Warnf
(
"Reboot blocked: matching pods: %v"
,
podNames
)
return
true
}
}
return
false
}
func
holding
(
lock
*
daemonsetlock
.
DaemonSetLock
,
metadata
interface
{})
bool
{
holding
,
err
:=
lock
.
Test
(
metadata
)
if
err
!=
nil
{
log
.
Fatalf
(
"Error testing lock: %v"
,
err
)
}
if
holding
{
log
.
Infof
(
"Holding lock"
)
}
return
holding
}
func
acquire
(
lock
*
daemonsetlock
.
DaemonSetLock
,
metadata
interface
{})
bool
{
holding
,
holder
,
err
:=
lock
.
Acquire
(
metadata
)
switch
{
case
err
!=
nil
:
log
.
Fatalf
(
"Error acquiring lock: %v"
,
err
)
return
false
case
!
holding
:
log
.
Warnf
(
"Lock already held: %v"
,
holder
)
return
false
default
:
log
.
Infof
(
"Acquired reboot lock"
)
return
true
}
}
func
release
(
lock
*
daemonsetlock
.
DaemonSetLock
)
{
log
.
Infof
(
"Releasing lock"
)
if
err
:=
lock
.
Release
();
err
!=
nil
{
log
.
Fatalf
(
"Error releasing lock: %v"
,
err
)
}
}
func
drain
(
nodeID
string
)
{
log
.
Infof
(
"Draining node %s"
,
nodeID
)
if
slackHookURL
!=
""
{
if
err
:=
slack
.
NotifyDrain
(
slackHookURL
,
slackUsername
,
slackChannel
,
nodeID
);
err
!=
nil
{
log
.
Warnf
(
"Error notifying slack: %v"
,
err
)
}
}
drainCmd
:=
newCommand
(
"/usr/bin/kubectl"
,
"drain"
,
"--ignore-daemonsets"
,
"--delete-local-data"
,
"--force"
,
nodeID
)
if
err
:=
drainCmd
.
Run
();
err
!=
nil
{
log
.
Fatalf
(
"Error invoking drain command: %v"
,
err
)
}
}
func
uncordon
(
nodeID
string
)
{
log
.
Infof
(
"Uncordoning node %s"
,
nodeID
)
uncordonCmd
:=
newCommand
(
"/usr/bin/kubectl"
,
"uncordon"
,
nodeID
)
if
err
:=
uncordonCmd
.
Run
();
err
!=
nil
{
log
.
Fatalf
(
"Error invoking uncordon command: %v"
,
err
)
}
}
func
commandReboot
(
nodeID
string
)
{
log
.
Infof
(
"Commanding reboot"
)
if
slackHookURL
!=
""
{
if
err
:=
slack
.
NotifyReboot
(
slackHookURL
,
slackUsername
,
slackChannel
,
nodeID
);
err
!=
nil
{
log
.
Warnf
(
"Error notifying slack: %v"
,
err
)
}
}
// Relies on hostPID:true and privileged:true to enter host mount space
rebootCmd
:=
newCommand
(
"/usr/bin/nsenter"
,
"-m/proc/1/ns/mnt"
,
"/bin/systemctl"
,
"reboot"
)
if
err
:=
rebootCmd
.
Run
();
err
!=
nil
{
log
.
Fatalf
(
"Error invoking reboot command: %v"
,
err
)
}
}
func
maintainRebootRequiredMetric
(
nodeID
string
)
{
for
{
if
sentinelExists
()
{
rebootRequiredGauge
.
WithLabelValues
(
nodeID
)
.
Set
(
1
)
}
else
{
rebootRequiredGauge
.
WithLabelValues
(
nodeID
)
.
Set
(
0
)
}
time
.
Sleep
(
time
.
Minute
)
}
}
// nodeMeta is used to remember information across reboots
type nodeMeta struct {
	// Unschedulable records node.Spec.Unschedulable as read by
	// rebootAsRequired just before it acquires the lock. When the lock is
	// found held at start-up, the node is uncordoned only if this is false.
	Unschedulable bool `json:"unschedulable"`
}
func
rebootAsRequired
(
nodeID
string
,
window
*
timewindow
.
TimeWindow
)
{
config
,
err
:=
rest
.
InClusterConfig
()
if
err
!=
nil
{
log
.
Fatal
(
err
)
}
client
,
err
:=
kubernetes
.
NewForConfig
(
config
)
if
err
!=
nil
{
log
.
Fatal
(
err
)
}
lock
:=
daemonsetlock
.
New
(
client
,
nodeID
,
dsNamespace
,
dsName
,
lockAnnotation
)
nodeMeta
:=
nodeMeta
{}
if
holding
(
lock
,
&
nodeMeta
)
{
if
!
nodeMeta
.
Unschedulable
{
uncordon
(
nodeID
)
}
release
(
lock
)
}
source
:=
rand
.
NewSource
(
time
.
Now
()
.
UnixNano
())
tick
:=
delaytick
.
New
(
source
,
period
)
for
_
=
range
tick
{
if
window
.
Contains
(
time
.
Now
())
&&
rebootRequired
()
&&
!
rebootBlocked
(
client
,
nodeID
)
{
node
,
err
:=
client
.
CoreV1
()
.
Nodes
()
.
Get
(
nodeID
,
metav1
.
GetOptions
{})
if
err
!=
nil
{
log
.
Fatal
(
err
)
}
nodeMeta
.
Unschedulable
=
node
.
Spec
.
Unschedulable
if
acquire
(
lock
,
&
nodeMeta
)
{
if
!
nodeMeta
.
Unschedulable
{
drain
(
nodeID
)
}
commandReboot
(
nodeID
)
for
{
log
.
Infof
(
"Waiting for reboot"
)
time
.
Sleep
(
time
.
Minute
)
}
}
}
}
}
func
root
(
cmd
*
cobra
.
Command
,
args
[]
string
)
{
log
.
Infof
(
"Kubernetes Reboot Daemon: %s"
,
version
)
nodeID
:=
os
.
Getenv
(
"KURED_NODE_ID"
)
if
nodeID
==
""
{
log
.
Fatal
(
"KURED_NODE_ID environment variable required"
)
}
window
,
err
:=
timewindow
.
New
(
rebootDays
,
rebootStart
,
rebootEnd
,
timezone
)
if
err
!=
nil
{
log
.
Fatalf
(
"Failed to build time window: %v"
,
err
)
}
log
.
Infof
(
"Node ID: %s"
,
nodeID
)
log
.
Infof
(
"Lock Annotation: %s/%s:%s"
,
dsNamespace
,
dsName
,
lockAnnotation
)
log
.
Infof
(
"Reboot Sentinel: %s every %v"
,
rebootSentinel
,
period
)
log
.
Infof
(
"Blocking Pod Selectors: %v"
,
podSelectors
)
log
.
Infof
(
"Reboot on: %v"
,
window
)
go
rebootAsRequired
(
nodeID
,
window
)
go
maintainRebootRequiredMetric
(
nodeID
)
http
.
Handle
(
"/metrics"
,
promhttp
.
Handler
())
log
.
Fatal
(
http
.
ListenAndServe
(
":8080"
,
nil
))
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment