Unverified Commit 718faf4d authored by Christian Kotzbauer's avatar Christian Kotzbauer Committed by GitHub
Browse files

Merge branch 'feature/helm-1.8.0' into prometheus-alert-firing-option-chart

parents afac9d43 7c33ad8b
......@@ -2,12 +2,12 @@ kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
image: "kindest/node:v1.20.2"
image: "kindest/node:v1.20.7"
- role: control-plane
image: "kindest/node:v1.20.2"
image: "kindest/node:v1.20.7"
- role: control-plane
image: "kindest/node:v1.20.2"
image: "kindest/node:v1.20.7"
- role: worker
image: "kindest/node:v1.20.2"
image: "kindest/node:v1.20.7"
- role: worker
image: "kindest/node:v1.20.2"
image: "kindest/node:v1.20.7"
......@@ -2,12 +2,12 @@ kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
image: kindest/node:v1.21.1
image: kindest/node:v1.21.2
- role: control-plane
image: kindest/node:v1.21.1
image: kindest/node:v1.21.2
- role: control-plane
image: kindest/node:v1.21.1
image: kindest/node:v1.21.2
- role: worker
image: kindest/node:v1.21.1
image: kindest/node:v1.21.2
- role: worker
image: kindest/node:v1.21.1
image: kindest/node:v1.21.2
......@@ -2,12 +2,12 @@ kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
image: kindest/node:v1.19.7
image: kindest/node:v1.22.0
- role: control-plane
image: kindest/node:v1.19.7
image: kindest/node:v1.22.0
- role: control-plane
image: kindest/node:v1.19.7
image: kindest/node:v1.22.0
- role: worker
image: kindest/node:v1.19.7
image: kindest/node:v1.22.0
- role: worker
image: kindest/node:v1.19.7
image: kindest/node:v1.22.0
......@@ -69,7 +69,7 @@ jobs:
kubectl describe ds kured
- name: Test if successful deploy
uses: nick-invision/retry@v2.4.1
uses: nick-invision/retry@v2.5.0
with:
timeout_minutes: 10
max_attempts: 10
......
......@@ -14,7 +14,7 @@ jobs:
run: go test -json ./... > test.json
- name: Annotate tests
if: always()
uses: guyarb/golang-test-annoations@v0.4.0
uses: guyarb/golang-test-annoations@v0.5.0
with:
test-results: test.json
......@@ -97,9 +97,9 @@ jobs:
fail-fast: false
matrix:
kubernetes:
- "1.19"
- "1.20"
- "1.21"
- "1.22"
steps:
- uses: actions/checkout@v2
- name: Find go version
......@@ -144,7 +144,7 @@ jobs:
kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml
- name: Ensure kured is ready
uses: nick-invision/retry@v2.4.1
uses: nick-invision/retry@v2.5.0
with:
timeout_minutes: 10
max_attempts: 10
......@@ -217,7 +217,7 @@ jobs:
kubectl describe ds kured
- name: Ensure kured is ready
uses: nick-invision/retry@v2.4.1
uses: nick-invision/retry@v2.5.0
with:
timeout_minutes: 10
max_attempts: 10
......@@ -226,7 +226,7 @@ jobs:
command: "kubectl get ds kured | grep -E 'kured.*1.*1.*1.*1.*1' "
- name: Get metrics (healthy)
uses: nick-invision/retry@v2.4.1
uses: nick-invision/retry@v2.5.0
with:
timeout_minutes: 2
max_attempts: 12
......@@ -238,7 +238,7 @@ jobs:
./tests/kind/create-reboot-sentinels.sh
- name: Get metrics (need reboot)
uses: nick-invision/retry@v2.4.1
uses: nick-invision/retry@v2.5.0
with:
timeout_minutes: 15
max_attempts: 10
......@@ -308,7 +308,7 @@ jobs:
# kubectl describe ds kured
#
# - name: Ensure kured is ready
# uses: nick-invision/retry@v2.4.1
# uses: nick-invision/retry@v2.5.0
# with:
# timeout_minutes: 10
# max_attempts: 10
......
......@@ -15,7 +15,7 @@ jobs:
run: go test -json ./... > test.json
- name: Annotate tests
if: always()
uses: guyarb/golang-test-annoations@v0.4.0
uses: guyarb/golang-test-annoations@v0.5.0
with:
test-results: test.json
......@@ -32,7 +32,7 @@ jobs:
stale-pr-message: 'This PR was automatically considered stale due to lack of activity. Please refresh it and/or join our slack channels to highlight it, before it automatically closes (in 7 days).'
stale-issue-label: 'no-issue-activity'
stale-pr-label: 'no-pr-activity'
exempt-issue-labels: ['good first issue', 'keep']
exempt-issue-labels: 'good first issue,keep'
days-before-close: 21
check-docs-links:
......@@ -74,9 +74,9 @@ jobs:
strategy:
matrix:
kubernetes:
- "1.19"
- "1.20"
- "1.21"
- "1.22"
steps:
- uses: actions/checkout@v2
- name: Find go version
......@@ -117,7 +117,7 @@ jobs:
kubectl describe ds kured
- name: Ensure kured is ready
uses: nick-invision/retry@v2.4.1
uses: nick-invision/retry@v2.5.0
with:
timeout_minutes: 10
max_attempts: 10
......
......@@ -187,19 +187,12 @@ Check that `README.md` has an updated compatibility matrix and that the
url in the `kubectl` incantation (under "Installation") is updated to the
new version you want to release.
### Create a tag on the repo and publish the image
### Create a tag on the repo
Before going further, we should freeze the code for a release, by
tagging the code, and publishing its immutable artifact: the kured
docker image.
tagging the code. The GitHub Action should start a new job and push
the new image to the registry.
```sh
make DH_ORG="weaveworks" VERSION="1.3.0" image
```
Then docker push the image. In the future, that might be automatically
done when creating a tag on the repository, with the help of github
actions.
### Create the combined manifest
......@@ -237,3 +230,6 @@ A change in the helm chart requires a bump of the `version`
in `charts/kured/Chart.yaml` (following the versioning rules).
Update it, and issue a PR. Upon merge, that PR will automatically
publish the chart to the gh-pages branch.
When there are open helm-chart PRs which are on hold until the helm-chart has been updated
with the new kured version, they can be merged now (unless a rebase is needed from the contributor).
......@@ -2,3 +2,4 @@ Christian Kotzbauer <christian.kotzbauer@gmail.com> (@ckotzbauer)
Daniel Holbach <daniel@weave.works> (@dholbach)
Hidde Beydals <hidde@weave.works> (@hiddeco)
Jean-Philippe Evrard <jean-philippe.evrard@suse.com> (@evrardjp)
Jack Francis <jackfrancis@gmail.com> (@jackfrancis)
......@@ -48,7 +48,7 @@ server:
| kured | kubectl | k8s.io/client-go | k8s.io/apimachinery | expected kubernetes compatibility |
|-------|---------|------------------|---------------------|-----------------------------------|
| main | 1.20.5 | v0.20.5 | v0.20.5 | 1.19.x, 1.20.x, 1.21.x |
| main | 1.21.4 | v0.21.4 | v0.21.4 | 1.20.x, 1.21.x, 1.22.x |
| 1.7.0 | 1.20.5 | v0.20.5 | v0.20.5 | 1.19.x, 1.20.x, 1.21.x |
| 1.6.1 | 1.19.4 | v0.19.4 | v0.19.4 | 1.18.x, 1.19.x, 1.20.x |
| 1.5.1 | 1.18.8 | v0.18.8 | v0.18.8 | 1.17.x, 1.18.x, 1.19.x |
......@@ -85,6 +85,7 @@ The following arguments can be passed to kured via the daemonset pod template:
```console
Flags:
--alert-filter-regexp regexp.Regexp alert names to ignore when checking for active alerts
--alert-firing-only bool only consider firing alerts when checking for active alerts
--blocking-pod-selector stringArray label selector identifying pods whose presence should prevent reboots
--drain-grace-period int time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
--skip-wait-for-delete-timeout int when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
......@@ -166,6 +167,11 @@ will block reboots, however you can ignore specific alerts:
--alert-filter-regexp=^(RebootRequired|AnotherBenignAlert|...)$
```
You can also only block reboots for firing alerts:
```console
--alert-firing-only=true
```
See the section on Prometheus metrics for an important application of this
filter.
......
......@@ -2,7 +2,7 @@ apiVersion: v1
appVersion: "1.7.0"
description: A Helm chart for kured
name: kured
version: 2.9.0
version: 2.10.0
home: https://github.com/weaveworks/kured
maintainers:
- name: ckotzbauer
......
......@@ -39,7 +39,7 @@ The following changes have been made compared to the stable chart:
| `image.tag` | Image tag | `1.7.0` |
| `image.pullPolicy` | Image pull policy | `IfNotPresent` |
| `image.pullSecrets` | Image pull secrets | `[]` |
| `updateStrategy` | Daemonset update strategy | `OnDelete` |
| `updateStrategy` | Daemonset update strategy | `RollingUpdate` |
| `maxUnavailable` | The max pods unavailable during a rolling update | `1` |
| `podAnnotations` | Annotations to apply to pods (eg to add Prometheus annotations) | `{}` |
| `extraArgs` | Extra arguments to pass to `/usr/bin/kured`. See below. | `{}` |
......@@ -52,6 +52,10 @@ The following changes have been made compared to the stable chart:
| `configuration.endTime` | cli-parameter `--end-time` | `""` |
| `configuration.lockAnnotation` | cli-parameter `--lock-annotation` | `""` |
| `configuration.period` | cli-parameter `--period` | `""` |
| `configuration.forceReboot` | cli-parameter `--force-reboot` | `false` |
| `configuration.drainGracePeriod` | cli-parameter `--drain-grace-period` | `""` |
| `configuration.drainTimeout` | cli-parameter `--drain-timeout` | `""` |
| `configuration.skipWaitForDeleteTimeout` | cli-parameter `--skip-wait-for-delete-timeout` | `""` |
| `configuration.prometheusUrl` | cli-parameter `--prometheus-url` | `""` |
| `configuration.rebootDays` | Array of days for multiple cli-parameters `--reboot-days` | `[]` |
| `configuration.rebootSentinel` | cli-parameter `--reboot-sentinel` | `""` |
......
......@@ -76,6 +76,18 @@ spec:
{{- if .Values.configuration.period }}
- --period={{ .Values.configuration.period }}
{{- end }}
{{- if .Values.configuration.forceReboot }}
- --force-reboot
{{- end }}
{{- if .Values.configuration.drainGracePeriod }}
- --drain-grace-period={{ .Values.configuration.drainGracePeriod }}
{{- end }}
{{- if .Values.configuration.drainTimeout }}
- --drain-timeout={{ .Values.configuration.drainTimeout }}
{{- end }}
{{- if .Values.configuration.skipWaitForDeleteTimeout }}
- --skip-wait-for-delete-timeout={{ .Values.configuration.skipWaitForDeleteTimeout }}
{{- end }}
{{- if .Values.configuration.prometheusUrl }}
- --prometheus-url={{ .Values.configuration.prometheusUrl }}
{{- end }}
......
......@@ -3,13 +3,17 @@ image:
tag: latest
configuration:
# annotationTtl: 0 # force clean annotation after this ammount of time (default 0, disabled)
# annotationTtl: 0 # force clean annotation after this amount of time (default 0, disabled)
# alertFilterRegexp: "" # alert names to ignore when checking for active alerts
# alertFiringOnly: false # only consider firing alerts when checking for active alerts
# blockingPodSelector: [] # label selector identifying pods whose presence should prevent reboots
# endTime: "" # only reboot before this time of day (default "23:59")
# lockAnnotation: "" # annotation in which to record locking node (default "weave.works/kured-node-lock")
period: "1m" # reboot check period (default 1h0m0s)
# forceReboot: false # force a reboot even if the drain fails or times out (default: false)
# drainGracePeriod: "" # time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
# drainTimeout: "" # timeout after which the drain is aborted (default: 0, infinite time)
# skipWaitForDeleteTimeout: "" # when time is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
# prometheusUrl: "" # Prometheus instance to probe for active alerts
# rebootDays: [] # only reboot on these days (default [su,mo,tu,we,th,fr,sa])
# rebootSentinel: "" # path to file whose existence signals need to reboot (default "/var/run/reboot-required")
......
......@@ -4,7 +4,7 @@ image:
pullPolicy: IfNotPresent
pullSecrets: []
updateStrategy: OnDelete
updateStrategy: RollingUpdate
# requires RollingUpdate updateStrategy
maxUnavailable: 1
......@@ -22,13 +22,17 @@ extraEnvVars:
# value: 123
configuration:
lockTtl: 0 # force clean annotation after this ammount of time (default 0, disabled)
lockTtl: 0 # force clean annotation after this amount of time (default 0, disabled)
alertFilterRegexp: "" # alert names to ignore when checking for active alerts
alertFiringOnly: false # only consider firing alerts when checking for active alerts
blockingPodSelector: [] # label selector identifying pods whose presence should prevent reboots
endTime: "" # only reboot before this time of day (default "23:59")
lockAnnotation: "" # annotation in which to record locking node (default "weave.works/kured-node-lock")
period: "" # reboot check period (default 1h0m0s)
forceReboot: false # force a reboot even if the drain fails or times out (default: false)
drainGracePeriod: "" # time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)
drainTimeout: "" # timeout after which the drain is aborted (default: 0, infinite time)
skipWaitForDeleteTimeout: "" # when time is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)
prometheusUrl: "" # Prometheus instance to probe for active alerts
rebootDays: [] # only reboot on these days (default [su,mo,tu,we,th,fr,sa])
rebootSentinel: "" # path to file whose existence signals need to reboot (default "/var/run/reboot-required")
......
FROM alpine:3.13
FROM alpine:3.14
RUN apk update --no-cache && apk upgrade --no-cache && apk add --no-cache ca-certificates tzdata
COPY ./kured /usr/bin/kured
ENTRYPOINT ["/usr/bin/kured"]
......@@ -41,6 +41,7 @@ var (
// Command line flags
forceReboot bool
drainTimeout time.Duration
rebootDelay time.Duration
period time.Duration
drainGracePeriod int
skipWaitForDeleteTimeoutSeconds int
......@@ -52,6 +53,7 @@ var (
prometheusURL string
preferNoScheduleTaintName string
alertFilter *regexp.Regexp
alertFiringOnly bool
rebootSentinelFile string
rebootSentinelCommand string
notifyURL string
......@@ -98,13 +100,15 @@ func main() {
Run: root}
rootCmd.PersistentFlags().BoolVar(&forceReboot, "force-reboot", false,
"force a reboot even if the drain is still running (default: false)")
"force a reboot even if the drain fails or times out (default: false)")
rootCmd.PersistentFlags().IntVar(&drainGracePeriod, "drain-grace-period", -1,
"time in seconds given to each pod to terminate gracefully, if negative, the default value specified in the pod will be used (default: -1)")
rootCmd.PersistentFlags().IntVar(&skipWaitForDeleteTimeoutSeconds, "skip-wait-for-delete-timeout", 0,
"when seconds is greater than zero, skip waiting for the pods whose deletion timestamp is older than N seconds while draining a node (default: 0)")
rootCmd.PersistentFlags().DurationVar(&drainTimeout, "drain-timeout", 0,
"timeout after which the drain is aborted (default: 0, infinite time)")
rootCmd.PersistentFlags().DurationVar(&rebootDelay, "reboot-delay", 0,
"delay reboot for this duration (default: 0, disabled)")
rootCmd.PersistentFlags().DurationVar(&period, "period", time.Minute*60,
"sentinel check period")
rootCmd.PersistentFlags().StringVar(&dsNamespace, "ds-namespace", "kube-system",
......@@ -121,6 +125,8 @@ func main() {
"Prometheus instance to probe for active alerts")
rootCmd.PersistentFlags().Var(&regexpValue{&alertFilter}, "alert-filter-regexp",
"alert names to ignore when checking for active alerts")
rootCmd.PersistentFlags().BoolVar(&alertFiringOnly, "alert-firing-only", false,
"only consider firing alerts when checking for active alerts (default: false)")
rootCmd.PersistentFlags().StringVar(&rebootSentinelFile, "reboot-sentinel", "/var/run/reboot-required",
"path to file whose existence triggers the reboot command")
rootCmd.PersistentFlags().StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "",
......@@ -234,6 +240,8 @@ type PrometheusBlockingChecker struct {
promClient *alerts.PromClient
// regexp used to get alerts
filter *regexp.Regexp
// bool to indicate if only firing alerts should be considered
firingOnly bool
}
// KubernetesBlockingChecker contains info for connecting
......@@ -248,7 +256,7 @@ type KubernetesBlockingChecker struct {
func (pb PrometheusBlockingChecker) isBlocked() bool {
alertNames, err := pb.promClient.ActiveAlerts(pb.filter)
alertNames, err := pb.promClient.ActiveAlerts(pb.filter, pb.firingOnly)
if err != nil {
log.Warnf("Reboot blocked: prometheus query error: %v", err)
return true
......@@ -393,6 +401,7 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) {
Client: client,
ErrOut: os.Stderr,
Out: os.Stdout,
Ctx: context.Background(),
}
if err := kubectldrain.RunCordonOrUncordon(drainer, node, false); err != nil {
log.Fatalf("Error uncordonning %s: %v", nodename, err)
......@@ -540,7 +549,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
var blockCheckers []RebootBlocker
if prometheusURL != "" {
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter})
blockCheckers = append(blockCheckers, PrometheusBlockingChecker{promClient: promClient, filter: alertFilter, firingOnly: alertFiringOnly})
}
if podSelectors != nil {
blockCheckers = append(blockCheckers, KubernetesBlockingChecker{client: client, nodename: nodeID, filter: podSelectors})
......@@ -576,6 +585,12 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
}
drain(client, node)
if rebootDelay > 0 {
log.Infof("Delaying reboot for %v", rebootDelay)
time.Sleep(rebootDelay)
}
invokeReboot(nodeID, rebootCommand)
for {
log.Infof("Waiting for reboot")
......
......@@ -32,7 +32,7 @@ func Test_rebootBlocked(t *testing.T) {
if err != nil {
log.Fatal("Can't create prometheusClient: ", err)
}
brokenPrometheusClient := PrometheusBlockingChecker{promClient: promClient, filter: nil}
brokenPrometheusClient := PrometheusBlockingChecker{promClient: promClient, filter: nil, firingOnly: false}
type args struct {
blockers []RebootBlocker
......
This diff is collapsed.
......@@ -55,6 +55,7 @@ spec:
# - --lock-ttl=0
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
# - --alert-filter-regexp=^RebootRequired$
# - --alert-firing-only=false
# - --reboot-sentinel=/var/run/reboot-required
# - --prefer-no-schedule-taint=""
# - --reboot-sentinel-command=""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment