feat: add prometheusrule for metrics service
This commit is contained in:
parent
e474d5dff8
commit
8f7beabe4c
@ -0,0 +1,32 @@
|
|||||||
|
# Prometheus Alert Rule Config
|
||||||
|
|
||||||
|
Add `prometheusrule.yaml` to `<helm-pkg>/templates`.
|
||||||
|
See the example template below:
|
||||||
|
```
|
||||||
|
{{- /*
|
||||||
|
Copyright Broadcom, Inc. All Rights Reserved.
|
||||||
|
SPDX-License-Identifier: APACHE-2.0
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- if and .Values.metrics.enabled .Values.metrics.prometheusRule.enabled }}
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: PrometheusRule
|
||||||
|
metadata:
|
||||||
|
name: {{ include "common.names.fullname" . }}
|
||||||
|
namespace: {{ default (include "common.names.namespace" .) .Values.metrics.prometheusRule.namespace | quote}}
|
||||||
|
labels: {{- include "common.labels.standard" ( dict "customLabels" .Values.commonLabels "context" $ ) | nindent 4 }}
|
||||||
|
{{- if .Values.metrics.prometheusRule.additionalLabels }}
|
||||||
|
{{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- if .Values.commonAnnotations }}
|
||||||
|
annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
|
||||||
|
{{- end }}
|
||||||
|
spec:
|
||||||
|
groups:
|
||||||
|
{{- with .Values.metrics.prometheusRule.rules }}
|
||||||
|
- name: {{ template "common.names.name" $ }}
|
||||||
|
rules: {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
```
|
||||||
@ -0,0 +1,37 @@
|
|||||||
|
{{- /*
|
||||||
|
Copyright Broadcom, Inc. All Rights Reserved.
|
||||||
|
SPDX-License-Identifier: APACHE-2.0
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- if .Values.metrics.prometheusRule.enabled }}
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: PrometheusRule
|
||||||
|
metadata:
|
||||||
|
name: {{ .Values.metrics.prometheusRule.name }}
|
||||||
|
namespace: {{ .Values.metrics.prometheusRule.namespace | quote }}
|
||||||
|
{{- with .Values.metrics.prometheusRule.labels }}
|
||||||
|
labels:
|
||||||
|
{{- toYaml . | nindent 4 }}
|
||||||
|
{{- end }}
|
||||||
|
spec:
|
||||||
|
groups:
|
||||||
|
{{- with .Values.metrics.prometheusRule.rules }}
|
||||||
|
- name: {{ $.Values.metrics.prometheusRule.name }}
|
||||||
|
rules:
|
||||||
|
{{- range . }}
|
||||||
|
- alert: {{ .alert }}
|
||||||
|
expr: {{ .expr | quote }}
|
||||||
|
{{- if .for }}
|
||||||
|
for: {{ .for }}
|
||||||
|
{{- end }}
|
||||||
|
{{- if .labels }}
|
||||||
|
labels:
|
||||||
|
{{- toYaml .labels | nindent 12 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- if .annotations }}
|
||||||
|
annotations:
|
||||||
|
{{- toYaml .annotations | nindent 12 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
@ -81,3 +81,30 @@ metrics:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
prometheusRule:
|
||||||
|
name: freepeals-alpha-metrics
|
||||||
|
enabled: false
|
||||||
|
namespace: "freeleaps-monitoring-system"
|
||||||
|
rules:
|
||||||
|
- alert: FreeleapsMetricsServiceDown
|
||||||
|
expr: up{job="metrics-service"} == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: metrics-service
|
||||||
|
annotations:
|
||||||
|
summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
|
||||||
|
description: "Freeleaps Metrics service has been down for more than 1 minutes."
|
||||||
|
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
|
||||||
|
|
||||||
|
- alert: FreeleapsMetricsServiceHighErrorRate
|
||||||
|
expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: metrics-service
|
||||||
|
annotations:
|
||||||
|
summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
|
||||||
|
description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
|
||||||
|
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
|
||||||
|
|
||||||
|
|||||||
@ -81,3 +81,29 @@ metrics:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
prometheusRule:
|
||||||
|
name: freepeals-prod-metrics
|
||||||
|
enabled: true
|
||||||
|
namespace: "freeleaps-monitoring-system"
|
||||||
|
rules:
|
||||||
|
- alert: FreeleapsMetricsServiceDown
|
||||||
|
expr: up{job="metrics-service"} == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: metrics-service
|
||||||
|
annotations:
|
||||||
|
summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
|
||||||
|
description: "Freeleaps Metrics service has been down for more than 1 minutes."
|
||||||
|
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
|
||||||
|
|
||||||
|
- alert: FreeleapsMetricsServiceHighErrorRate
|
||||||
|
expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: metrics-service
|
||||||
|
annotations:
|
||||||
|
summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
|
||||||
|
description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
|
||||||
|
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
|
||||||
@ -55,12 +55,12 @@ metrics:
|
|||||||
port: 8009
|
port: 8009
|
||||||
targetPort: 8009
|
targetPort: 8009
|
||||||
serviceMonitor:
|
serviceMonitor:
|
||||||
enabled: false
|
enabled: true
|
||||||
labels:
|
labels:
|
||||||
release: kube-prometheus-stack
|
release: kube-prometheus-stack
|
||||||
namespace: freeleaps-monitoring-system
|
namespace: freeleaps-monitoring-system
|
||||||
internal: 30s
|
interval: 30s
|
||||||
scrapeTimeout: ''
|
scrapeTimeout: 10s
|
||||||
configs:
|
configs:
|
||||||
starrocksHost: ""
|
starrocksHost: ""
|
||||||
starrocksPort: 8009
|
starrocksPort: 8009
|
||||||
@ -80,3 +80,69 @@ metrics:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
prometheusRule:
|
||||||
|
name: freepeals-metrics
|
||||||
|
enabled: true
|
||||||
|
namespace: "freeleaps-monitoring-system"
|
||||||
|
rules:
|
||||||
|
- alert: FreeleapsMetricsServiceDown
|
||||||
|
expr: up{job="metrics-service"} == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: metrics-service
|
||||||
|
annotations:
|
||||||
|
summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
|
||||||
|
description: "Freeleaps Metrics service has been down for more than 1 minutes."
|
||||||
|
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
|
||||||
|
|
||||||
|
- alert: FreeleapsMetricsServiceHighErrorRate
|
||||||
|
expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: metrics-service
|
||||||
|
annotations:
|
||||||
|
summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
|
||||||
|
description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
|
||||||
|
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
|
||||||
|
|
||||||
|
# - alert: MetricsServiceHighLatency
|
||||||
|
# expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="metrics-service"}[5m])) > 1
|
||||||
|
# for: 5m
|
||||||
|
# labels:
|
||||||
|
# severity: warning
|
||||||
|
# service: metrics-service
|
||||||
|
# annotations:
|
||||||
|
# summary: "High latency in metrics service (instance {{ $labels.instance }})"
|
||||||
|
# description: "95th percentile latency is {{ $value }} seconds."
|
||||||
|
|
||||||
|
# - alert: MetricsServiceHighMemoryUsage
|
||||||
|
# expr: (process_resident_memory_bytes{job="metrics-service"} / 1024 / 1024) > 512
|
||||||
|
# for: 5m
|
||||||
|
# labels:
|
||||||
|
# severity: warning
|
||||||
|
# service: metrics
|
||||||
|
# annotations:
|
||||||
|
# summary: "High memory usage in metrics service (instance {{ $labels.instance }})"
|
||||||
|
# description: "Memory usage is {{ $value }} MB."
|
||||||
|
|
||||||
|
# - alert: MetricsServiceHighCPUUsage
|
||||||
|
# expr: rate(process_cpu_seconds_total{job="metrics-service"}[5m]) * 100 > 80
|
||||||
|
# for: 5m
|
||||||
|
# labels:
|
||||||
|
# severity: warning
|
||||||
|
# service: metrics
|
||||||
|
# annotations:
|
||||||
|
# summary: "High CPU usage in metrics service (instance {{ $labels.instance }})"
|
||||||
|
# description: "CPU usage is {{ $value }}%."
|
||||||
|
|
||||||
|
# - alert: MetricsServiceNoData
|
||||||
|
# expr: absent(up{job="metrics-service"})
|
||||||
|
# for: 5m
|
||||||
|
# labels:
|
||||||
|
# severity: critical
|
||||||
|
# service: metrics
|
||||||
|
# annotations:
|
||||||
|
# summary: "No data from metrics service (instance {{ $labels.instance }})"
|
||||||
|
# description: "No metrics data received from metrics service for more than 5 minutes."
|
||||||
|
|||||||
@ -10,6 +10,27 @@ executeFreeleapsPipeline {
|
|||||||
executeMode = 'fully'
|
executeMode = 'fully'
|
||||||
commitMessageLintEnabled = false
|
commitMessageLintEnabled = false
|
||||||
components = [
|
components = [
|
||||||
|
[
|
||||||
|
name: 'metrics',
|
||||||
|
root: 'apps/metrics',
|
||||||
|
language: 'python',
|
||||||
|
dependenciesManager: 'pip',
|
||||||
|
requirementsFile: 'requirements.txt',
|
||||||
|
buildCacheEnabled: true,
|
||||||
|
buildAgentImage: 'python:3.12-slim',
|
||||||
|
buildArtifacts: ['.'],
|
||||||
|
lintEnabled: false,
|
||||||
|
sastEnabled: false,
|
||||||
|
imageRegistry: 'docker.io',
|
||||||
|
imageRepository: 'freeleaps',
|
||||||
|
imageName: 'devops',
|
||||||
|
imageBuilder: 'dind',
|
||||||
|
dockerfilePath: 'Dockerfile',
|
||||||
|
imageBuildRoot: '.',
|
||||||
|
imageReleaseArchitectures: ['linux/amd64', 'linux/arm64/v8'],
|
||||||
|
registryCredentialsId: 'freeleaps-devops-docker-hub-credentials',
|
||||||
|
semanticReleaseEnabled: true
|
||||||
|
],
|
||||||
[
|
[
|
||||||
name: 'authentication',
|
name: 'authentication',
|
||||||
root: 'apps/authentication',
|
root: 'apps/authentication',
|
||||||
@ -135,27 +156,6 @@ executeFreeleapsPipeline {
|
|||||||
imageReleaseArchitectures: ['linux/amd64', 'linux/arm64/v8'],
|
imageReleaseArchitectures: ['linux/amd64', 'linux/arm64/v8'],
|
||||||
registryCredentialsId: 'freeleaps-devops-docker-hub-credentials',
|
registryCredentialsId: 'freeleaps-devops-docker-hub-credentials',
|
||||||
semanticReleaseEnabled: true
|
semanticReleaseEnabled: true
|
||||||
],
|
|
||||||
[
|
|
||||||
name: 'metrics',
|
|
||||||
root: 'apps/metrcis',
|
|
||||||
language: 'python',
|
|
||||||
dependenciesManager: 'pip',
|
|
||||||
requirementsFile: 'requirements.txt',
|
|
||||||
buildCacheEnabled: true,
|
|
||||||
buildAgentImage: 'python:3.12-slim',
|
|
||||||
buildArtifacts: ['.'],
|
|
||||||
lintEnabled: false,
|
|
||||||
sastEnabled: false,
|
|
||||||
imageRegistry: 'docker.io',
|
|
||||||
imageRepository: 'freeleaps',
|
|
||||||
imageName: 'devops',
|
|
||||||
imageBuilder: 'dind',
|
|
||||||
dockerfilePath: 'Dockerfile',
|
|
||||||
imageBuildRoot: '.',
|
|
||||||
imageReleaseArchitectures: ['linux/amd64', 'linux/arm64/v8'],
|
|
||||||
registryCredentialsId: 'freeleaps-devops-docker-hub-credentials',
|
|
||||||
semanticReleaseEnabled: true
|
|
||||||
]
|
]
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue
Block a user