# Severity levels, from most to least severe: red, orange, yellow, blue
groups:
- name: jvm-alerting
  rules:

  # Instance down for more than 30 seconds
  - alert: instance-down
    expr: up == 0
    for: 30s
    labels:
      severity: yellow
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 30 seconds."

  # Instance down for more than 1 minute
  - alert: instance-down
    expr: up == 0
    for: 1m
    labels:
      severity: orange
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."

  # Instance down for more than 5 minutes
  - alert: instance-down
    expr: up == 0
    for: 5m
    labels:
      severity: red
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

  # Heap usage above 50%
  - alert: heap-usage-too-much
    expr: jvm_memory_bytes_used{job="java", area="heap"} / jvm_memory_bytes_max * 100 > 50
    for: 1m
    labels:
      severity: yellow
    annotations:
      summary: "JVM Instance {{ $labels.instance }} memory usage > 50%"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [heap usage > 50%] for more than 1 minute. current usage ({{ $value }}%)"

  # Heap usage above 80%
  - alert: heap-usage-too-much
    expr: jvm_memory_bytes_used{job="java", area="heap"} / jvm_memory_bytes_max * 100 > 80
    for: 1m
    labels:
      severity: orange
    annotations:
      summary: "JVM Instance {{ $labels.instance }} memory usage > 80%"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [heap usage > 80%] for more than 1 minute. current usage ({{ $value }}%)"

  # Heap usage above 90%
  - alert: heap-usage-too-much
    expr: jvm_memory_bytes_used{job="java", area="heap"} / jvm_memory_bytes_max * 100 > 90
    for: 1m
    labels:
      severity: red
    annotations:
      summary: "JVM Instance {{ $labels.instance }} memory usage > 90%"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [heap usage > 90%] for more than 1 minute. current usage ({{ $value }}%)"

  # Old GC took more than 30% of the last 5 minutes
  - alert: old-gc-time-too-much
    expr: increase(jvm_gc_collection_seconds_sum{gc="PS MarkSweep"}[5m]) > 5 * 60 * 0.3
    for: 5m
    labels:
      severity: yellow
    annotations:
      summary: "JVM Instance {{ $labels.instance }} Old GC time > 30% running time"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [Old GC time > 30% running time] for more than 5 minutes. current seconds ({{ $value }})"

  # Old GC took more than 50% of the last 5 minutes
  - alert: old-gc-time-too-much
    expr: increase(jvm_gc_collection_seconds_sum{gc="PS MarkSweep"}[5m]) > 5 * 60 * 0.5
    for: 5m
    labels:
      severity: orange
    annotations:
      summary: "JVM Instance {{ $labels.instance }} Old GC time > 50% running time"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [Old GC time > 50% running time] for more than 5 minutes. current seconds ({{ $value }})"

  # Old GC took more than 80% of the last 5 minutes
  - alert: old-gc-time-too-much
    expr: increase(jvm_gc_collection_seconds_sum{gc="PS MarkSweep"}[5m]) > 5 * 60 * 0.8
    for: 5m
    labels:
      severity: red
    annotations:
      summary: "JVM Instance {{ $labels.instance }} Old GC time > 80% running time"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [Old GC time > 80% running time] for more than 5 minutes. current seconds ({{ $value }})"
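These rules only take effect once Prometheus loads the file and forwards fired alerts to Alertmanager. The fragment below is a minimal sketch of that wiring, assuming the rules are saved as `jvm-alerting-rules.yml`, Alertmanager listens on `alertmanager:9093`, and the JVM metrics are scraped under the job name `java` (which the heap/GC expressions above select via `job="java"`); the file name, target addresses, and ports are placeholders, not part of the original setup.

```yaml
# prometheus.yml (fragment) -- file name, targets, and ports are assumptions
global:
  scrape_interval: 15s
  evaluation_interval: 15s          # how often the alerting rules are evaluated

# Load the alerting rules shown above (hypothetical file name).
rule_files:
  - 'jvm-alerting-rules.yml'

# Forward fired alerts to Alertmanager (adjust the address to your deployment).
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# The heap/GC rules filter on job="java", so the scrape job must use that name.
scrape_configs:
  - job_name: 'java'
    static_configs:
      - targets: ['jvm-app:8080']   # hypothetical endpoint exposing JVM metrics
```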
global:
  smtp_smarthost: '<smtp.host:ip>'
  smtp_from: '<from>'
  smtp_auth_username: '<username>'
  smtp_auth_password: '<password>'

# The directory from which notification templates are read.
templates:
- '/alertmanager-config/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  group_by: ['alertname', 'instance']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification.
  # This ensures that multiple alerts for the same group that start firing
  # shortly after one another are batched together in the first notification.
  group_wait: 30s

  # Once the first notification has been sent, wait 'group_interval' before
  # sending a batch of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has already been sent successfully, wait 'repeat_interval'
  # before resending it.
  repeat_interval: 3h

  # A default receiver
  receiver: "user-a"

# Inhibition rules allow muting a set of alerts while another alert is firing.
# We use this to mute any lower-severity notifications if the same alert is
# already firing at a higher severity.
inhibit_rules:
- source_match:
    severity: 'red'
  target_match_re:
    severity: ^(blue|yellow|orange)$
  # Apply inhibition only if the alertname and instance are the same.
  equal: ['alertname', 'instance']
- source_match:
    severity: 'orange'
  target_match_re:
    severity: ^(blue|yellow)$
  # Apply inhibition only if the alertname and instance are the same.
  equal: ['alertname', 'instance']
- source_match:
    severity: 'yellow'
  target_match_re:
    severity: ^(blue)$
  # Apply inhibition only if the alertname and instance are the same.
  equal: ['alertname', 'instance']

receivers:
- name: 'user-a'
  email_configs:
  - to: '<user-a@domain.com>'
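The root route above delivers every alert to user-a. If the most severe (red) alerts should additionally reach a dedicated address, Alertmanager supports child routes that match on the severity label. The sketch below only illustrates that option and is not part of the original setup; the 'oncall' receiver and its address are made up.

```yaml
# Sketch: route red alerts to a hypothetical second receiver.
route:
  group_by: ['alertname', 'instance']
  receiver: 'user-a'                 # default receiver, as above
  routes:
    - match:
        severity: 'red'
      receiver: 'oncall'             # hypothetical receiver for red alerts
      repeat_interval: 1h            # resend the most severe alerts more often

receivers:
- name: 'user-a'
  email_configs:
  - to: '<user-a@domain.com>'
- name: 'oncall'
  email_configs:
  - to: '<oncall@domain.com>'        # placeholder address
```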