---
# alert_rules.yml
# Prometheus alerting rules: a single rule group named `alert_rules`.
# The file-viewer residue (size banner, fused line-number gutter, per-line
# "N." prefixes) has been removed so the document parses as YAML again.
groups:
- name: alert_rules
  rules:
  # Warning when the non-idle CPU share of an instance stays above 60% for 2m.
  # The ratio is multiplied by 100 so the templated value agrees with the "60%"
  # wording of the description (the bare ratio would render as e.g. 0.63).
  # rate() replaces irate(): instant-rate jitter can keep resetting the `for`
  # timer, which is why rate() is the recommended function for alerts.
  - alert: CpuUsageAlertWarning
    expr: >-
      sum by (instance) (
      avg without (cpu) (rate(node_cpu_seconds_total{mode!="idle"}[5m]))
      ) * 100 > 60
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} CPU usage high"
      description: "{{ $labels.instance }} CPU usage above 60% (current value: {{ $value }})"
  # Serious when CPU busy percentage (100 minus the averaged idle share)
  # stays above 85 for 3m.
  # The former `job=~".*"` matcher was dropped: it matches every job and so
  # filtered nothing. rate() replaces irate() so short dips between two
  # samples do not reset the `for` timer.
  - alert: CpuUsageAlertSerious
    # Earlier formulation, kept for reference:
    # expr: sum(avg(irate(node_cpu_seconds_total{mode!='idle'}[5m])) without (cpu)) by (instance) > 0.85
    expr: >-
      (100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100))
      > 85
    for: 3m
    labels:
      level: serious
    annotations:
      summary: "Instance {{ $labels.instance }} CPU usage high"
      description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})"
  # Warning when memory usage of an instance stays above 70% for 2m.
  # "Used" is now derived from MemAvailable, the same basis the
  # MemUsageAlertSerious rule uses, so both severities measure the same
  # quantity. The previous expression (1 - (MemFree + Buffers + Cached) /
  # MemTotal) could disagree with it.
  # NOTE(review): assumes every monitored host exports
  # node_memory_MemAvailable_bytes — confirm for old kernels.
  - alert: MemUsageAlertWarning
    expr: >-
      avg by (instance) (
      (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
      ) > 70
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} MEM usage high"
      description: "{{$labels.instance}}: MEM usage is above 70% (current value is: {{ $value }})"
  # Serious when (MemTotal - MemAvailable) / MemTotal stays above 90% for 3m.
  # The ratio is scaled to percent so the templated value matches the "90%"
  # wording of the description; the firing condition itself is unchanged
  # (ratio > 0.90 is the same as percent > 90).
  - alert: MemUsageAlertSerious
    expr: >-
      (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
      / node_memory_MemTotal_bytes * 100 > 90
    for: 3m
    labels:
      level: serious
    annotations:
      summary: "Instance {{ $labels.instance }} MEM usage high"
      description: "{{ $labels.instance }} MEM usage above 90% (current value: {{ $value }})"
  # Warning when a filesystem is more than 80% full for 2m.
  # The expression yields one series per filesystem, so the description now
  # names the mountpoint; previously several filesystems on one instance
  # produced indistinguishable alert text.
  # NOTE(review): the mountpoint regex skips every path that starts with
  # /run, /var, /sys or /dev — confirm that /var is meant to be unmonitored.
  - alert: DiskUsageAlertWarning
    expr: >-
      (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"}
      / node_filesystem_size_bytes) * 100 > 80
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Disk usage high"
      description: "{{$labels.instance}}: Disk usage on {{ $labels.mountpoint }} is above 80% (current value is: {{ $value }})"
  # Serious when a filesystem is more than 90% full for 3m.
  # The expression yields one series per filesystem, so the description now
  # names the mountpoint; previously several filesystems on one instance
  # produced indistinguishable alert text.
  # NOTE(review): the mountpoint regex skips every path that starts with
  # /run, /var, /sys or /dev — confirm that /var is meant to be unmonitored.
  - alert: DiskUsageAlertSerious
    expr: >-
      (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"}
      / node_filesystem_size_bytes) * 100 > 90
    for: 3m
    labels:
      level: serious
    annotations:
      summary: "Instance {{ $labels.instance }} Disk usage high"
      description: "{{$labels.instance}}: Disk usage on {{ $labels.mountpoint }} is above 90% (current value is: {{ $value }})"
  # Warning when allocated file descriptors exceed 60% of the maximum for 2m.
  # The empty `{}` selectors were removed; they select the same series as the
  # bare metric names.
  - alert: NodeFileDescriptorUsage
    expr: avg by (instance) (node_filefd_allocated / node_filefd_maximum) * 100 > 60
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} File Descriptor usage high"
      description: "{{$labels.instance}}: File Descriptor usage is above 60% (current value is: {{ $value }})"
  # Warning when the 15-minute load average of an instance stays above 80
  # for 2m.
  # NOTE(review): the threshold is an absolute load value, not normalised by
  # CPU count — confirm 80 suits every monitored host size.
  - alert: NodeLoad15
    expr: avg by (instance) (node_load15) > 80
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Load15 usage high"
      description: "{{$labels.instance}}: Load15 is above 80 (current value is: {{ $value }})"
  # Warning when every `up` series of an instance has been 0 for 2m
  # (the per-instance average of `up` equals 0).
  # NOTE(review): the selector matches `up` for all jobs, while the
  # description speaks of the Node_Exporter agent only — confirm whether a
  # job matcher should be added.
  - alert: NodeAgentStatus
    expr: avg by (instance) (up) == 0
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "{{$labels.instance}}: has been down"
      description: "{{$labels.instance}}: Node_Exporter Agent is down (current value is: {{ $value }})"
  # Warning when node_procs_blocked of an instance stays above 10 for 2m.
  # The empty `{}` selector was removed; it selects the same series as the
  # bare metric name.
  - alert: NodeProcsBlocked
    expr: avg by (instance) (node_procs_blocked) > 10
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Process Blocked usage high"
      description: "{{$labels.instance}}: Node Blocked Procs detected! above 10 (current value is: {{ $value }})"
  # Warning when any non-loopback interface of an instance transmits more
  # than 40 Mbit/s (bytes/s / 1024 / 1024 * 8, floored) for 1m.
  # max replaces avg: averaging over every interface let idle devices pull a
  # saturated NIC below the threshold. Loopback is excluded so local traffic
  # cannot trigger the alert. rate() replaces irate() so a single quiet
  # sample pair does not reset the `for` timer.
  # The unit in the description was corrected from "Mbps/s" to "Mbps".
  - alert: NetworkTransmitRate
    # Earlier single-interface formulation, kept for reference:
    # expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50
    expr: >-
      max by (instance) (
      floor(rate(node_network_transmit_bytes_total{device!="lo"}[2m]) / 1024 / 1024 * 8)
      ) > 40
    for: 1m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Network Transmit Rate usage high"
      description: "{{$labels.instance}}: Node Transmit Rate (Upload) is above 40Mbps (current value is: {{ $value }}Mbps)"
  # Warning when any non-loopback interface of an instance receives more
  # than 40 Mbit/s (bytes/s / 1024 / 1024 * 8, floored) for 1m.
  # max replaces avg: averaging over every interface let idle devices pull a
  # saturated NIC below the threshold. Loopback is excluded so local traffic
  # cannot trigger the alert. rate() replaces irate() so a single quiet
  # sample pair does not reset the `for` timer.
  # The unit in the description was corrected from "Mbps/s" to "Mbps".
  - alert: NetworkReceiveRate
    # Earlier single-interface formulation, kept for reference:
    # expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50
    expr: >-
      max by (instance) (
      floor(rate(node_network_receive_bytes_total{device!="lo"}[2m]) / 1024 / 1024 * 8)
      ) > 40
    for: 1m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Network Receive Rate usage high"
      description: "{{$labels.instance}}: Node Receive Rate (Download) is above 40Mbps (current value is: {{ $value }}Mbps)"
  # Warning when the per-instance average disk read rate (bytes/s / 1024,
  # floored) stays above 200 for 2m.
  # rate() replaces irate() so one quiet sample pair does not reset the `for`
  # timer; the empty `{}` selector was removed.
  # NOTE(review): 200 KB/s is far below the 20 MB/s used by DiskWriteRate —
  # confirm the unit/threshold is intended.
  - alert: DiskReadRate
    expr: avg by (instance) (floor(rate(node_disk_read_bytes_total[2m]) / 1024)) > 200
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Disk Read Rate usage high"
      description: "{{$labels.instance}}: Node Disk Read Rate is above 200KB/s (current value is: {{ $value }}KB/s)"
  # Warning when the per-instance average disk write rate
  # (bytes/s / 1024 / 1024, floored) stays above 20 for 2m.
  # rate() replaces irate() so one quiet sample pair does not reset the `for`
  # timer; the empty `{}` selector was removed.
  - alert: DiskWriteRate
    expr: avg by (instance) (floor(rate(node_disk_written_bytes_total[2m]) / 1024 / 1024)) > 20
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Disk Write Rate usage high"
      description: "{{$labels.instance}}: Node Disk Write Rate is above 20MB/s (current value is: {{ $value }}MB/s)"