# AegisBPF Prometheus Alert Rules
# Import this file into your Prometheus configuration
#
# Example prometheus.yml:
#   rule_files:
#     - /etc/prometheus/rules/aegisbpf.yml

# Top-level list of rule groups that Prometheus loads from this file.
groups:
  # Rule group containing the AegisBPF alert rules listed below.
  - name: aegisbpf
    # Evaluation interval: how often Prometheus evaluates the rules in this group.
    interval: 30s
    rules:
      # Agent availability.
      # Fires when the scrape-health series `up` for job "aegisbpf" has been 0
      # for 2 consecutive minutes.
      - alert: AegisBPFAgentDown
        expr: 'up{job="aegisbpf"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'AegisBPF agent is down'
          description: >-
            AegisBPF agent on {{ $labels.instance }} has been unreachable
            for more than 2 minutes.
          runbook: 'docs/runbooks/INCIDENT_agent_crash.md'

      # High block rate - potential attack or misconfiguration.
      # rate() returns a per-second average, so the threshold below means
      # "more than 100 blocks per second, averaged over 5 minutes". The
      # annotation text reports the value in the same per-second unit.
      - alert: AegisBPFHighBlockRate
        expr: rate(aegisbpf_blocks_total[5m]) > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High execution block rate detected"
          description: "AegisBPF on {{ $labels.instance }} is blocking more than 100 executions per second. Current rate: {{ $value | printf \"%.2f\" }}/s."
          runbook: "docs/runbooks/ALERT_high_block_rate.md"

      # Very high block rate - likely attack.
      # rate() returns a per-second average, so the threshold below means
      # "more than 1000 blocks per second, averaged over 1 minute". The
      # annotation text reports the value in the same per-second unit.
      # NOTE(review): rate() needs at least two samples inside the 1m window;
      # confirm the scrape interval for this job is comfortably below 1m.
      - alert: AegisBPFVeryHighBlockRate
        expr: rate(aegisbpf_blocks_total[1m]) > 1000
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Very high execution block rate - possible attack"
          description: "AegisBPF on {{ $labels.instance }} is blocking more than 1000 executions per second. Current rate: {{ $value | printf \"%.2f\" }}/s. Investigate immediately."
          runbook: "docs/runbooks/ALERT_high_block_rate.md"

      # Ring buffer drops - events are being lost.
      # Any non-zero per-second drop rate (5m average) sustained for 5 minutes
      # raises a warning.
      - alert: AegisBPFRingbufDrops
        expr: 'rate(aegisbpf_ringbuf_drops_total[5m]) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'AegisBPF ring buffer is dropping events'
          description: >-
            AegisBPF on {{ $labels.instance }} is losing events due to ring
            buffer overflow. Drop rate: {{ $value | printf "%.2f" }}/s.
            Consider increasing buffer size or reducing event volume.
          runbook: 'docs/runbooks/ALERT_ringbuf_drops.md'

      # SLO violation: event loss ratio > 0.1%.
      # Ratio = drop rate / (block rate + drop rate + 1), all over 5m windows.
      # The constant 1 keeps the denominator away from zero.
      # NOTE(review): the constant also lowers the computed ratio whenever the
      # combined event rate is small - presumably intentional damping; confirm.
      - alert: AegisBPFEventLossSLOViolation
        expr: >-
          rate(aegisbpf_ringbuf_drops_total[5m])
          / (rate(aegisbpf_blocks_total[5m]) + rate(aegisbpf_ringbuf_drops_total[5m]) + 1)
          > 0.001
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'AegisBPF event loss SLO violated'
          description: >-
            AegisBPF on {{ $labels.instance }} exceeds the 0.1% event loss
            SLO for 15m.
          runbook: 'docs/runbooks/ALERT_ringbuf_drops.md'

      # High ring buffer drop rate.
      # Critical once the 1m-average drop rate stays above 100 events per
      # second for a full minute.
      - alert: AegisBPFHighRingbufDrops
        expr: 'rate(aegisbpf_ringbuf_drops_total[1m]) > 100'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'AegisBPF high ring buffer drop rate'
          description: >-
            AegisBPF on {{ $labels.instance }} is losing more than 100
            events/second. Security monitoring may be compromised.
          runbook: 'docs/runbooks/ALERT_ringbuf_drops.md'

      # Empty policy - no deny rules configured.
      # Both entry gauges (inode and path) must read 0 on the same target for
      # 10 minutes before this informational alert fires.
      - alert: AegisBPFPolicyEmpty
        expr: >-
          aegisbpf_deny_inode_entries == 0
          and aegisbpf_deny_path_entries == 0
        for: 10m
        labels:
          severity: info
        annotations:
          summary: 'AegisBPF policy is empty'
          description: >-
            AegisBPF on {{ $labels.instance }} has no deny rules configured.
            The agent is running but not blocking anything.
          runbook: 'docs/runbooks/MAINTENANCE_policy_update.md'

      # Policy too large - approaching map limits.
      # Fires when either entry gauge (inode or path) exceeds 9000 for
      # 5 minutes; $value is the count of whichever series matched.
      - alert: AegisBPFPolicyNearLimit
        expr: >-
          aegisbpf_deny_inode_entries > 9000
          or aegisbpf_deny_path_entries > 9000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'AegisBPF policy approaching size limit'
          description: >-
            AegisBPF on {{ $labels.instance }} has {{ $value }} entries,
            approaching the 10,000 entry limit. Consider consolidating rules.
          runbook: 'docs/runbooks/MAINTENANCE_policy_update.md'

      # Network block spikes (connect/bind).
      # The per-second block rate is summed per (instance, type) pair, so each
      # type on each instance is compared against the 25/s threshold separately.
      - alert: AegisBPFNetworkBlockRate
        expr: >-
          sum by (instance, type) (rate(aegisbpf_net_blocks_total[5m]))
          > 25
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High network block rate ({{ $labels.type }})'
          description: >-
            AegisBPF on {{ $labels.instance }} is blocking >25 network
            operations/sec for type={{ $labels.type }}.
          runbook: 'docs/runbooks/ALERT_high_block_rate.md'

      # Network event drops.
      # Any non-zero per-second drop rate on the network ring buffer
      # (5m average) sustained for 5 minutes raises a warning.
      - alert: AegisBPFNetworkRingbufDrops
        expr: 'rate(aegisbpf_net_ringbuf_drops_total[5m]) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'AegisBPF network ring buffer is dropping events'
          description: >-
            AegisBPF on {{ $labels.instance }} is dropping network events.
            Drop rate: {{ $value | printf "%.2f" }}/s.
          runbook: 'docs/runbooks/ALERT_ringbuf_drops.md'

      # SLO violation: network event loss ratio > 0.1%.
      # Ratio = drop rate / (block rate + drop rate + 1), all over 5m windows.
      # Every operand is aggregated with `sum by (instance)` so that all sides
      # of the binary operators carry the same label set. Without this, the
      # aggregated block rate (only `instance`) would not match the raw drop
      # rate (which also carries target labels such as `job`), the expression
      # would return no series, and the alert could never fire.
      # The constant 1 keeps the denominator away from zero.
      - alert: AegisBPFNetworkEventLossSLOViolation
        expr: >-
          sum by (instance) (rate(aegisbpf_net_ringbuf_drops_total[5m]))
          / (sum by (instance) (rate(aegisbpf_net_blocks_total[5m]))
          + sum by (instance) (rate(aegisbpf_net_ringbuf_drops_total[5m]))
          + 1)
          > 0.001
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "AegisBPF network event loss SLO violated"
          description: "AegisBPF on {{ $labels.instance }} exceeds the 0.1% network event loss SLO for 15m."
          runbook: "docs/runbooks/ALERT_ringbuf_drops.md"

      # Agent metrics stale - scrape working but counters not updating.
      # Condition: the block counter did not change at all in the trailing
      # 30m window while at least one inode deny entry is loaded. With the
      # additional `for: 30m`, the quiet period before firing is roughly 60m.
      # NOTE(review): only aegisbpf_deny_inode_entries is checked here; a
      # policy made solely of path entries would not match - confirm intent.
      - alert: AegisBPFMetricsStale
        expr: >-
          changes(aegisbpf_blocks_total[30m]) == 0
          and aegisbpf_deny_inode_entries > 0
        for: 30m
        labels:
          severity: info
        annotations:
          summary: 'AegisBPF metrics appear stale'
          description: >-
            AegisBPF on {{ $labels.instance }} has a policy configured but no
            block events in 30 minutes. Verify the agent is functioning
            correctly.
          runbook: 'docs/runbooks/INCIDENT_agent_crash.md'
