Skip to content

RabbitMQ 告警处理流程

概述

告警处理流程定义了从收到告警到问题解决的完整步骤。良好的告警处理流程可以确保问题得到及时有效的处理,最小化对业务的影响。本文将详细介绍告警处理流程的设计、实现和最佳实践。

核心知识点

告警处理流程

接收告警 -> 确认告警 -> 分析问题 -> 执行处理 -> 验证结果 -> 关闭告警

处理角色

| 角色 | 职责 |
| --- | --- |
| 一线值班 | 接收告警、初步响应 |
| 二线运维 | 问题分析、方案制定 |
| 三线专家 | 复杂问题、深层分析 |
| 架构师 | 重大故障、架构调整 |

SLA 定义

| 严重程度 | 响应时间 | 解决时间 |
| --- | --- | --- |
| Critical | 15 分钟 | 2 小时 |
| Warning | 1 小时 | 8 小时 |
| Info | 24 小时 | 无要求 |

配置示例

PHP 告警处理类

php
<?php

/**
 * Tracks the lifecycle of alerts: creation, escalation, acknowledgement,
 * resolution, notes and SLA evaluation.
 *
 * Records live in a key/value state store keyed by rule name, so each rule
 * has at most one record at a time.
 */
class AlertHandler
{
    /** Statuses of records that nobody has acknowledged yet. */
    private const UNACKNOWLEDGED_STATUSES = ['new', 'firing'];

    /** Statuses reported by getActiveAlerts(). */
    private const ACTIVE_STATUSES = ['new', 'acknowledged', 'firing'];

    /** SLA budgets in seconds per severity; "info" has no resolution deadline. */
    private const SLA_BY_SEVERITY = [
        'critical' => ['response_time' => 900, 'resolution_time' => 7200],
        'warning' => ['response_time' => 3600, 'resolution_time' => 28800],
        'info' => ['response_time' => 86400, 'resolution_time' => PHP_INT_MAX],
    ];

    private $stateStore;
    private $notifier;
    private $runbookResolver;
    private $autoActions;

    /**
     * @param object|null $stateStore Store exposing get($key), set($key, $value) and getAll();
     *                                defaults to a FileStateStore at /tmp/alert_handlers.json.
     * @param object|null $notifier   Optional object exposing send(); it is called with the payload
     *                                and, for escalation targets, a second target argument.
     *                                When null, every notification is skipped.
     */
    public function __construct($stateStore = null, $notifier = null)
    {
        $this->stateStore = $stateStore ?? new FileStateStore('/tmp/alert_handlers.json');
        $this->notifier = $notifier;
        $this->runbookResolver = new RunbookResolver();
        $this->autoActions = [];
    }

    /**
     * Routes an incoming alert payload to the matching lifecycle step.
     *
     * - no stored record: a new record is created;
     * - payload status "resolved", or stored record acknowledged: the record is updated
     *   (and resolved when the payload says so);
     * - stored record still unacknowledged ("new"/"firing"): the record is escalated,
     *   keeping its created_at, notes and escalation level;
     * - stored record already resolved and the alert fires again: a new record replaces it.
     *
     * @param array $alert Payload; must contain 'rule_name', may contain 'status', 'severity', 'name', 'message'.
     * @return array The stored record after processing.
     */
    public function handle(array $alert)
    {
        $existingAlert = $this->stateStore->get($alert['rule_name']);

        if (!$existingAlert) {
            return $this->createNewAlert($alert);
        }

        $incomingResolved = ($alert['status'] ?? null) === 'resolved';

        if ($incomingResolved || $existingAlert['status'] === 'acknowledged') {
            return $this->updateAlert($alert, $existingAlert);
        }

        if (in_array($existingAlert['status'], self::UNACKNOWLEDGED_STATUSES, true)) {
            return $this->escalateAlert($alert, $existingAlert);
        }

        return $this->createNewAlert($alert);
    }

    /**
     * Stores a fresh record with status "new" and sends the new-alert notification.
     */
    private function createNewAlert($alert)
    {
        $now = time();

        $handlerData = [
            'rule_name' => $alert['rule_name'],
            'alert' => $alert,
            'status' => 'new',
            'created_at' => $now,
            'updated_at' => $now,
            'acknowledged_by' => null,
            'acknowledged_at' => null,
            'resolved_by' => null,
            'resolved_at' => null,
            'notes' => [],
            'escalation_level' => 0,
        ];

        $this->stateStore->set($alert['rule_name'], $handlerData);

        $this->notifyNewAlert($alert);

        return $handlerData;
    }

    /**
     * Applies a payload to an existing record: marks it resolved by "system" when the
     * payload reports recovery, otherwise merges the payload into the stored alert.
     */
    private function updateAlert($newAlert, $existingAlert)
    {
        $now = time();
        $isRecovery = ($newAlert['status'] ?? null) === 'resolved'
            && $existingAlert['status'] !== 'resolved';

        if ($isRecovery) {
            $existingAlert['status'] = 'resolved';
            $existingAlert['resolved_at'] = $now;
            $existingAlert['resolved_by'] = 'system';
            $existingAlert['alert']['status'] = 'resolved';

            $this->notifyResolved($existingAlert['alert']);
        } else {
            $existingAlert['alert'] = array_merge($existingAlert['alert'], $newAlert);
        }

        $existingAlert['updated_at'] = $now;

        $this->stateStore->set($newAlert['rule_name'], $existingAlert);

        return $existingAlert;
    }

    /**
     * Raises the escalation level by one, runs the actions configured for that level,
     * sends the escalation notification and persists the record. The status is left as is.
     */
    private function escalateAlert($newAlert, $existingAlert)
    {
        $now = time();

        $existingAlert['escalation_level']++;
        $existingAlert['updated_at'] = $now;
        $existingAlert['last_escalated_at'] = $now;

        foreach ($this->getEscalationActions($existingAlert['escalation_level']) as $action) {
            $this->executeAction($action, $existingAlert);
        }

        $this->notifyEscalation($existingAlert);

        $this->stateStore->set($newAlert['rule_name'], $existingAlert);

        return $existingAlert;
    }

    /**
     * Marks the record as acknowledged by $user, optionally attaching a note.
     *
     * @return array ['success' => bool, 'alert' => array] or ['success' => false, 'message' => string].
     */
    public function acknowledge($ruleName, $user, $note = '')
    {
        $alert = $this->stateStore->get($ruleName);

        if (!$alert) {
            return ['success' => false, 'message' => 'Alert not found'];
        }

        $now = time();

        $alert['status'] = 'acknowledged';
        $alert['acknowledged_by'] = $user;
        $alert['acknowledged_at'] = $now;
        $alert['updated_at'] = $now;

        if ($note) {
            $alert['notes'][] = [
                'type' => 'acknowledge',
                'user' => $user,
                'note' => $note,
                'timestamp' => $now,
            ];
        }

        $this->stateStore->set($ruleName, $alert);

        return ['success' => true, 'alert' => $alert];
    }

    /**
     * Marks the record as resolved by $user and records the resolution text.
     * A note entry is added when either $note or $resolution is non-empty.
     *
     * @return array ['success' => bool, 'alert' => array] or ['success' => false, 'message' => string].
     */
    public function resolve($ruleName, $user, $resolution = '', $note = '')
    {
        $alert = $this->stateStore->get($ruleName);

        if (!$alert) {
            return ['success' => false, 'message' => 'Alert not found'];
        }

        $now = time();

        $alert['status'] = 'resolved';
        $alert['resolved_by'] = $user;
        $alert['resolved_at'] = $now;
        $alert['resolution'] = $resolution;
        $alert['updated_at'] = $now;

        if ($note || $resolution) {
            $alert['notes'][] = [
                'type' => 'resolve',
                'user' => $user,
                'note' => $note ?: $resolution,
                'timestamp' => $now,
            ];
        }

        $this->stateStore->set($ruleName, $alert);

        return ['success' => true, 'alert' => $alert];
    }

    /**
     * Appends a free-form comment to the record.
     *
     * @return array ['success' => bool, 'alert' => array] or ['success' => false, 'message' => string].
     */
    public function addNote($ruleName, $user, $note)
    {
        $alert = $this->stateStore->get($ruleName);

        if (!$alert) {
            return ['success' => false, 'message' => 'Alert not found'];
        }

        $now = time();

        $alert['notes'][] = [
            'type' => 'comment',
            'user' => $user,
            'note' => $note,
            'timestamp' => $now,
        ];
        $alert['updated_at'] = $now;

        $this->stateStore->set($ruleName, $alert);

        return ['success' => true, 'alert' => $alert];
    }

    /**
     * Returns every stored record whose status is new, acknowledged or firing.
     * Keys of the store are preserved.
     */
    public function getActiveAlerts()
    {
        return array_filter(
            $this->stateStore->getAll(),
            static fn ($alert) => in_array($alert['status'], self::ACTIVE_STATUSES, true)
        );
    }

    /**
     * Returns every stored record created within the last $days days.
     * Keys of the store are preserved.
     */
    public function getAlertHistory($days = 30)
    {
        $threshold = time() - ($days * 86400);

        return array_filter(
            $this->stateStore->getAll(),
            static fn ($alert) => $alert['created_at'] >= $threshold
        );
    }

    /**
     * Evaluates the SLA of a record.
     *
     * The response clock stops at acknowledged_at (or, if the record was never
     * acknowledged, at resolution / now). The resolution clock stops at
     * resolved_at for resolved records and runs until now otherwise.
     * The SLA is met only when both budgets are respected.
     *
     * @return array 'sla_met' always; for known rules also 'response_time', 'resolution_time',
     *               'elapsed_seconds' and 'elapsed_human'; 'message' for resolved or unknown rules.
     */
    public function getSLAStatus($ruleName)
    {
        $alert = $this->stateStore->get($ruleName);

        if (!$alert) {
            // Unknown rule: nothing is outstanding, so report the SLA as met.
            return ['sla_met' => true, 'message' => 'Alert resolved'];
        }

        $sla = $this->getSLAForAlert($alert);
        $isResolved = $alert['status'] === 'resolved';

        $createdAt = $alert['created_at'];
        $endedAt = $isResolved ? ($alert['resolved_at'] ?? time()) : time();
        $respondedAt = $alert['acknowledged_at'] ?? $endedAt;

        $elapsed = $endedAt - $createdAt;
        $responseSeconds = $respondedAt - $createdAt;

        $sla['elapsed_seconds'] = $elapsed;
        $sla['elapsed_human'] = $this->formatDuration($elapsed);
        $sla['sla_met'] = $responseSeconds <= $sla['response_time']
            && $elapsed <= $sla['resolution_time'];

        if ($isResolved) {
            $sla['message'] = 'Alert resolved';
        }

        return $sla;
    }

    /**
     * Picks the SLA budgets for the record's severity; unknown or missing severities use "info".
     */
    private function getSLAForAlert($alert)
    {
        $severity = $alert['alert']['severity'] ?? 'info';

        return self::SLA_BY_SEVERITY[$severity] ?? self::SLA_BY_SEVERITY['info'];
    }

    /**
     * Returns the actions to run when a record reaches the given escalation level
     * (levels above 3 have none).
     */
    private function getEscalationActions($level)
    {
        $actions = [
            1 => [
                ['type' => 'notify', 'target' => 'second_line'],
            ],
            2 => [
                ['type' => 'notify', 'target' => 'third_line'],
                ['type' => 'notify', 'target' => 'manager'],
            ],
            3 => [
                ['type' => 'auto_action'],
                ['type' => 'notify', 'target' => 'director'],
            ],
        ];

        return $actions[$level] ?? [];
    }

    /**
     * Dispatches one escalation action; unknown action types are ignored.
     */
    private function executeAction($action, $alert)
    {
        switch ($action['type']) {
            case 'notify':
                $this->executeNotifyAction($action, $alert);
                break;
            case 'auto_action':
                $this->executeAutoAction($alert);
                break;
        }
    }

    /**
     * Sends the escalation message to every contact configured for the action's target group.
     */
    private function executeNotifyAction($action, $alert)
    {
        // The notifier is optional; without this guard escalation crashed on a null notifier.
        if (!$this->notifier) {
            return;
        }

        $targets = $this->getEscalationTargets($action['target']);

        foreach ($targets as $target) {
            $this->notifier->send([
                'name' => '告警升级: ' . ($alert['alert']['name'] ?? $alert['rule_name']),
                'severity' => 'critical',
                'status' => 'escalation',
                'message' => '告警未得到及时处理,已升级至 ' . $action['target'],
                'alert' => $alert,
            ], $target);
        }
    }

    /**
     * Runs the shell commands registered for the record's rule.
     *
     * NOTE(review): $this->autoActions is initialised to an empty array and nothing in this
     * class populates it, so no command runs as written. Commands go to exec() unescaped;
     * they must only ever come from trusted configuration.
     */
    private function executeAutoAction($alert)
    {
        $autoActions = $this->autoActions[$alert['rule_name']] ?? [];

        foreach ($autoActions as $action) {
            exec($action['command']);
        }
    }

    /**
     * Returns the contact map (channel => address) for an escalation target group.
     */
    private function getEscalationTargets($level)
    {
        $targets = [
            'second_line' => ['email' => 'ops-secondary@example.com'],
            'third_line' => ['email' => 'ops-expert@example.com'],
            'manager' => ['sms' => '13800138000'],
            'director' => ['sms' => '13900139000'],
        ];

        return $targets[$level] ?? [];
    }

    /**
     * Forwards a newly created alert to the notifier, if one is configured.
     */
    private function notifyNewAlert($alert)
    {
        if (!$this->notifier) {
            return;
        }

        $this->notifier->send($alert);
    }

    /**
     * Sends the automatic-recovery notification, if a notifier is configured.
     */
    private function notifyResolved($alert)
    {
        if (!$this->notifier) {
            return;
        }

        $alert['status'] = 'resolved';
        $alert['message'] = '告警已自动恢复: ' . ($alert['message'] ?? '');

        $this->notifier->send($alert);
    }

    /**
     * Sends the generic escalation notification, if a notifier is configured.
     */
    private function notifyEscalation($alert)
    {
        if (!$this->notifier) {
            return;
        }

        $this->notifier->send([
            'name' => '告警升级: ' . ($alert['alert']['name'] ?? $alert['rule_name']),
            'severity' => 'critical',
            'status' => 'escalation',
            'message' => '告警已升级,级别: ' . $alert['escalation_level'],
        ]);
    }

    /**
     * Formats a duration in seconds as hours and minutes (minutes only when under one hour).
     */
    private function formatDuration($seconds)
    {
        $hours = floor($seconds / 3600);
        $minutes = floor(($seconds % 3600) / 60);

        if ($hours > 0) {
            return "{$hours}小时{$minutes}分钟";
        }

        return "{$minutes}分钟";
    }
}

/**
 * In-memory catalogue of troubleshooting runbooks, keyed by alert rule name.
 */
class RunbookResolver
{
    private $runbooks = [];

    public function __construct()
    {
        $this->runbooks = self::builtInRunbooks();
    }

    /**
     * Builds one catalogue entry with the keys in title / steps / contact order.
     */
    private static function entry($title, $contact, array $steps)
    {
        return [
            'title' => $title,
            'steps' => $steps,
            'contact' => $contact,
        ];
    }

    /**
     * Returns the built-in runbooks for the known RabbitMQ alert rules.
     */
    private static function builtInRunbooks()
    {
        return [
            'RabbitMQNodeDown' => self::entry('节点宕机处理', 'ops-secondary@example.com', [
                '1. 检查节点状态: rabbitmqctl status',
                '2. 检查节点日志: tail -f /var/log/rabbitmq/rabbit.log',
                '3. 检查系统资源: df -h, free -m',
                '4. 尝试重启节点: rabbitmqctl start_app',
                '5. 如果需要,执行故障转移',
            ]),
            'RabbitMQMemoryHigh' => self::entry('内存过高处理', 'ops@example.com', [
                '1. 检查当前内存使用: rabbitmqctl status | grep memory',
                '2. 查看连接和通道数',
                '3. 检查消息堆积情况',
                '4. 考虑增加内存限制',
                '5. 优化消费者处理速度',
            ]),
            'RabbitMQDiskSpaceLow' => self::entry('磁盘空间不足处理', 'ops@example.com', [
                '1. 检查磁盘使用: df -h',
                '2. 清理日志文件: rm -rf /var/log/rabbitmq/*.log.*',
                '3. 检查消息存储: du -sh /var/lib/rabbitmq/mnesia',
                '4. 清理旧消息',
                '5. 考虑扩展磁盘容量',
            ]),
            'RabbitMQQueueMessagesHigh' => self::entry('消息堆积处理', 'queue-team@example.com', [
                '1. 查看队列状态: rabbitmqctl list_queues',
                '2. 检查消费者状态',
                '3. 分析消费速率: rabbitmqctl status',
                '4. 增加消费者数量',
                '5. 检查消费者日志',
            ]),
            'RabbitMQClusterPartition' => self::entry('集群分区处理', 'ops-expert@example.com', [
                '1. 检查集群状态: rabbitmqctl cluster_status',
                '2. 检查网络连通性',
                '3. 确定分区策略: pause_minority 或 autoheal',
                '4. 执行分区处理',
                '5. 验证数据一致性',
            ]),
        ];
    }

    /**
     * Looks up the runbook for an alert rule.
     *
     * @return array|null The runbook (title, steps, contact) or null when the rule has none.
     */
    public function resolve($alertName)
    {
        if (isset($this->runbooks[$alertName])) {
            return $this->runbooks[$alertName];
        }

        return null;
    }

    /**
     * Returns the wiki URL of the rule's runbook, or the runbook index when the rule is unknown.
     */
    public function getRunbookUrl($alertName)
    {
        $indexUrl = 'https://wiki.example.com/rabbitmq/runbooks';

        return $this->resolve($alertName)
            ? $indexUrl . '/' . strtolower($alertName)
            : $indexUrl;
    }
}

告警处理工作流 API

php
<?php

/**
 * Thin JSON endpoint over AlertHandler: maps an action name plus a parameter
 * array to the matching handler call and echoes the result as JSON.
 */
class AlertWorkflowAPI
{
    private $handler;

    public function __construct(AlertHandler $handler)
    {
        $this->handler = $handler;
    }

    /**
     * Executes one API action and writes the JSON response to the output.
     *
     * Unknown actions produce {"error": "Unknown action"}. Any exception, including a
     * failure to encode the result as JSON, produces HTTP 500 with {"error": message}.
     *
     * @param string $action One of list, get, acknowledge, resolve, note, sla.
     * @param array  $params Action parameters (rule_name, user, note, resolution, status).
     */
    public function handleRequest($action, $params)
    {
        if (!headers_sent()) {
            header('Content-Type: application/json');
        }

        try {
            $result = match ($action) {
                'list' => $this->listAlerts($params),
                'get' => $this->getAlert($params),
                'acknowledge' => $this->acknowledgeAlert($params),
                'resolve' => $this->resolveAlert($params),
                'note' => $this->addNote($params),
                'sla' => $this->getSLA($params),
                default => ['error' => 'Unknown action'],
            };

            // Encode before echoing so an encoding failure becomes a 500 instead of an empty body.
            echo json_encode($result, JSON_THROW_ON_ERROR);
        } catch (Exception $e) {
            if (!headers_sent()) {
                http_response_code(500);
            }
            // Substitute invalid UTF-8 so the error response itself can always be encoded.
            echo json_encode(['error' => $e->getMessage()], JSON_INVALID_UTF8_SUBSTITUTE);
        }
    }

    /**
     * Lists active alerts, optionally restricted to one status.
     */
    private function listAlerts($params)
    {
        $status = $params['status'] ?? null;

        $alerts = $this->handler->getActiveAlerts();

        if ($status) {
            $alerts = array_filter($alerts, static fn ($a) => $a['status'] === $status);
        }

        return [
            'success' => true,
            'alerts' => array_values($alerts),
            'total' => count($alerts),
        ];
    }

    /**
     * Fetches one alert by rule name.
     *
     * Active alerts are checked first so that a long-running alert is still found after
     * it falls out of the 30-day history window; the history covers resolved alerts.
     */
    private function getAlert($params)
    {
        $ruleName = $params['rule_name'] ?? null;

        if (!$ruleName) {
            return ['error' => 'rule_name is required'];
        }

        $alert = $this->handler->getActiveAlerts()[$ruleName]
            ?? $this->handler->getAlertHistory(30)[$ruleName]
            ?? null;

        if (!$alert) {
            return ['error' => 'Alert not found'];
        }

        return ['success' => true, 'alert' => $alert];
    }

    /**
     * Acknowledges an alert; the user defaults to "system".
     */
    private function acknowledgeAlert($params)
    {
        $ruleName = $params['rule_name'] ?? null;
        $user = $params['user'] ?? 'system';
        $note = $params['note'] ?? '';

        if (!$ruleName) {
            return ['error' => 'rule_name is required'];
        }

        return $this->handler->acknowledge($ruleName, $user, $note);
    }

    /**
     * Resolves an alert; the user defaults to "system".
     */
    private function resolveAlert($params)
    {
        $ruleName = $params['rule_name'] ?? null;
        $user = $params['user'] ?? 'system';
        $resolution = $params['resolution'] ?? '';
        $note = $params['note'] ?? '';

        if (!$ruleName) {
            return ['error' => 'rule_name is required'];
        }

        return $this->handler->resolve($ruleName, $user, $resolution, $note);
    }

    /**
     * Adds a comment to an alert; both rule_name and note are mandatory.
     */
    private function addNote($params)
    {
        $ruleName = $params['rule_name'] ?? null;
        $user = $params['user'] ?? 'system';
        $note = $params['note'] ?? '';

        if (!$ruleName || !$note) {
            return ['error' => 'rule_name and note are required'];
        }

        return $this->handler->addNote($ruleName, $user, $note);
    }

    /**
     * Returns the SLA status of an alert.
     */
    private function getSLA($params)
    {
        $ruleName = $params['rule_name'] ?? null;

        if (!$ruleName) {
            return ['error' => 'rule_name is required'];
        }

        return $this->handler->getSLAStatus($ruleName);
    }
}

/**
 * Key/value store persisted as a single pretty-printed JSON file.
 *
 * Every read loads the whole file and every write rewrites it.
 */
class FileStateStore
{
    private $file;

    /**
     * @param string $file Path of the JSON state file; it is created on the first write.
     */
    public function __construct($file)
    {
        $this->file = $file;
    }

    /**
     * Returns the value stored under $key, or null when the key is absent.
     */
    public function get($key)
    {
        $data = $this->load();
        return $data[$key] ?? null;
    }

    /**
     * Stores $value under $key.
     *
     * @throws JsonException    When the state cannot be encoded; the file is left untouched.
     * @throws RuntimeException When the directory or file cannot be written.
     */
    public function set($key, $value)
    {
        $data = $this->load();
        $data[$key] = $value;
        $this->save($data);
    }

    /**
     * Deletes $key; removing an absent key is a no-op apart from rewriting the file.
     *
     * @throws JsonException    When the state cannot be encoded.
     * @throws RuntimeException When the directory or file cannot be written.
     */
    public function remove($key)
    {
        $data = $this->load();
        unset($data[$key]);
        $this->save($data);
    }

    /**
     * Returns the whole key => value map.
     */
    public function getAll()
    {
        return $this->load();
    }

    /**
     * Reads the state file; a missing, unreadable, empty or non-array document yields [].
     */
    private function load()
    {
        if (!is_file($this->file)) {
            return [];
        }

        $raw = file_get_contents($this->file);
        if ($raw === false || $raw === '') {
            return [];
        }

        $data = json_decode($raw, true);

        return is_array($data) ? $data : [];
    }

    /**
     * Writes the state file, creating its directory when needed.
     */
    private function save($data)
    {
        // Encode first: a failed encode must never reach the file, or it would truncate it.
        $json = json_encode($data, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR);

        $dir = dirname($this->file);
        // Re-check is_dir() after mkdir() in case another process created it concurrently.
        if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
            throw new RuntimeException("Unable to create state directory: {$dir}");
        }

        if (file_put_contents($this->file, $json, LOCK_EX) === false) {
            throw new RuntimeException("Unable to write state file: {$this->file}");
        }
    }
}

实际应用场景

场景一:自动化告警处理

php
<?php

/**
 * Runs an automatic remediation for alert rules that have one registered.
 *
 * NOTE(review): the remediation primitives at the bottom of the class
 * (clearOldMessages, closeIdleConnections, rotateLogs, clearMessageStore,
 * notifyQueueOwners, triggerAlert) have empty bodies, so results with
 * 'executed' => true currently describe the intended action only.
 */
class AutoAlertProcessor
{
    private $handler;
    private $rabbitmqClient;
    private $actions;

    /**
     * @param AlertHandler $handler        Alert handler (stored; not used by the methods below).
     * @param object       $rabbitmqClient Client exposing getOverview().
     */
    public function __construct(AlertHandler $handler, $rabbitmqClient)
    {
        $this->handler = $handler;
        $this->rabbitmqClient = $rabbitmqClient;
        $this->registerActions();
    }

    /**
     * Builds the rule name => remediation method map.
     */
    private function registerActions()
    {
        $this->actions = [
            'RabbitMQMemoryHigh' => [$this, 'handleMemoryHigh'],
            'RabbitMQDiskSpaceLow' => [$this, 'handleDiskSpaceLow'],
            'RabbitMQQueueMessagesHigh' => [$this, 'handleQueueBacklog'],
            'RabbitMQNoConsumer' => [$this, 'handleNoConsumer'],
        ];
    }

    /**
     * Runs the remediation registered for the alert's rule.
     *
     * @param array $alert Alert payload with 'rule_name' and, for queue rules, 'labels' => ['queue' => name].
     * @return array|false Description of the action taken, or false when nothing was done.
     */
    public function process($alert)
    {
        $action = $this->actions[$alert['rule_name']] ?? null;

        if (!$action) {
            return false;
        }

        return $action($alert);
    }

    /**
     * Triggers an emergency cleanup when node memory usage exceeds 90% of its limit.
     */
    private function handleMemoryHigh($alert)
    {
        $metrics = $this->rabbitmqClient->getOverview();

        $used = $metrics['node']['mem_used'] ?? null;
        $limit = $metrics['node']['mem_limit'] ?? null;

        // Without usable figures (missing keys or a zero limit) the ratio is undefined;
        // do nothing rather than divide by zero.
        if (!is_numeric($used) || !is_numeric($limit) || $limit <= 0) {
            return false;
        }

        $memoryPercent = ($used / $limit) * 100;

        if ($memoryPercent > 90) {
            $this->clearOldMessages();
            $this->closeIdleConnections();

            return [
                'action' => 'emergency_cleanup',
                'executed' => true,
                'details' => 'Cleared old messages and idle connections',
            ];
        }

        return false;
    }

    /**
     * Rotates logs and clears the message store.
     */
    private function handleDiskSpaceLow($alert)
    {
        $this->rotateLogs();
        $this->clearMessageStore();

        return [
            'action' => 'disk_cleanup',
            'executed' => true,
            'details' => 'Rotated logs and cleared message store',
        ];
    }

    /**
     * Notifies the owners of the backed-up queue named in the alert labels.
     */
    private function handleQueueBacklog($alert)
    {
        $queueName = $alert['labels']['queue'] ?? null;

        if ($queueName) {
            $this->notifyQueueOwners($queueName);

            return [
                'action' => 'notify_queue_owners',
                'executed' => true,
                'queue' => $queueName,
            ];
        }

        return false;
    }

    /**
     * Raises a follow-up alert for the consumer-less queue named in the alert labels.
     */
    private function handleNoConsumer($alert)
    {
        $queueName = $alert['labels']['queue'] ?? null;

        if ($queueName) {
            $this->triggerAlert('Queue has no consumer: ' . $queueName);

            return [
                'action' => 'alert_queue_team',
                'executed' => true,
                'queue' => $queueName,
            ];
        }

        return false;
    }

    /** Placeholder: no implementation yet. */
    private function clearOldMessages()
    {
    }

    /** Placeholder: no implementation yet. */
    private function closeIdleConnections()
    {
    }

    /** Placeholder: no implementation yet. */
    private function rotateLogs()
    {
    }

    /** Placeholder: no implementation yet. */
    private function clearMessageStore()
    {
    }

    /** Placeholder: no implementation yet. */
    private function notifyQueueOwners($queueName)
    {
    }

    /** Placeholder: no implementation yet. */
    private function triggerAlert($message)
    {
    }
}

场景二:告警统计分析

php
<?php

/**
 * Builds statistical reports from the alert history kept by AlertHandler.
 */
class AlertAnalytics
{
    private $handler;

    public function __construct(AlertHandler $handler)
    {
        $this->handler = $handler;
    }

    /**
     * Assembles the full report for the last $days days.
     *
     * @return array Sections: period, summary, by_severity, by_rule, sla_compliance,
     *               response_times, top_issues.
     */
    public function generateReport($days = 30)
    {
        $records = $this->handler->getAlertHistory($days);

        $period = [
            'start' => date('Y-m-d H:i:s', time() - ($days * 86400)),
            'end' => date('Y-m-d H:i:s'),
        ];

        return [
            'period' => $period,
            'summary' => $this->generateSummary($records),
            'by_severity' => $this->analyzeBySeverity($records),
            'by_rule' => $this->analyzeByRule($records),
            'sla_compliance' => $this->analyzeSLA($records),
            'response_times' => $this->analyzeResponseTimes($records),
            'top_issues' => $this->getTopIssues($records),
        ];
    }

    /**
     * Counts total, active and resolved alerts, and the share of resolved alerts
     * that had been acknowledged.
     */
    private function generateSummary($history)
    {
        $totalCount = count($history);
        $resolvedCount = 0;
        $acknowledgedCount = 0;

        foreach ($history as $record) {
            if ($record['status'] !== 'resolved') {
                continue;
            }

            $resolvedCount++;

            if ($record['acknowledged_at']) {
                $acknowledgedCount++;
            }
        }

        $acknowledgeRate = 0;
        if ($resolvedCount > 0) {
            $acknowledgeRate = round(($acknowledgedCount / $resolvedCount) * 100, 2);
        }

        return [
            'total_alerts' => $totalCount,
            'active_alerts' => $totalCount - $resolvedCount,
            'resolved_alerts' => $resolvedCount,
            'acknowledge_rate' => $acknowledgeRate,
        ];
    }

    /**
     * Counts alerts per severity; records without a severity are counted as "unknown".
     */
    private function analyzeBySeverity($history)
    {
        $counts = [];

        foreach ($history as $record) {
            $level = $record['alert']['severity'] ?? 'unknown';

            if (!isset($counts[$level])) {
                $counts[$level] = 0;
            }

            $counts[$level]++;
        }

        return $counts;
    }

    /**
     * Counts alerts per rule name, most frequent first.
     */
    private function analyzeByRule($history)
    {
        $counts = [];

        foreach ($history as $record) {
            $name = $record['rule_name'];

            if (!isset($counts[$name])) {
                $counts[$name] = 0;
            }

            $counts[$name]++;
        }

        arsort($counts);

        return $counts;
    }

    /**
     * Asks the handler for the SLA status of every resolved alert and tallies the outcome.
     * The compliance rate is 100 when there is nothing to evaluate.
     */
    private function analyzeSLA($history)
    {
        $tally = ['met' => 0, 'breached' => 0];

        foreach ($history as $record) {
            if ($record['status'] !== 'resolved') {
                continue;
            }

            $status = $this->handler->getSLAStatus($record['rule_name']);
            $tally[$status['sla_met'] ? 'met' : 'breached']++;
        }

        $evaluated = $tally['met'] + $tally['breached'];

        if ($evaluated > 0) {
            $complianceRate = round(($tally['met'] / $evaluated) * 100, 2);
        } else {
            $complianceRate = 100;
        }

        return [
            'met' => $tally['met'],
            'breached' => $tally['breached'],
            'compliance_rate' => $complianceRate,
        ];
    }

    /**
     * Computes average, minimum and maximum time (seconds) between creation and
     * acknowledgement; all zeros when no alert was acknowledged.
     */
    private function analyzeResponseTimes($history)
    {
        $delays = [];

        foreach ($history as $record) {
            if (!$record['acknowledged_at'] || !$record['created_at']) {
                continue;
            }

            $delays[] = $record['acknowledged_at'] - $record['created_at'];
        }

        if (count($delays) === 0) {
            return ['avg' => 0, 'min' => 0, 'max' => 0];
        }

        $average = round(array_sum($delays) / count($delays));

        return [
            'avg' => $average,
            'min' => min($delays),
            'max' => max($delays),
        ];
    }

    /**
     * Returns the $limit most frequent rules with their counts.
     */
    private function getTopIssues($history, $limit = 10)
    {
        $ranked = $this->analyzeByRule($history);

        return array_slice($ranked, 0, $limit, true);
    }
}

常见问题与解决方案

问题一:告警遗漏

现象:重要告警未被处理。

解决方案

php
/**
 * Checks that no active alert is left unhandled (still in status "new")
 * and sends a reminder for those that are.
 */
class AlertVerification
{
    private $reminderCallback;

    /**
     * @param callable|null $reminderCallback Receives the array of unhandled alert records;
     *                                        when null, reminders are written with error_log().
     */
    public function __construct(?callable $reminderCallback = null)
    {
        $this->reminderCallback = $reminderCallback;
    }

    /**
     * Counts active alerts and how many of them nobody has picked up yet.
     *
     * @param object $handler Object exposing getActiveAlerts().
     * @return array ['total_active' => int, 'unhandled' => int, 'handled' => int]
     */
    public function verifyAllAlertsHandled($handler)
    {
        $activeAlerts = $handler->getActiveAlerts();

        $unhandled = array_filter($activeAlerts, static function ($alert) {
            return $alert['status'] === 'new';
        });

        if (!empty($unhandled)) {
            $this->sendReminder($unhandled);
        }

        return [
            'total_active' => count($activeAlerts),
            'unhandled' => count($unhandled),
            'handled' => count($activeAlerts) - count($unhandled),
        ];
    }

    /**
     * Delivers the reminder for unhandled alerts. Subclasses may override this
     * to use another channel.
     *
     * @param array $unhandled Unhandled alert records.
     */
    protected function sendReminder($unhandled)
    {
        if ($this->reminderCallback !== null) {
            ($this->reminderCallback)($unhandled);
            return;
        }

        foreach ($unhandled as $key => $alert) {
            error_log('Unhandled alert awaiting acknowledgement: ' . ($alert['rule_name'] ?? $key));
        }
    }
}

问题二:重复告警

现象:同一问题产生多个告警。

解决方案

php
/**
 * Detects alerts that repeat an already recorded alert of the same rule
 * within a short time window.
 */
class AlertDeduplicator
{
    private $similarWindow;

    /**
     * @param int $similarWindow Window in seconds within which two alerts of the same rule
     *                           are considered duplicates (default 300).
     */
    public function __construct($similarWindow = 300)
    {
        $this->similarWindow = $similarWindow;
    }

    /**
     * Finds the first history record that the alert duplicates.
     *
     * @param array    $alert   Alert with 'rule_name'; 'created_at' is optional and defaults to
     *                          the current time, so freshly received payloads can be checked.
     * @param iterable $history Records with 'rule_name' and 'created_at'.
     * @return array|null The matching record, or null when the alert is not a duplicate.
     */
    public function deduplicate($alert, $history)
    {
        $createdAt = $alert['created_at'] ?? time();

        foreach ($history as $existing) {
            if (($existing['rule_name'] ?? null) !== $alert['rule_name']) {
                continue;
            }

            // A record without a timestamp cannot be placed in the window.
            if (!isset($existing['created_at'])) {
                continue;
            }

            if (abs($createdAt - $existing['created_at']) < $this->similarWindow) {
                return $existing;
            }
        }

        return null;
    }
}

问题三:SLA 跟踪困难

现象:难以追踪 SLA 执行情况。

解决方案

php
/**
 * Sorts active alerts into SLA buckets: breached, at risk, and on track.
 */
class SLATracker
{
    /**
     * Classifies every active alert by its SLA status.
     *
     * - breached: the handler reports the SLA as not met;
     * - at_risk:  the SLA is still met but the elapsed time has consumed at least
     *             $atRiskRatio of the current budget (response_time until the alert is
     *             acknowledged, resolution_time afterwards);
     * - on_track: everything else.
     *
     * @param object $handler     Object exposing getActiveAlerts() and getSLAStatus($ruleName).
     * @param float  $atRiskRatio Fraction of the budget after which an alert is at risk (default 0.8).
     * @return array ['at_risk' => array[], 'breached' => array[], 'on_track' => array[]]
     */
    public function trackSLA($handler, $atRiskRatio = 0.8)
    {
        $slaStatus = [
            'at_risk' => [],
            'breached' => [],
            'on_track' => [],
        ];

        foreach ($handler->getActiveAlerts() as $alert) {
            $sla = $handler->getSLAStatus($alert['rule_name']);

            if (!$sla['sla_met']) {
                $slaStatus['breached'][] = $alert;
                continue;
            }

            $elapsed = $sla['elapsed_seconds'] ?? 0;
            $budget = empty($alert['acknowledged_at'])
                ? ($sla['response_time'] ?? null)
                : ($sla['resolution_time'] ?? null);

            if ($budget !== null && $elapsed >= $budget * $atRiskRatio) {
                $slaStatus['at_risk'][] = $alert;
            } else {
                $slaStatus['on_track'][] = $alert;
            }
        }

        return $slaStatus;
    }
}

最佳实践

1. 告警处理原则

  • 第一时间响应:收到告警立即确认
  • 及时升级:无法处理时及时升级
  • 完整记录:所有操作都要记录
  • 总结复盘:事后分析改进

2. 处理流程建议

php
// Recommended handling stage per alert state: what to do, who owns it, and the
// time budget in seconds. Each row is [action, assignee, timeout]; array_map()
// keeps the state names as keys.
$flow = array_map(
    static fn (array $stage): array => array_combine(['action', 'assignee', 'timeout'], $stage),
    [
        'new' => ['立即确认', '一线值班', 900],
        'acknowledged' => ['分析处理', '二线运维', 7200],
        'escalated' => ['升级处理', '三线专家', 3600],
    ]
);

3. 持续改进

  • 定期回顾告警数据
  • 优化告警阈值
  • 完善操作手册
  • 自动化常见处理

相关链接