Appearance
RabbitMQ 告警处理流程
概述
告警处理流程定义了从收到告警到问题解决的完整步骤。良好的告警处理流程可以确保问题得到及时有效的处理,最小化对业务的影响。本文将详细介绍告警处理流程的设计、实现和最佳实践。
核心知识点
告警处理流程
接收告警 -> 确认告警 -> 分析问题 -> 执行处理 -> 验证结果 -> 关闭告警
处理角色
| 角色 | 职责 |
|---|---|
| 一线值班 | 接收告警、初步响应 |
| 二线运维 | 问题分析、方案制定 |
| 三线专家 | 复杂问题、深层分析 |
| 架构师 | 重大故障、架构调整 |
SLA 定义
| 严重程度 | 响应时间 | 解决时间 |
|---|---|---|
| Critical | 15 分钟 | 2 小时 |
| Warning | 1 小时 | 8 小时 |
| Info | 24 小时 | 无要求 |
配置示例
PHP 告警处理类
php
<?php
/**
 * Tracks alert lifecycle state per rule name and drives notifications and
 * escalation.
 *
 * One record is kept per rule name in the injected state store. A record moves
 * through the statuses "new" -> "acknowledged" -> "resolved"; the status
 * "firing" is also treated as an unacknowledged, active state.
 */
class AlertHandler
{
    /** @var object State store; get($key), set($key, $value) and getAll() are called on it. */
    private $stateStore;

    /** @var object|null Optional notifier; send($payload) and send($payload, $target) are called on it. */
    private $notifier;

    /** @var RunbookResolver */
    private $runbookResolver;

    /** @var array Shell commands per rule name, each entry shaped ['command' => string]. Empty by default. */
    private $autoActions;

    /**
     * @param object|null $stateStore Defaults to a FileStateStore at /tmp/alert_handlers.json.
     * @param object|null $notifier   When null, every notification is skipped.
     */
    public function __construct($stateStore = null, $notifier = null)
    {
        $this->stateStore = $stateStore ?? new FileStateStore('/tmp/alert_handlers.json');
        $this->notifier = $notifier;
        $this->runbookResolver = new RunbookResolver();
        $this->autoActions = [];
    }

    /**
     * Entry point for an incoming alert notification.
     *
     * - No record yet, or the previous record is resolved: a new record is created.
     * - Incoming status is "resolved", or the record is acknowledged: the record is updated.
     * - Otherwise the alert fired again without being acknowledged and is escalated.
     *
     * @param array $alert Must contain 'rule_name'; 'status', 'name', 'severity' and 'message' are read when present.
     * @return array The stored handler record.
     */
    public function handle(array $alert)
    {
        $existingAlert = $this->stateStore->get($alert['rule_name']);

        if (!$existingAlert || $existingAlert['status'] === 'resolved') {
            return $this->createNewAlert($alert);
        }

        // A recovery notification must close the record even if nobody acknowledged it.
        if (($alert['status'] ?? null) === 'resolved' || $existingAlert['status'] === 'acknowledged') {
            return $this->updateAlert($alert, $existingAlert);
        }

        // Records are created with status "new"; a repeat while still unacknowledged
        // escalates instead of overwriting the record and resetting created_at.
        return $this->escalateAlert($alert, $existingAlert);
    }

    /**
     * Stores a fresh record with status "new" and sends the initial notification.
     */
    private function createNewAlert($alert)
    {
        $handlerData = [
            'rule_name' => $alert['rule_name'],
            'alert' => $alert,
            'status' => 'new',
            'created_at' => time(),
            'updated_at' => time(),
            'acknowledged_by' => null,
            'acknowledged_at' => null,
            'resolved_by' => null,
            'resolved_at' => null,
            'notes' => [],
            'escalation_level' => 0,
        ];
        $this->stateStore->set($alert['rule_name'], $handlerData);
        $this->notifyNewAlert($alert);
        return $handlerData;
    }

    /**
     * Applies an incoming notification to an existing record: either marks it
     * resolved by "system" or merges the new payload into the stored alert.
     */
    private function updateAlert($newAlert, $existingAlert)
    {
        // 'status' is optional on incoming payloads, so it is read defensively.
        if (($newAlert['status'] ?? null) === 'resolved' && $existingAlert['status'] !== 'resolved') {
            $existingAlert['status'] = 'resolved';
            $existingAlert['resolved_at'] = time();
            $existingAlert['updated_at'] = time();
            $existingAlert['resolved_by'] = 'system';
            $existingAlert['alert']['status'] = 'resolved';
            $this->notifyResolved($existingAlert['alert']);
        } else {
            $existingAlert['updated_at'] = time();
            $existingAlert['alert'] = array_merge($existingAlert['alert'], $newAlert);
        }
        $this->stateStore->set($newAlert['rule_name'], $existingAlert);
        return $existingAlert;
    }

    /**
     * Raises the escalation level by one, runs the actions configured for the
     * new level, notifies, and persists the record.
     */
    private function escalateAlert($newAlert, $existingAlert)
    {
        $existingAlert['escalation_level']++;
        $existingAlert['updated_at'] = time();
        $existingAlert['last_escalated_at'] = time();
        $escalationActions = $this->getEscalationActions($existingAlert['escalation_level']);
        foreach ($escalationActions as $action) {
            $this->executeAction($action, $existingAlert);
        }
        $this->notifyEscalation($existingAlert);
        $this->stateStore->set($newAlert['rule_name'], $existingAlert);
        return $existingAlert;
    }

    /**
     * Marks the alert as acknowledged by $user, optionally recording a note.
     *
     * @return array ['success' => bool, 'alert' => array] or ['success' => false, 'message' => string].
     */
    public function acknowledge($ruleName, $user, $note = '')
    {
        $alert = $this->stateStore->get($ruleName);
        if (!$alert) {
            return ['success' => false, 'message' => 'Alert not found'];
        }
        $alert['status'] = 'acknowledged';
        $alert['acknowledged_by'] = $user;
        $alert['acknowledged_at'] = time();
        $alert['updated_at'] = time();
        if ($note) {
            $alert['notes'][] = [
                'type' => 'acknowledge',
                'user' => $user,
                'note' => $note,
                'timestamp' => time(),
            ];
        }
        $this->stateStore->set($ruleName, $alert);
        return ['success' => true, 'alert' => $alert];
    }

    /**
     * Marks the alert as resolved by $user. A note entry is recorded when either
     * $note or $resolution is non-empty ($note wins when both are given).
     *
     * @return array Same shape as acknowledge().
     */
    public function resolve($ruleName, $user, $resolution = '', $note = '')
    {
        $alert = $this->stateStore->get($ruleName);
        if (!$alert) {
            return ['success' => false, 'message' => 'Alert not found'];
        }
        $alert['status'] = 'resolved';
        $alert['resolved_by'] = $user;
        $alert['resolved_at'] = time();
        $alert['resolution'] = $resolution;
        $alert['updated_at'] = time();
        if ($note || $resolution) {
            $alert['notes'][] = [
                'type' => 'resolve',
                'user' => $user,
                'note' => $note ?: $resolution,
                'timestamp' => time(),
            ];
        }
        $this->stateStore->set($ruleName, $alert);
        return ['success' => true, 'alert' => $alert];
    }

    /**
     * Appends a free-form comment to the alert's notes.
     *
     * @return array Same shape as acknowledge().
     */
    public function addNote($ruleName, $user, $note)
    {
        $alert = $this->stateStore->get($ruleName);
        if (!$alert) {
            return ['success' => false, 'message' => 'Alert not found'];
        }
        $alert['notes'][] = [
            'type' => 'comment',
            'user' => $user,
            'note' => $note,
            'timestamp' => time(),
        ];
        $alert['updated_at'] = time();
        $this->stateStore->set($ruleName, $alert);
        return ['success' => true, 'alert' => $alert];
    }

    /**
     * Returns all records whose status is new, acknowledged or firing,
     * keyed as in the state store.
     */
    public function getActiveAlerts()
    {
        $allAlerts = $this->stateStore->getAll();
        return array_filter($allAlerts, function ($alert) {
            return in_array($alert['status'], ['new', 'acknowledged', 'firing'], true);
        });
    }

    /**
     * Returns all records created within the last $days days, keyed as in the state store.
     */
    public function getAlertHistory($days = 30)
    {
        $allAlerts = $this->stateStore->getAll();
        $threshold = time() - ($days * 86400);
        return array_filter($allAlerts, function ($alert) use ($threshold) {
            return $alert['created_at'] >= $threshold;
        });
    }

    /**
     * Evaluates the SLA of one alert.
     *
     * The response clock stops at acknowledgement (or at resolution when the
     * alert was never acknowledged); the resolution clock stops at resolution.
     * For alerts that are still open both clocks run until now. 'sla_met' is
     * true only when both the response and the resolution budget are respected.
     *
     * @return array Contains 'sla_met'. For known alerts also 'response_time',
     *               'resolution_time', 'elapsed_seconds', 'elapsed_human' and
     *               'response_seconds'; resolved alerts additionally carry 'message'.
     */
    public function getSLAStatus($ruleName)
    {
        $alert = $this->stateStore->get($ruleName);
        if (!$alert) {
            // Unknown alerts are reported as met, with the same payload as before.
            return ['sla_met' => true, 'message' => 'Alert resolved'];
        }

        $sla = $this->getSLAForAlert($alert);
        $now = time();
        $isResolved = $alert['status'] === 'resolved';
        $endedAt = $isResolved ? ($alert['resolved_at'] ?? $now) : $now;
        $respondedAt = $alert['acknowledged_at'] ?? $endedAt;

        $elapsed = $endedAt - $alert['created_at'];
        $responseSeconds = $respondedAt - $alert['created_at'];

        $sla['elapsed_seconds'] = $elapsed;
        $sla['elapsed_human'] = $this->formatDuration($elapsed);
        $sla['response_seconds'] = $responseSeconds;
        $sla['sla_met'] = $responseSeconds <= $sla['response_time']
            && $elapsed <= $sla['resolution_time'];
        if ($isResolved) {
            $sla['message'] = 'Alert resolved';
        }
        return $sla;
    }

    /**
     * Looks up the SLA budget (in seconds) for the alert's severity;
     * unknown or missing severities fall back to the "info" budget.
     */
    private function getSLAForAlert($alert)
    {
        $severity = $alert['alert']['severity'] ?? 'info';
        $slas = [
            'critical' => ['response_time' => 900, 'resolution_time' => 7200],
            'warning' => ['response_time' => 3600, 'resolution_time' => 28800],
            'info' => ['response_time' => 86400, 'resolution_time' => PHP_INT_MAX],
        ];
        return $slas[$severity] ?? $slas['info'];
    }

    /**
     * Returns the actions to run when an alert reaches the given escalation
     * level; levels without an entry yield no actions.
     */
    private function getEscalationActions($level)
    {
        $actions = [
            1 => [
                ['type' => 'notify', 'target' => 'second_line'],
            ],
            2 => [
                ['type' => 'notify', 'target' => 'third_line'],
                ['type' => 'notify', 'target' => 'manager'],
            ],
            3 => [
                ['type' => 'auto_action'],
                ['type' => 'notify', 'target' => 'director'],
            ],
        ];
        return $actions[$level] ?? [];
    }

    /**
     * Dispatches one escalation action; unknown action types are ignored.
     */
    private function executeAction($action, $alert)
    {
        switch ($action['type']) {
            case 'notify':
                $this->executeNotifyAction($action, $alert);
                break;
            case 'auto_action':
                $this->executeAutoAction($alert);
                break;
        }
    }

    /**
     * Sends an escalation notice to every contact configured for the action's target.
     */
    private function executeNotifyAction($action, $alert)
    {
        // The notifier is optional; without this guard escalation crashes on null.
        if (!$this->notifier) {
            return;
        }
        $targets = $this->getEscalationTargets($action['target']);
        foreach ($targets as $target) {
            $this->notifier->send([
                'name' => '告警升级: ' . ($alert['alert']['name'] ?? $alert['rule_name']),
                'severity' => 'critical',
                'status' => 'escalation',
                'message' => '告警未得到及时处理,已升级至 ' . $action['target'],
                'alert' => $alert,
            ], $target);
        }
    }

    /**
     * Runs the shell commands registered for the alert's rule name.
     *
     * NOTE(review): commands are passed to exec() verbatim. Nothing in this
     * class populates $autoActions; if that ever becomes configurable, the
     * commands must come from trusted configuration only.
     */
    private function executeAutoAction($alert)
    {
        $autoActions = $this->autoActions[$alert['rule_name']] ?? [];
        foreach ($autoActions as $action) {
            exec($action['command']);
        }
    }

    /**
     * Maps an escalation target name (e.g. "second_line") to its contacts,
     * keyed by channel. Unknown names yield an empty list.
     */
    private function getEscalationTargets($level)
    {
        $targets = [
            'second_line' => ['email' => 'ops-secondary@example.com'],
            'third_line' => ['email' => 'ops-expert@example.com'],
            'manager' => ['sms' => '13800138000'],
            'director' => ['sms' => '13900139000'],
        ];
        return $targets[$level] ?? [];
    }

    /**
     * Forwards a newly created alert to the notifier, if one is configured.
     */
    private function notifyNewAlert($alert)
    {
        if (!$this->notifier) {
            return;
        }
        $this->notifier->send($alert);
    }

    /**
     * Sends a recovery notice for the alert, if a notifier is configured.
     */
    private function notifyResolved($alert)
    {
        if (!$this->notifier) {
            return;
        }
        $alert['status'] = 'resolved';
        $alert['message'] = '告警已自动恢复: ' . ($alert['message'] ?? '');
        $this->notifier->send($alert);
    }

    /**
     * Sends a general escalation notice carrying the new escalation level.
     */
    private function notifyEscalation($alert)
    {
        if (!$this->notifier) {
            return;
        }
        $this->notifier->send([
            'name' => '告警升级: ' . ($alert['alert']['name'] ?? $alert['rule_name']),
            'severity' => 'critical',
            'status' => 'escalation',
            'message' => '告警已升级,级别: ' . $alert['escalation_level'],
        ]);
    }

    /**
     * Formats a duration in seconds as hours and minutes; seconds are dropped.
     */
    private function formatDuration($seconds)
    {
        $hours = floor($seconds / 3600);
        $minutes = floor(($seconds % 3600) / 60);
        if ($hours > 0) {
            return "{$hours}小时{$minutes}分钟";
        }
        return "{$minutes}分钟";
    }
}
class RunbookResolver
{
private $runbooks = [];
/**
 * Populates the in-memory runbook table as soon as the resolver is created.
 */
public function __construct()
{
$this->loadRunbooks();
}
/**
 * Fills $this->runbooks with a hard-coded table keyed by alert rule name.
 * Each entry carries a 'title', an ordered list of 'steps' and a 'contact' address.
 */
private function loadRunbooks()
{
$this->runbooks = [
'RabbitMQNodeDown' => [
'title' => '节点宕机处理',
'steps' => [
'1. 检查节点状态: rabbitmqctl status',
'2. 检查节点日志: tail -f /var/log/rabbitmq/rabbit.log',
'3. 检查系统资源: df -h, free -m',
'4. 尝试重启节点: rabbitmqctl start_app',
'5. 如果需要,执行故障转移',
],
'contact' => 'ops-secondary@example.com',
],
'RabbitMQMemoryHigh' => [
'title' => '内存过高处理',
'steps' => [
'1. 检查当前内存使用: rabbitmqctl status | grep memory',
'2. 查看连接和通道数',
'3. 检查消息堆积情况',
'4. 考虑增加内存限制',
'5. 优化消费者处理速度',
],
'contact' => 'ops@example.com',
],
'RabbitMQDiskSpaceLow' => [
'title' => '磁盘空间不足处理',
'steps' => [
'1. 检查磁盘使用: df -h',
'2. 清理日志文件: rm -rf /var/log/rabbitmq/*.log.*',
'3. 检查消息存储: du -sh /var/lib/rabbitmq/mnesia',
'4. 清理旧消息',
'5. 考虑扩展磁盘容量',
],
'contact' => 'ops@example.com',
],
'RabbitMQQueueMessagesHigh' => [
'title' => '消息堆积处理',
'steps' => [
'1. 查看队列状态: rabbitmqctl list_queues',
'2. 检查消费者状态',
'3. 分析消费速率: rabbitmqctl status',
'4. 增加消费者数量',
'5. 检查消费者日志',
],
'contact' => 'queue-team@example.com',
],
'RabbitMQClusterPartition' => [
'title' => '集群分区处理',
'steps' => [
'1. 检查集群状态: rabbitmqctl cluster_status',
'2. 检查网络连通性',
'3. 确定分区策略: pause_minority 或 autoheal',
'4. 执行分区处理',
'5. 验证数据一致性',
],
'contact' => 'ops-expert@example.com',
],
];
}
/**
 * Returns the runbook entry registered under $alertName, or null when there is none.
 */
public function resolve($alertName)
{
return $this->runbooks[$alertName] ?? null;
}
/**
 * Builds the wiki URL for an alert's runbook.
 *
 * Alerts with a registered runbook get a page URL made of the lower-cased
 * alert name; everything else points at the runbook index.
 */
public function getRunbookUrl($alertName)
{
    $indexUrl = 'https://wiki.example.com/rabbitmq/runbooks';

    return $this->resolve($alertName)
        ? $indexUrl . '/' . strtolower($alertName)
        : $indexUrl;
}
}
告警处理工作流 API
php
<?php
/**
 * JSON front end for AlertHandler: maps an action name plus a parameter array
 * to a handler call and echoes the result as JSON.
 */
class AlertWorkflowAPI
{
    private $handler;

    public function __construct(AlertHandler $handler)
    {
        $this->handler = $handler;
    }

    /**
     * Runs the requested action and writes the JSON response.
     *
     * Unknown actions produce an {"error": "Unknown action"} body. An Exception
     * raised while handling sets HTTP status 500 and echoes its message.
     *
     * @param mixed $action Action name: list, get, acknowledge, resolve, note or sla.
     * @param mixed $params Request parameters, read by key.
     */
    public function handleRequest($action, $params)
    {
        header('Content-Type: application/json');

        // Action name => private method that serves it.
        $routes = [
            'list' => 'listAlerts',
            'get' => 'getAlert',
            'acknowledge' => 'acknowledgeAlert',
            'resolve' => 'resolveAlert',
            'note' => 'addNote',
            'sla' => 'getSLA',
        ];

        try {
            // Only exact string matches are routed; anything else is unknown.
            if (is_string($action) && isset($routes[$action])) {
                $response = $this->{$routes[$action]}($params);
            } else {
                $response = ['error' => 'Unknown action'];
            }
            echo json_encode($response);
        } catch (Exception $failure) {
            http_response_code(500);
            echo json_encode(['error' => $failure->getMessage()]);
        }
    }

    /**
     * Lists active alerts, optionally narrowed to one status.
     */
    private function listAlerts($params)
    {
        $wantedStatus = $params['status'] ?? null;
        $matching = $this->handler->getActiveAlerts();

        if ($wantedStatus) {
            $narrowed = [];
            foreach ($matching as $key => $entry) {
                if ($entry['status'] === $wantedStatus) {
                    $narrowed[$key] = $entry;
                }
            }
            $matching = $narrowed;
        }

        return [
            'success' => true,
            'alerts' => array_values($matching),
            'total' => count($matching),
        ];
    }

    /**
     * Fetches one alert from the last 30 days of history.
     */
    private function getAlert($params)
    {
        $name = $params['rule_name'] ?? null;
        if (!$name) {
            return ['error' => 'rule_name is required'];
        }

        $recent = $this->handler->getAlertHistory(30);
        $found = $recent[$name] ?? null;

        return $found
            ? ['success' => true, 'alert' => $found]
            : ['error' => 'Alert not found'];
    }

    /**
     * Acknowledges an alert; the user defaults to "system".
     */
    private function acknowledgeAlert($params)
    {
        $name = $params['rule_name'] ?? null;
        $actor = $params['user'] ?? 'system';
        $comment = $params['note'] ?? '';

        if ($name) {
            return $this->handler->acknowledge($name, $actor, $comment);
        }
        return ['error' => 'rule_name is required'];
    }

    /**
     * Resolves an alert; the user defaults to "system".
     */
    private function resolveAlert($params)
    {
        $name = $params['rule_name'] ?? null;
        $actor = $params['user'] ?? 'system';
        $outcome = $params['resolution'] ?? '';
        $comment = $params['note'] ?? '';

        if ($name) {
            return $this->handler->resolve($name, $actor, $outcome, $comment);
        }
        return ['error' => 'rule_name is required'];
    }

    /**
     * Adds a comment to an alert; both rule_name and note must be non-empty.
     */
    private function addNote($params)
    {
        $name = $params['rule_name'] ?? null;
        $actor = $params['user'] ?? 'system';
        $comment = $params['note'] ?? '';

        if ($name && $comment) {
            return $this->handler->addNote($name, $actor, $comment);
        }
        return ['error' => 'rule_name and note are required'];
    }

    /**
     * Returns the SLA status of one alert.
     */
    private function getSLA($params)
    {
        $name = $params['rule_name'] ?? null;

        return $name
            ? $this->handler->getSLAStatus($name)
            : ['error' => 'rule_name is required'];
    }
}
class FileStateStore
{
private $file;
/**
 * @param string $file Path of the JSON file that holds the whole state map.
 */
public function __construct($file)
{
$this->file = $file;
}
/**
 * Reads the state file and returns the value stored under $key,
 * or null when the key is absent.
 */
public function get($key)
{
    $state = $this->load();

    return $state[$key] ?? null;
}
/**
 * Stores $value under $key: the whole state is re-read from the file,
 * changed in memory and written back.
 */
public function set($key, $value)
{
    $state = $this->load();
    $state[$key] = $value;

    $this->save($state);
}
/**
 * Deletes $key from the stored state and writes the state back;
 * removing an absent key still rewrites the file.
 */
public function remove($key)
{
    $state = $this->load();
    unset($state[$key]);

    $this->save($state);
}
/**
 * Returns the complete state map as currently stored in the file.
 */
public function getAll()
{
return $this->load();
}
/**
 * Reads and decodes the state file.
 *
 * A missing or unreadable file, invalid JSON, or a JSON document that is not
 * an object/array all yield an empty state, so callers can always treat the
 * result as an array.
 *
 * @return array
 */
private function load()
{
    if (!is_file($this->file)) {
        return [];
    }

    $raw = file_get_contents($this->file);
    if ($raw === false) {
        return [];
    }

    $decoded = json_decode($raw, true);

    // A scalar document (e.g. `"abc"` or `5`) must not leak out as state.
    return is_array($decoded) ? $decoded : [];
}
/**
 * Serialises the whole state map to the state file.
 *
 * The payload is encoded before anything touches the disk, so an encoding
 * failure can no longer truncate the existing file. The write takes an
 * exclusive lock so concurrent writers do not interleave.
 *
 * @param array $data Complete state map.
 * @throws RuntimeException When encoding, directory creation or the write fails.
 */
private function save($data)
{
    $payload = json_encode($data, JSON_PRETTY_PRINT);
    if ($payload === false) {
        throw new RuntimeException('Failed to encode alert state: ' . json_last_error_msg());
    }

    $dir = dirname($this->file);
    // Re-check is_dir() after mkdir() in case another process created it first.
    if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
        throw new RuntimeException("Failed to create state directory: {$dir}");
    }

    if (file_put_contents($this->file, $payload, LOCK_EX) === false) {
        throw new RuntimeException("Failed to write state file: {$this->file}");
    }
}
}
实际应用场景
场景一:自动化告警处理
php
<?php
class AutoAlertProcessor
{
private $handler;
private $rabbitmqClient;
private $actions;
/**
 * Stores the collaborators and builds the rule-name => handler table.
 *
 * @param AlertHandler $handler
 * @param object $rabbitmqClient Client used to query the broker; getOverview() is called on it.
 */
public function __construct(AlertHandler $handler, $rabbitmqClient)
{
$this->handler = $handler;
$this->rabbitmqClient = $rabbitmqClient;
$this->registerActions();
}
/**
 * Builds the table that maps an alert rule name to the method handling it.
 */
private function registerActions()
{
    $methodByRule = [
        'RabbitMQMemoryHigh' => 'handleMemoryHigh',
        'RabbitMQDiskSpaceLow' => 'handleDiskSpaceLow',
        'RabbitMQQueueMessagesHigh' => 'handleQueueBacklog',
        'RabbitMQNoConsumer' => 'handleNoConsumer',
    ];

    $this->actions = [];
    foreach ($methodByRule as $ruleName => $method) {
        $this->actions[$ruleName] = [$this, $method];
    }
}
/**
 * Runs the automatic handler registered for the alert's rule name.
 *
 * @param array $alert Must contain 'rule_name'.
 * @return mixed false when no handler is registered, otherwise whatever the handler returns.
 */
public function process($alert)
{
    $callback = $this->actions[$alert['rule_name']] ?? null;

    return $callback ? call_user_func($callback, $alert) : false;
}
/**
 * Performs an emergency cleanup when broker memory usage exceeds 90 % of its limit.
 *
 * Usage is read from getOverview() under ['node']['mem_used'] and
 * ['node']['mem_limit']. When either figure is missing, or the limit is not
 * positive, no percentage can be computed and nothing is done.
 *
 * @param array $alert The triggering alert (not inspected).
 * @return array|false Action summary when cleanup ran, false otherwise.
 */
private function handleMemoryHigh($alert)
{
    $metrics = $this->rabbitmqClient->getOverview();
    $used = $metrics['node']['mem_used'] ?? null;
    $limit = $metrics['node']['mem_limit'] ?? null;

    // Guard against missing figures and division by zero.
    if (!is_numeric($used) || !is_numeric($limit) || $limit <= 0) {
        return false;
    }

    $memoryPercent = ($used / $limit) * 100;
    if ($memoryPercent <= 90) {
        return false;
    }

    $this->clearOldMessages();
    $this->closeIdleConnections();

    return [
        'action' => 'emergency_cleanup',
        'executed' => true,
        'details' => 'Cleared old messages and idle connections',
    ];
}
/**
 * Frees disk space by rotating logs and then clearing the message store.
 *
 * @param array $alert The triggering alert (not inspected).
 * @return array Summary of the cleanup that was run.
 */
private function handleDiskSpaceLow($alert)
{
    $this->rotateLogs();
    $this->clearMessageStore();

    $summary = ['action' => 'disk_cleanup'];
    $summary['executed'] = true;
    $summary['details'] = 'Rotated logs and cleared message store';

    return $summary;
}
/**
 * Notifies the owners of the queue named in the alert's 'queue' label.
 *
 * @param array $alert Alert whose ['labels']['queue'] names the affected queue.
 * @return array|false Action summary, or false when no queue label is present.
 */
private function handleQueueBacklog($alert)
{
    $queue = $alert['labels']['queue'] ?? null;
    if (!$queue) {
        return false;
    }

    $this->notifyQueueOwners($queue);

    return [
        'action' => 'notify_queue_owners',
        'executed' => true,
        'queue' => $queue,
    ];
}
/**
 * Raises a follow-up alert for a queue that has no consumer.
 *
 * @param array $alert Alert whose ['labels']['queue'] names the affected queue.
 * @return array|false Action summary, or false when no queue label is present.
 */
private function handleNoConsumer($alert)
{
    $queue = $alert['labels']['queue'] ?? null;
    if (!$queue) {
        return false;
    }

    $this->triggerAlert('Queue has no consumer: ' . $queue);

    return [
        'action' => 'alert_queue_team',
        'executed' => true,
        'queue' => $queue,
    ];
}
// Placeholder: the body is empty, so calling this currently does nothing.
private function clearOldMessages()
{
}
// Placeholder: the body is empty, so calling this currently does nothing.
private function closeIdleConnections()
{
}
// Placeholder: the body is empty, so calling this currently does nothing.
private function rotateLogs()
{
}
// Placeholder: the body is empty, so calling this currently does nothing.
private function clearMessageStore()
{
}
// Placeholder: the body is empty; $queueName is accepted but unused.
private function notifyQueueOwners($queueName)
{
}
// Placeholder: the body is empty; $message is accepted but unused.
private function triggerAlert($message)
{
}
}
场景二:告警统计分析
php
<?php
class AlertAnalytics
{
private $handler;
/**
 * @param AlertHandler $handler Source of the alert history and SLA status used by the reports.
 */
public function __construct(AlertHandler $handler)
{
$this->handler = $handler;
}
/**
 * Builds a statistics report over the alerts created in the last $days days.
 *
 * @param int $days Size of the reporting window in days.
 * @return array Report with the keys period, summary, by_severity, by_rule,
 *               sla_compliance, response_times and top_issues.
 */
public function generateReport($days = 30)
{
    $alerts = $this->handler->getAlertHistory($days);

    $period = [
        'start' => date('Y-m-d H:i:s', time() - ($days * 86400)),
        'end' => date('Y-m-d H:i:s'),
    ];

    return [
        'period' => $period,
        'summary' => $this->generateSummary($alerts),
        'by_severity' => $this->analyzeBySeverity($alerts),
        'by_rule' => $this->analyzeByRule($alerts),
        'sla_compliance' => $this->analyzeSLA($alerts),
        'response_times' => $this->analyzeResponseTimes($alerts),
        'top_issues' => $this->getTopIssues($alerts),
    ];
}
/**
 * Counts total, active and resolved alerts.
 *
 * 'acknowledge_rate' is the percentage of resolved alerts that carry an
 * acknowledgement timestamp; it is 0 when nothing has been resolved.
 */
private function generateSummary($history)
{
    $totalCount = count($history);
    $resolvedCount = 0;
    $acknowledgedCount = 0;

    foreach ($history as $entry) {
        if ($entry['status'] !== 'resolved') {
            continue;
        }
        $resolvedCount++;
        if ($entry['acknowledged_at']) {
            $acknowledgedCount++;
        }
    }

    if ($resolvedCount > 0) {
        $rate = round(($acknowledgedCount / $resolvedCount) * 100, 2);
    } else {
        $rate = 0;
    }

    return [
        'total_alerts' => $totalCount,
        'active_alerts' => $totalCount - $resolvedCount,
        'resolved_alerts' => $resolvedCount,
        'acknowledge_rate' => $rate,
    ];
}
/**
 * Counts alerts per severity; alerts without a severity are counted as "unknown".
 */
private function analyzeBySeverity($history)
{
    $counts = [];

    foreach ($history as $entry) {
        $level = $entry['alert']['severity'] ?? 'unknown';
        if (!isset($counts[$level])) {
            $counts[$level] = 0;
        }
        $counts[$level]++;
    }

    return $counts;
}
/**
 * Counts alerts per rule name, ordered from most to least frequent.
 */
private function analyzeByRule($history)
{
    $counts = [];

    foreach ($history as $entry) {
        $name = $entry['rule_name'];
        $counts[$name] = isset($counts[$name]) ? $counts[$name] + 1 : 1;
    }

    arsort($counts);

    return $counts;
}
/**
 * Tallies, over resolved alerts only, how many the handler reports as having
 * met their SLA, and derives a compliance percentage (100 when none were evaluated).
 */
private function analyzeSLA($history)
{
    $met = 0;
    $breached = 0;

    foreach ($history as $entry) {
        if ($entry['status'] !== 'resolved') {
            continue;
        }
        $status = $this->handler->getSLAStatus($entry['rule_name']);
        if ($status['sla_met']) {
            $met++;
        } else {
            $breached++;
        }
    }

    $evaluated = $met + $breached;
    $rate = $evaluated > 0 ? round(($met / $evaluated) * 100, 2) : 100;

    return [
        'met' => $met,
        'breached' => $breached,
        'compliance_rate' => $rate,
    ];
}
/**
 * Computes average, minimum and maximum time-to-acknowledge in seconds over
 * alerts that have both a creation and an acknowledgement timestamp.
 * All three figures are 0 when no alert qualifies.
 */
private function analyzeResponseTimes($history)
{
    $durations = [];

    foreach ($history as $entry) {
        if (!$entry['acknowledged_at'] || !$entry['created_at']) {
            continue;
        }
        $durations[] = $entry['acknowledged_at'] - $entry['created_at'];
    }

    if ($durations === []) {
        return ['avg' => 0, 'min' => 0, 'max' => 0];
    }

    $average = round(array_sum($durations) / count($durations));

    return [
        'avg' => $average,
        'min' => min($durations),
        'max' => max($durations),
    ];
}
/**
 * Returns the $limit most frequent rule names with their alert counts,
 * keeping the rule names as keys.
 */
private function getTopIssues($history, $limit = 10)
{
    return array_slice($this->analyzeByRule($history), 0, $limit, true);
}
}
常见问题与解决方案
问题一:告警遗漏
现象:重要告警未被处理。
解决方案:
php
class AlertVerification
{
/**
 * Checks whether any active alert is still in status "new" and, if so,
 * sends a reminder for those alerts.
 *
 * @param object $handler Object exposing getActiveAlerts().
 * @return array Counts under the keys total_active, unhandled and handled.
 */
public function verifyAllAlertsHandled($handler)
{
    $activeAlerts = $handler->getActiveAlerts();
    $unhandled = array_filter($activeAlerts, function ($alert) {
        return $alert['status'] === 'new';
    });

    if (!empty($unhandled)) {
        $this->sendReminder($unhandled);
    }

    return [
        'total_active' => count($activeAlerts),
        'unhandled' => count($unhandled),
        'handled' => count($activeAlerts) - count($unhandled),
    ];
}

/**
 * Default reminder: writes one line to the PHP error log naming the alerts
 * that nobody has acknowledged yet. Override to deliver reminders elsewhere.
 *
 * @param array $unhandled Alert records still in status "new".
 */
protected function sendReminder(array $unhandled)
{
    $names = array_map(function ($alert) {
        return $alert['rule_name'] ?? 'unknown';
    }, $unhandled);

    error_log(sprintf(
        '[AlertVerification] %d alert(s) still unacknowledged: %s',
        count($unhandled),
        implode(', ', $names)
    ));
}
}
问题二:重复告警
现象:同一问题产生多个告警。
解决方案:
php
class AlertDeduplicator
{
private $similarWindow = 300;
/**
 * Looks for an earlier alert of the same rule created within the similarity
 * window of the given alert.
 *
 * @param array $alert   Alert with 'rule_name' and 'created_at'.
 * @param array $history Earlier alerts with the same two fields.
 * @return array|null The first matching earlier alert, or null when there is none.
 */
public function deduplicate($alert, $history)
{
    foreach ($history as $candidate) {
        if ($candidate['rule_name'] !== $alert['rule_name']) {
            continue;
        }

        $gap = abs($alert['created_at'] - $candidate['created_at']);
        if ($gap < $this->similarWindow) {
            return $candidate;
        }
    }

    return null;
}
}
问题三:SLA 跟踪困难
现象:难以追踪 SLA 执行情况。
解决方案:
php
class SLATracker
{
/**
 * Sorts the active alerts into SLA buckets.
 *
 * - breached: the handler reports the SLA as not met.
 * - at_risk:  the SLA is still met, the alert has not been acknowledged, and
 *             at least $riskRatio of the response budget has been used up.
 * - on_track: everything else.
 *
 * @param object $handler   Object exposing getActiveAlerts() and getSLAStatus($ruleName).
 * @param float  $riskRatio Share of the response budget after which an
 *                          unacknowledged alert counts as at risk.
 * @return array Alerts grouped under the keys at_risk, breached and on_track.
 */
public function trackSLA($handler, $riskRatio = 0.8)
{
    $slaStatus = [
        'at_risk' => [],
        'breached' => [],
        'on_track' => [],
    ];

    foreach ($handler->getActiveAlerts() as $alert) {
        $sla = $handler->getSLAStatus($alert['rule_name']);

        if (!$sla['sla_met']) {
            $slaStatus['breached'][] = $alert;
            continue;
        }

        // Status payloads without timing fields can never be classed as at risk.
        $elapsed = $sla['elapsed_seconds'] ?? 0;
        $budget = $sla['response_time'] ?? PHP_INT_MAX;
        $awaitingResponse = empty($alert['acknowledged_at']);

        if ($awaitingResponse && $elapsed >= $budget * $riskRatio) {
            $slaStatus['at_risk'][] = $alert;
        } else {
            $slaStatus['on_track'][] = $alert;
        }
    }

    return $slaStatus;
}
}
最佳实践
1. 告警处理原则
- 第一时间响应:收到告警立即确认
- 及时升级:无法处理时及时升级
- 完整记录:所有操作都要记录
- 总结复盘:事后分析改进
2. 处理流程建议
php
$flow = [
'new' => [
'action' => '立即确认',
'assignee' => '一线值班',
'timeout' => 900,
],
'acknowledged' => [
'action' => '分析处理',
'assignee' => '二线运维',
'timeout' => 7200,
],
'escalated' => [
'action' => '升级处理',
'assignee' => '三线专家',
'timeout' => 3600,
],
];
3. 持续改进
- 定期回顾告警数据
- 优化告警阈值
- 完善操作手册
- 自动化常见处理
