10-Jenkins_备份恢复与故障排查
·
Jenkins备份恢复与故障排查:构建可靠的CI/CD基础设施
1. 备份策略设计
1.1 备份架构
1.2 备份范围规划
| 数据类型 | 重要性 | 备份频率 | 保留周期 | 恢复优先级 |
|---|---|---|---|---|
| config.xml | 高 | 每次变更 | 永久 | P0 |
| credentials.xml | 高 | 每次变更 | 90天 | P0 |
| jobs/ | 高 | 每日增量 | 30天 | P1 |
| plugins/ | 中 | 每周 | 90天 | P2 |
| builds/ | 中 | 每日增量 | 30天 | P3 |
| users/ | 中 | 每日 | 90天 | P2 |
| workspace/ | 低 | 不备份 | - | P4 |
1.3 备份策略矩阵
# Backup strategy configuration.
# Each entry describes one backup tier: how often it runs, how long its
# archives are kept, whether they are encrypted, and which paths (relative to
# JENKINS_HOME) it covers.
backup_strategy:
  # Critical configuration backup
  critical_config:
    frequency: "on_change"   # back up whenever the files change
    retention: "permanent"   # never expire
    encryption: true
    components:
      - "config.xml"
      - "credentials.xml"
      - "jenkins.model.JenkinsLocationConfiguration.xml"
      - "hudson.model.UpdateCenter.xml"
  # Job definition backup
  job_definitions:
    frequency: "hourly"
    retention: "30_days"
    encryption: true
    components:
      - "jobs/*/config.xml"
      - "jobs/*/nextBuildNumber"
  # Build history backup
  build_history:
    frequency: "daily"
    retention: "14_days"
    encryption: false
    components:
      - "jobs/*/builds/*/build.xml"
      - "jobs/*/builds/*/log"
      - "jobs/*/builds/*/changelog.xml"
  # Plugin backup
  plugins:
    frequency: "weekly"
    retention: "90_days"
    encryption: false
    components:
      - "plugins/*.jpi"
      - "plugins/*.jpi.pinned"
      - "plugins/*.jpi.disabled"
  # Full backup
  full_backup:
    frequency: "weekly"
    retention: "90_days"
    encryption: true
    compression: true
    components:
      - "entire JENKINS_HOME"
2. 备份方案实施
2.1 使用ThinBackup插件
# ThinBackup plugin configuration, expressed as Configuration-as-Code (JCasC).
# NOTE(review): the root key and the option names below are reproduced from the
# article as-is; verify them against the JCasC schema exported by the installed
# ThinBackup plugin version before applying.
jenkins:
  thinBackup:
    # Backup directory
    backupPath: "/backup/jenkins"
    # Full backup schedule
    fullBackupSchedule: "H 2 * * 0"               # Sundays, during the 02:00 hour
    # Differential backup schedule
    differentialBackupSchedule: "H 2 * * 1-6"     # Monday to Saturday, during the 02:00 hour
    # Retention policy
    backupMaxDaysInStorageAge: 90                  # keep backups for 90 days
    backupMaxDaysInStorageAgeFull: 180             # keep full backups for 180 days
    nrMaxStoredFull: 10                            # keep at most 10 full backups
    # Backup contents
    backupBuildResults: true
    backupBuildArchive: true
    backupConfigFiles: true
    backupGlobalConfiguration: true
    backupUserContents: false
    backupNextBuildNumber: true
    # Other options
    quietMode: true                                # enter quiet mode while backing up
    moveOldBackupsToZip: true                      # compress old backups
    cleanupDisabled: false
2.2 自定义备份脚本
#!/bin/bash
# Jenkins backup script
# File: /opt/jenkins/scripts/backup.sh
# Abort on the first command that exits non-zero and is not otherwise handled.
set -e
# ==================== Configuration ====================
# JENKINS_HOME and BACKUP_DIR may be overridden from the environment; the value
# after ":-" is the default used when the variable is unset or empty.
JENKINS_HOME="${JENKINS_HOME:-/var/lib/jenkins}"
BACKUP_DIR="${BACKUP_DIR:-/backup/jenkins}"
# Timestamp captured once so that every artifact of this run shares one name.
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_NAME="jenkins_backup_${DATE}"
# Staging directory that the backup_* functions copy files into.
BACKUP_PATH="${BACKUP_DIR}/${BACKUP_NAME}"
# Age threshold in days used by cleanup_old_backups.
RETENTION_DAYS=30
# Destination bucket used by upload_to_remote and cleanup_old_backups.
S3_BUCKET="s3://jenkins-backups"
# Logging helper: prints the message in $1 to stdout, prefixed with a
# "[YYYY-MM-DD HH:MM:SS]" timestamp.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "${stamp}" "$1"
}
# ==================== Prepare backup ====================
# Creates the staging directory (BACKUP_PATH) that the other steps copy into.
prepare_backup() {
  log "Preparing backup..."
  local staging_dir="${BACKUP_PATH}"
  # Create the staging directory, including any missing parents.
  mkdir -p "${staging_dir}"
  # Optional: put Jenkins into quiet mode first.
  # curl -X POST "http://localhost:8080/quietDown"
  # Optional: wait for running builds to finish.
  # wait_for_builds
}
# ==================== Back up configuration ====================
# Copies the core configuration files, the secrets directory and the users
# directory from JENKINS_HOME into BACKUP_PATH.
# Only the copy of config.xml is allowed to fail loudly; every other copy is
# best-effort (errors silenced, status ignored).
backup_config() {
  log "Backing up configuration..."
  local optional_file optional_tree

  # Core configuration file: failure here is not suppressed.
  cp "${JENKINS_HOME}/config.xml" "${BACKUP_PATH}/"

  # Optional top-level configuration files.
  for optional_file in \
    credentials.xml \
    hudson.model.UpdateCenter.xml \
    jenkins.model.JenkinsLocationConfiguration.xml; do
    cp "${JENKINS_HOME}/${optional_file}" "${BACKUP_PATH}/" 2>/dev/null || true
  done

  # secrets/ holds the keys (important!); users/ holds the user configuration.
  for optional_tree in secrets users; do
    cp -r "${JENKINS_HOME}/${optional_tree}" "${BACKUP_PATH}/" 2>/dev/null || true
  done
}
# ==================== Back up jobs ====================
# Copies every job's config.xml and nextBuildNumber from JENKINS_HOME/jobs into
# BACKUP_PATH/jobs, preserving the path relative to the jobs directory so that
# jobs nested in folders (jobs/<folder>/jobs/<job>) do not overwrite each other.
# Top-level jobs still land in BACKUP_PATH/jobs/<job_name>/.
#
# Globals:
#   JENKINS_HOME  (read) Jenkins home directory
#   BACKUP_PATH   (read) staging directory of the current backup
#   BACKUP_BUILDS (read) when "true", also copies build.xml and log of the
#                 10 highest-numbered build directories of every job
# Returns: 0 when the jobs directory is missing or all mandatory copies worked.
backup_jobs() {
  log "Backing up jobs..."
  local jobs_root="${JENKINS_HOME}/jobs"
  local config job_dir rel_dir dest_dir build

  mkdir -p "${BACKUP_PATH}/jobs"
  if [ ! -d "${jobs_root}" ]; then
    log "No jobs directory found: ${jobs_root}"
    return 0
  fi

  # NUL-delimited so job names containing spaces or newlines are handled.
  while IFS= read -r -d '' config; do
    job_dir=${config%/config.xml}
    rel_dir=${job_dir#"${jobs_root}/"}
    dest_dir="${BACKUP_PATH}/jobs/${rel_dir}"

    # Configuration and next build number.
    mkdir -p "${dest_dir}"
    cp "${config}" "${dest_dir}/"
    cp "${job_dir}/nextBuildNumber" "${dest_dir}/" 2>/dev/null || true

    # Recent build history (optional).
    if [ "${BACKUP_BUILDS:-false}" = "true" ] && [ -d "${job_dir}/builds" ]; then
      # Only real, numerically named build directories are considered, so that
      # symlinks and bookkeeping files cannot take up one of the ten slots.
      while IFS= read -r build; do
        [ -n "${build}" ] || continue
        mkdir -p "${dest_dir}/builds/${build}"
        cp "${job_dir}/builds/${build}/build.xml" "${dest_dir}/builds/${build}/" 2>/dev/null || true
        cp "${job_dir}/builds/${build}/log" "${dest_dir}/builds/${build}/" 2>/dev/null || true
      done < <(
        find "${job_dir}/builds" -mindepth 1 -maxdepth 1 -type d -name '[0-9]*' \
          | sed 's:.*/::' \
          | sort -rn \
          | head -n 10
      )
    fi
  done < <(find "${jobs_root}" -type f -name "config.xml" -print0)
}
# ==================== Back up plugins ====================
# Always records the listing of JENKINS_HOME/plugins in plugin_list.txt.
# The plugin files themselves (*.jpi and *.jpi.pinned) are copied only when
# BACKUP_PLUGINS is "true", because they can be large; those copies are
# best-effort.
backup_plugins() {
  log "Backing up plugins..."
  local plugin_src="${JENKINS_HOME}/plugins"
  local plugin_dest="${BACKUP_PATH}/plugins"

  mkdir -p "${plugin_dest}"

  # Plugin list.
  ls "${plugin_src}" > "${plugin_dest}/plugin_list.txt"

  # Plugin files are optional.
  if [ "${BACKUP_PLUGINS:-false}" != "true" ]; then
    return 0
  fi
  cp "${plugin_src}/"*.jpi "${plugin_dest}/" 2>/dev/null || true
  cp "${plugin_src}/"*.jpi.pinned "${plugin_dest}/" 2>/dev/null || true
}
# ==================== Back up views ====================
# Copies every regular *.xml file located directly in JENKINS_HOME (not in
# subdirectories) into BACKUP_PATH.
backup_views() {
  log "Backing up views..."
  local xml_path
  # View configuration lives in the top-level XML files.
  while IFS= read -r xml_path; do
    cp "${xml_path}" "${BACKUP_PATH}/"
  done < <(find "${JENKINS_HOME}" -maxdepth 1 -name "*.xml" -type f)
}
# ==================== Create backup metadata ====================
# Writes BACKUP_PATH/backup_metadata.json describing the current backup.
#
# The checksum is the SHA-256 of the sorted "sha256sum" listing of every file
# in the staging directory, computed with paths relative to BACKUP_PATH and
# excluding the metadata file itself. It is therefore reproducible: running
# the same pipeline inside an extracted copy of the backup yields the same
# value, independent of directory traversal order or the staging path.
#
# Globals: JENKINS_HOME, BACKUP_PATH, BACKUP_TYPE (all read)
create_metadata() {
  log "Creating backup metadata..."
  local metadata_file="${BACKUP_PATH}/backup_metadata.json"
  local jenkins_version backup_size checksum

  # First <version> element only; an unreadable config.xml yields "".
  jenkins_version=$(
    sed -n 's:.*<version>\([^<]*\)</version>.*:\1:p' "${JENKINS_HOME}/config.xml" 2>/dev/null \
      | head -n 1
  ) || true
  backup_size=$(du -sh "${BACKUP_PATH}" | cut -f1)
  checksum=$(
    cd "${BACKUP_PATH}" \
      && find . -type f ! -path './backup_metadata.json' -print0 \
        | LC_ALL=C sort -z \
        | xargs -0 -r sha256sum \
        | sha256sum \
        | cut -d' ' -f1
  )

  cat > "${metadata_file}" << EOF
{
  "backup_date": "$(date -Iseconds)",
  "jenkins_version": "${jenkins_version}",
  "backup_type": "${BACKUP_TYPE:-full}",
  "hostname": "$(hostname)",
  "jenkins_home": "${JENKINS_HOME}",
  "backup_size": "${backup_size}",
  "checksum": "${checksum}"
}
EOF
}
# ==================== Compress backup ====================
# Packs the staging directory BACKUP_DIR/BACKUP_NAME into
# BACKUP_DIR/BACKUP_NAME.tar.gz and removes the staging directory.
#
# The archive is created from *inside* the staging directory, so config.xml,
# jobs/, secrets/ ... sit at the archive root. The restore functions extract
# straight into JENKINS_HOME (or look for jobs/, secrets/, plugins/ directly
# under the extraction directory) and rely on this layout.
#
# Returns: non-zero, with the staging directory left in place, if tar fails.
compress_backup() {
  log "Compressing backup..."
  local staging_dir="${BACKUP_DIR}/${BACKUP_NAME}"
  local archive="${staging_dir}.tar.gz"

  # -C avoids changing the caller's working directory.
  if ! tar -czf "${archive}" -C "${staging_dir}" .; then
    log "ERROR: Failed to create archive: ${archive}"
    return 1
  fi
  rm -rf -- "${staging_dir:?}"
  log "Backup created: ${BACKUP_DIR}/${BACKUP_NAME}.tar.gz"
}
# ==================== Encrypt backup ====================
# When ENCRYPT_BACKUP is "true", symmetrically encrypts
# BACKUP_DIR/BACKUP_NAME.tar.gz with AES256 into <archive>.gpg using the
# passphrase stored in ENCRYPTION_KEY_FILE, then removes the plaintext archive.
# Does nothing (and returns 0) when encryption is not enabled.
#
# Globals: ENCRYPT_BACKUP, ENCRYPTION_KEY_FILE, BACKUP_DIR, BACKUP_NAME (read)
# Returns: non-zero when the key file is unusable or gpg fails; the plaintext
#          archive is kept in that case.
encrypt_backup() {
  if [ "${ENCRYPT_BACKUP:-false}" != "true" ]; then
    return 0
  fi

  local plain_archive="${BACKUP_DIR}/${BACKUP_NAME}.tar.gz"
  local encrypted_archive="${plain_archive}.gpg"

  if [ -z "${ENCRYPTION_KEY_FILE:-}" ] || [ ! -r "${ENCRYPTION_KEY_FILE}" ]; then
    log "ERROR: ENCRYPTION_KEY_FILE is not set or not readable"
    return 1
  fi

  log "Encrypting backup..."
  # --batch and loopback pinentry let gpg read the passphrase file without a
  # terminal (e.g. when started from cron).
  if ! gpg --batch --yes --pinentry-mode loopback \
    --symmetric --cipher-algo AES256 \
    --passphrase-file "${ENCRYPTION_KEY_FILE}" \
    --output "${encrypted_archive}" \
    "${plain_archive}"; then
    log "ERROR: Encryption failed; keeping unencrypted archive"
    rm -f -- "${encrypted_archive}"
    return 1
  fi

  rm -- "${plain_archive}"
  log "Backup encrypted: ${encrypted_archive}"
}
# ==================== Upload to remote storage ====================
# Copies the archive of this run (BACKUP_NAME.tar.gz plus any suffix, e.g.
# .gpg) to S3_BUCKET under a YYYY/MM/DD/ prefix. Skipped without any message
# when the aws CLI is not available.
upload_to_remote() {
  log "Uploading to remote storage..."
  local remote_dir

  # Upload to S3 only when the aws command exists.
  if ! command -v aws &> /dev/null; then
    return 0
  fi
  remote_dir="${S3_BUCKET}/$(date +%Y/%m/%d)/"
  aws s3 cp "${BACKUP_DIR}/${BACKUP_NAME}.tar.gz"* "${remote_dir}"
  log "Uploaded to S3: ${remote_dir}"

  # Alternative: rsync
  # rsync -avz "${BACKUP_DIR}/${BACKUP_NAME}.tar.gz"* backup-server:/backup/jenkins/
}
# ==================== Clean up old backups ====================
# Deletes local archives named jenkins_backup_*.tar.gz* whose modification
# time is more than RETENTION_DAYS days old. When the aws CLI is available it
# also removes every object listed under S3_BUCKET whose listing date (first
# column of "aws s3 ls") sorts before the cutoff date.
cleanup_old_backups() {
  log "Cleaning up old backups..."
  local cutoff_date object_key

  # Local archives.
  find "${BACKUP_DIR}" -name "jenkins_backup_*.tar.gz*" -mtime "+${RETENTION_DAYS}" -delete

  # Remote objects.
  command -v aws &> /dev/null || return 0
  cutoff_date=$(date -d "-${RETENTION_DAYS} days" +%Y-%m-%d)
  # Dates are compared as strings; the fourth column of the listing is the key.
  aws s3 ls "${S3_BUCKET}/" --recursive \
    | awk -v cutoff="${cutoff_date}" '$1 < cutoff {print $4}' \
    | while read -r object_key; do
      aws s3 rm "${S3_BUCKET}/${object_key}"
    done
}
# ==================== Verify backup ====================
# Checks that BACKUP_DIR/BACKUP_NAME.tar.gz exists and that tar can list it.
# Returns: 0 when the archive is readable, 1 when it is missing or unreadable.
verify_backup() {
  log "Verifying backup..."
  local archive="${BACKUP_DIR}/${BACKUP_NAME}.tar.gz"

  if [ ! -f "${archive}" ]; then
    log "Backup file not found: ${archive}"
    return 1
  fi

  # Listing the archive exercises the whole gzip/tar stream.
  if ! tar -tzf "${archive}" > /dev/null 2>&1; then
    log "Backup verification: FAILED"
    return 1
  fi

  log "Backup verification: PASSED"
  return 0
}
# ==================== Main ====================
# Runs the backup pipeline.
#
# verify_backup checks BACKUP_NAME.tar.gz, which encrypt_backup deletes when
# encryption is enabled. Verification therefore runs directly after
# compression and before the archive is encrypted, uploaded, or allowed to
# trigger the deletion of older backups.
#
# Returns: non-zero as soon as verification fails; no later step is run.
main() {
  log "Starting Jenkins backup..."
  prepare_backup
  backup_config
  backup_jobs
  backup_plugins
  backup_views
  create_metadata
  compress_backup
  if ! verify_backup; then
    log "ERROR: Backup verification failed; skipping encryption, upload and cleanup"
    return 1
  fi
  encrypt_backup
  upload_to_remote
  cleanup_old_backups
  log "Backup completed successfully!"
}
# 执行备份
main "$@"
2.3 定时备份配置
# crontab configuration
# File: /etc/cron.d/jenkins-backup
# Each entry names the account to run as ("jenkins") between the schedule and
# the command, and passes options to the script as environment assignments
# placed in front of the script path. Output is appended to a log file.
# Daily at 02:00: run the backup with BACKUP_TYPE=incremental
0 2 * * * jenkins BACKUP_TYPE=incremental /opt/jenkins/scripts/backup.sh >> /var/log/jenkins/backup.log 2>&1
# Sundays at 03:00: run the full backup, including build history and plugin files
0 3 * * 0 jenkins BACKUP_TYPE=full BACKUP_BUILDS=true BACKUP_PLUGINS=true /opt/jenkins/scripts/backup.sh >> /var/log/jenkins/backup.log 2>&1
# Hourly: back up the critical configuration
# NOTE(review): backup-critical.sh is not shown in this document.
0 * * * * jenkins BACKUP_TYPE=critical /opt/jenkins/scripts/backup-critical.sh >> /var/log/jenkins/backup-critical.log 2>&1
3. 恢复流程详解
3.1 恢复流程架构
3.2 恢复脚本
#!/bin/bash
# Jenkins restore script
# File: /opt/jenkins/scripts/restore.sh
# Abort on the first command that exits non-zero and is not otherwise handled.
set -e
# ==================== Configuration ====================
# JENKINS_HOME and BACKUP_DIR may be overridden from the environment; the value
# after ":-" is the default used when the variable is unset or empty.
JENKINS_HOME="${JENKINS_HOME:-/var/lib/jenkins}"
BACKUP_DIR="${BACKUP_DIR:-/backup/jenkins}"
# File that log() appends every message to.
RESTORE_LOG="/var/log/jenkins/restore.log"
# Logging helper: prints the message in $1 prefixed with a
# "[YYYY-MM-DD HH:MM:SS]" timestamp to stdout and appends the same line to
# RESTORE_LOG.
log() {
  local entry
  entry="[$(date '+%Y-%m-%d %H:%M:%S')] $1"
  printf '%s\n' "${entry}" | tee -a "${RESTORE_LOG}"
}
# ==================== Check backup file ====================
# Validates the backup file before anything destructive happens.
#
# Plain archives are checked by letting tar list them. Encrypted archives
# (*.gpg) cannot be listed by tar, so for those only existence and a non-zero
# size are checked here; their content can only be validated after decryption.
#
# Arguments: $1 - path of the backup file
# Exits the script with status 1 when the file is missing, empty or corrupted.
check_backup() {
  local backup_file=$1

  if [ ! -f "${backup_file}" ]; then
    log "ERROR: Backup file not found: ${backup_file}"
    exit 1
  fi

  log "Verifying backup integrity..."
  if [[ "${backup_file}" == *.gpg ]]; then
    if [ ! -s "${backup_file}" ]; then
      log "ERROR: Backup file is corrupted"
      exit 1
    fi
    log "Encrypted backup detected; archive contents can only be verified after decryption"
    return 0
  fi

  if ! tar -tzf "${backup_file}" > /dev/null 2>&1; then
    log "ERROR: Backup file is corrupted"
    exit 1
  fi
  log "Backup verification passed"
}
# ==================== Stop Jenkins ====================
# Stops the jenkins systemd unit if it is currently active; otherwise only
# logs that Jenkins is not running.
stop_jenkins() {
  log "Stopping Jenkins..."
  if ! systemctl is-active --quiet jenkins; then
    log "Jenkins is not running"
    return 0
  fi
  systemctl stop jenkins
  log "Jenkins stopped"
}
# ==================== Back up current state ====================
# Archives the entire current JENKINS_HOME into
# BACKUP_DIR/pre_restore_<timestamp>.tar.gz so the restore can be undone.
# Files are stored relative to JENKINS_HOME (archive root is ".").
backup_current_state() {
  log "Backing up current state..."
  local stamp snapshot
  stamp=$(date +%Y%m%d_%H%M%S)
  snapshot="${BACKUP_DIR}/pre_restore_${stamp}.tar.gz"
  tar -czf "${snapshot}" -C "${JENKINS_HOME}" .
  log "Current state backed up to: ${snapshot}"
}
# ==================== Restore backup ====================
# Replaces the contents of JENKINS_HOME with the contents of a backup archive.
#
# Arguments: $1 - backup archive (.tar.gz, or .tar.gz.gpg when encrypted)
# Globals:   JENKINS_HOME (read), ENCRYPTION_KEY_FILE (read, *.gpg only)
# Returns:   non-zero if decryption or validation fails; JENKINS_HOME is only
#            cleared after the archive has been validated.
#
# Behaviour worth knowing:
#   * Encrypted archives are decrypted to a temporary file next to the backup;
#     the plaintext copy is removed again when the function finishes.
#   * Archives whose entries sit under a single "jenkins_backup_*/" top-level
#     directory are unwrapped, so files always land directly in JENKINS_HOME.
#     Archives whose entries start at "./" are extracted unchanged.
#   * Hidden files directly in JENKINS_HOME are not removed by the clearing
#     step (the glob does not match dotfiles).
#   * Permissions: directories 755, files keep their execute bits but lose
#     group/other write; secrets/ and credentials.xml are owner-only.
restore_backup() {
  local backup_file=$1
  local decrypted_copy=""
  local first_entry
  local -a strip_opts=()

  log "Restoring from backup: ${backup_file}"

  # Decrypt if needed.
  if [[ "${backup_file}" == *.gpg ]]; then
    log "Decrypting backup..."
    decrypted_copy=$(mktemp "${backup_file%.gpg}.XXXXXX") || return 1
    # --batch and loopback pinentry let gpg use the passphrase file without a tty.
    if ! gpg --batch --pinentry-mode loopback --decrypt \
      --passphrase-file "${ENCRYPTION_KEY_FILE}" \
      "${backup_file}" > "${decrypted_copy}"; then
      log "ERROR: Failed to decrypt backup: ${backup_file}"
      rm -f -- "${decrypted_copy}"
      return 1
    fi
    backup_file="${decrypted_copy}"
    # The encrypted file could not be listed earlier, so validate it now,
    # before anything is deleted.
    if ! tar -tzf "${backup_file}" > /dev/null 2>&1; then
      log "ERROR: Backup file is corrupted"
      rm -f -- "${decrypted_copy}"
      return 1
    fi
  fi

  # Detect a wrapper directory at the archive root.
  first_entry=$(tar -tzf "${backup_file}" 2>/dev/null | head -n 1)
  if [[ "${first_entry}" == jenkins_backup_*/* ]]; then
    strip_opts=(--strip-components=1)
  fi

  # Clear JENKINS_HOME (":?" aborts if the variable is empty).
  log "Clearing JENKINS_HOME..."
  rm -rf "${JENKINS_HOME:?}"/*

  # Extract.
  log "Extracting backup..."
  if ! tar -xzf "${backup_file}" -C "${JENKINS_HOME}" "${strip_opts[@]}"; then
    log "ERROR: Failed to extract backup"
    [ -z "${decrypted_copy}" ] || rm -f -- "${decrypted_copy}"
    return 1
  fi
  [ -z "${decrypted_copy}" ] || rm -f -- "${decrypted_copy}"

  # Ownership and permissions.
  log "Setting permissions..."
  chown -R jenkins:jenkins "${JENKINS_HOME}"
  chmod -R u=rwX,go=rX "${JENKINS_HOME}"
  if [ -d "${JENKINS_HOME}/secrets" ]; then
    chmod -R go-rwx "${JENKINS_HOME}/secrets"
  fi
  if [ -f "${JENKINS_HOME}/credentials.xml" ]; then
    chmod 600 "${JENKINS_HOME}/credentials.xml"
  fi

  log "Restore completed"
}
# ==================== Verify restore ====================
# Confirms that the files Jenkins cannot start without are present in
# JENKINS_HOME after extraction.
# Returns: 1 at the first missing file, otherwise the status of the final log.
verify_restore() {
  log "Verifying restore..."
  local rel_path candidate

  # Key files, relative to JENKINS_HOME.
  for rel_path in \
    config.xml \
    secrets/master.key \
    secrets/hudson.util.Secret; do
    candidate="${JENKINS_HOME}/${rel_path}"
    if [ ! -f "${candidate}" ]; then
      log "ERROR: Required file missing: ${candidate}"
      return 1
    fi
  done

  log "Restore verification passed"
}
# ==================== Start Jenkins ====================
# Starts the jenkins systemd unit, then polls the login page every 5 seconds
# until its HTTP status output contains "200" or 120 seconds have elapsed.
# Returns: 0 once the page answers, 1 on timeout.
start_jenkins() {
  log "Starting Jenkins..."
  systemctl start jenkins

  # Wait for Jenkins to come up.
  local limit_s=120
  local elapsed_s=0
  until [ "${elapsed_s}" -ge "${limit_s}" ]; do
    if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/login" | grep -q "200"; then
      log "Jenkins is up and running"
      return 0
    fi
    sleep 5
    elapsed_s=$((elapsed_s + 5))
    log "Waiting for Jenkins to start... (${elapsed_s}s)"
  done

  log "ERROR: Jenkins failed to start within ${limit_s}s"
  return 1
}
# ==================== Verify functionality ====================
# Post-start smoke test against the local Jenkins HTTP API: logs whether
# /api/json answers with status 200, then logs the number of jobs and plugins
# reported by the API. Failures of curl or jq do not abort the function.
verify_functionality() {
  log "Verifying Jenkins functionality..."
  local base_url="http://localhost:8080"
  local http_status jobs_total plugins_total

  # API response check.
  http_status=$(curl -s -o /dev/null -w "%{http_code}" "${base_url}/api/json") || true
  case "${http_status}" in
    200) log "API check: PASSED" ;;
    *) log "WARNING: API check failed with status ${http_status}" ;;
  esac

  # Job count.
  jobs_total=$(curl -s "${base_url}/api/json?tree=jobs[name]" | jq '.jobs | length') || true
  log "Jobs restored: ${jobs_total}"

  # Plugin count.
  plugins_total=$(curl -s "${base_url}/pluginManager/api/json?depth=1" | jq '.plugins | length') || true
  log "Plugins loaded: ${plugins_total}"
}
# ==================== Main ====================
# Entry point of the restore script.
# Arguments: $1 - backup file to restore; when empty, the ten most recent
#            archives in BACKUP_DIR are listed and the path is read from stdin.
main() {
  local backup_file=$1

  if [ -z "${backup_file}" ]; then
    # List the available backups and ask which one to use.
    log "Available backups:"
    ls -lt "${BACKUP_DIR}"/jenkins_backup_*.tar.gz* 2>/dev/null | head -10
    echo ""
    read -p "Enter backup file to restore: " backup_file
  fi

  log "Starting Jenkins restore from: ${backup_file}"

  # Validate, stop, snapshot, restore, then bring Jenkins back and check it.
  check_backup "${backup_file}"
  stop_jenkins
  backup_current_state
  restore_backup "${backup_file}"
  verify_restore
  start_jenkins
  verify_functionality

  log "Restore completed successfully!"
}
# 执行恢复
main "$@"
3.3 部分恢复场景
#!/bin/bash
# Jenkins部分恢复脚本
# ==================== Restore a single job ====================
# Restores the config.xml of one job from a backup archive.
#
# Arguments:
#   $1 - job name (directory name below JENKINS_HOME/jobs)
#   $2 - backup archive (.tar.gz) with jobs/<name>/config.xml at its root
# Returns: 0 on success, 1 if extraction fails or the job is not in the backup.
#
# The job directory is left in place and only config.xml is overwritten, so
# build history and nextBuildNumber survive. The previous config.xml is saved
# to JENKINS_HOME/job_config_backups/, i.e. outside jobs/: a sibling
# directory such as jobs/<name>.bak would itself contain a config.xml and be
# picked up as an additional job on reload.
restore_single_job() {
  local job_name=$1
  local backup_file=$2
  local temp_dir source_dir job_dir saved_dir saved_name

  log "Restoring job: ${job_name}"

  # Extract into a scratch directory that is removed on every exit path.
  temp_dir=$(mktemp -d) || return 1
  if ! tar -xzf "${backup_file}" -C "${temp_dir}"; then
    log "ERROR: Failed to extract backup: ${backup_file}"
    rm -rf -- "${temp_dir}"
    return 1
  fi

  source_dir="${temp_dir}/jobs/${job_name}"
  job_dir="${JENKINS_HOME}/jobs/${job_name}"
  if [ ! -f "${source_dir}/config.xml" ]; then
    log "ERROR: Job ${job_name} not found in backup"
    rm -rf -- "${temp_dir}"
    return 1
  fi

  # Keep a copy of the current configuration.
  if [ -f "${job_dir}/config.xml" ]; then
    saved_dir="${JENKINS_HOME}/job_config_backups"
    saved_name="${job_name//\//_}.config.xml.$(date +%Y%m%d_%H%M%S)"
    mkdir -p "${saved_dir}"
    cp "${job_dir}/config.xml" "${saved_dir}/${saved_name}"
    log "Previous configuration saved to: ${saved_dir}/${saved_name}"
  fi

  # Restore the configuration.
  mkdir -p "${job_dir}"
  cp "${source_dir}/config.xml" "${job_dir}/"
  rm -rf -- "${temp_dir}"

  # Ask Jenkins to reload its configuration from disk.
  # NOTE(review): a secured Jenkins presumably rejects this unauthenticated
  # request; confirm and add credentials if required.
  if ! curl -fsS -X POST "http://localhost:8080/reload"; then
    log "WARNING: Reload request failed; reload the configuration manually"
  fi
  log "Job ${job_name} restored successfully"
}
# ==================== Restore credentials ====================
# Stops Jenkins, restores credentials.xml and the secrets directory from the
# archive in $1 (each only if present in the archive), fixes ownership and
# permissions, and starts Jenkins again.
restore_credentials() {
  local backup_file=$1
  local scratch_dir extracted_creds extracted_secrets

  log "Restoring credentials..."

  # Jenkins must not be running while its key material is replaced.
  systemctl stop jenkins

  # Unpack into a scratch directory.
  scratch_dir=$(mktemp -d)
  tar -xzf "${backup_file}" -C "${scratch_dir}"
  extracted_creds="${scratch_dir}/credentials.xml"
  extracted_secrets="${scratch_dir}/secrets"

  # Credentials file.
  if [ -f "${extracted_creds}" ]; then
    cp "${extracted_creds}" "${JENKINS_HOME}/"
    chown jenkins:jenkins "${JENKINS_HOME}/credentials.xml"
    chmod 600 "${JENKINS_HOME}/credentials.xml"
  fi

  # Encryption keys.
  if [ -d "${extracted_secrets}" ]; then
    cp -r "${extracted_secrets}" "${JENKINS_HOME}/"
    chown -R jenkins:jenkins "${JENKINS_HOME}/secrets"
    chmod -R 700 "${JENKINS_HOME}/secrets"
  fi

  # Clean up and start again.
  rm -rf "${scratch_dir}"
  systemctl start jenkins
  log "Credentials restored"
}
# ==================== Restore plugins ====================
# Copies the *.jpi and *.jpi.pinned files found under plugins/ in the archive
# given as $1 into JENKINS_HOME/plugins (best-effort), fixes ownership, and
# restarts Jenkins.
restore_plugins() {
  local backup_file=$1
  local scratch_dir plugin_src

  log "Restoring plugins..."

  # Unpack into a scratch directory.
  scratch_dir=$(mktemp -d)
  tar -xzf "${backup_file}" -C "${scratch_dir}"
  plugin_src="${scratch_dir}/plugins"

  # Copy plugin files when the archive contains any.
  if [ -d "${plugin_src}" ]; then
    cp -r "${plugin_src}/"*.jpi "${JENKINS_HOME}/plugins/" 2>/dev/null || true
    cp -r "${plugin_src}/"*.jpi.pinned "${JENKINS_HOME}/plugins/" 2>/dev/null || true
    chown -R jenkins:jenkins "${JENKINS_HOME}/plugins"
  fi

  # Clean up.
  rm -rf "${scratch_dir}"

  # The restored plugin files are loaded on restart.
  systemctl restart jenkins
  log "Plugins restored"
}
4. 高可用架构
4.1 高可用架构设计
4.2 主备切换配置
# Keepalived configuration
# File: /etc/keepalived/keepalived.conf
# Health check definition: runs check_jenkins.sh; its result is tracked by the
# VRRP instance below through track_script.
vrrp_script check_jenkins {
script "/usr/local/bin/check_jenkins.sh"
interval 5
fall 3
rise 2
}
# VRRP instance that owns the virtual IP 192.168.1.100/24.
vrrp_instance VI_1 {
state MASTER # set to BACKUP on the standby node
interface eth0
virtual_router_id 51
priority 100 # set to 90 on the standby node
advert_int 1
authentication {
auth_type PASS
# NOTE(review): keepalived reportedly uses only the first 8 characters of
# auth_pass - confirm against the keepalived.conf manual.
auth_pass jenkins-ha
}
virtual_ipaddress {
192.168.1.100/24
}
track_script {
check_jenkins
}
# Scripts run when this node changes to the MASTER / BACKUP state.
notify_master "/usr/local/bin/jenkins_master.sh"
notify_backup "/usr/local/bin/jenkins_backup.sh"
}
---
# 健康检查脚本
# 文件: /usr/local/bin/check_jenkins.sh
#!/bin/bash
# Jenkins health check.
# Exit status 0 means healthy; 1 means the process is missing or the login
# page did not answer with HTTP 200.
JENKINS_URL="http://localhost:8080"

# Process check: a process whose command line contains jenkins.war must exist.
pgrep -f "jenkins.war" > /dev/null || exit 1

# HTTP check: the login page must return status 200.
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${JENKINS_URL}/login")
[ "${HTTP_CODE}" = "200" ] || exit 1

exit 0
---
# 切换为Master脚本
# 文件: /usr/local/bin/jenkins_master.sh
#!/bin/bash
# Promote this node to MASTER.
#
# Order matters: the shared storage must be mounted on /var/lib/jenkins
# BEFORE Jenkins is started, otherwise Jenkins comes up on the empty local
# directory and the NFS mount is then layered on top of a running instance.
logger "Jenkins HA: Switching to MASTER"

JENKINS_DIR="/var/lib/jenkins"

# Mount the shared storage unless it is already mounted (keeps the script
# safe to run repeatedly).
if ! mountpoint -q "${JENKINS_DIR}"; then
  # A Jenkins instance running on the unmounted directory must be stopped
  # before the mount hides the files it has open.
  if systemctl is-active --quiet jenkins; then
    systemctl stop jenkins
  fi
  if ! mount -t nfs nfs-server:/jenkins "${JENKINS_DIR}"; then
    logger "Jenkins HA: failed to mount shared storage; Jenkins not started"
    exit 1
  fi
fi

# Start Jenkins (if it is not running).
if ! systemctl is-active --quiet jenkins; then
  systemctl start jenkins
fi

# Send a notification; skipped when no webhook is configured, and never
# allowed to block the failover for long.
if [ -n "${SLACK_WEBHOOK_URL:-}" ]; then
  curl --max-time 10 -X POST -H 'Content-Type: application/json' \
    -d '{"text":"Jenkins HA: Switched to MASTER on '"$(hostname)"'"}' \
    "${SLACK_WEBHOOK_URL}"
fi
---
# 切换为Backup脚本
# 文件: /usr/local/bin/jenkins_backup.sh
#!/bin/bash
# Demote this node to BACKUP: stop Jenkins, release the shared storage and
# announce the state change.
logger "Jenkins HA: Switching to BACKUP"

# Stop Jenkins.
systemctl stop jenkins

# Unmount the shared storage.
umount /var/lib/jenkins

# Send a notification.
payload='{"text":"Jenkins HA: Switched to BACKUP on '"$(hostname)"'"}'
curl -X POST \
  -H 'Content-Type: application/json' \
  -d "${payload}" \
  "${SLACK_WEBHOOK_URL}"
4.3 Kubernetes高可用部署
# Jenkins deployment on Kubernetes
# File: jenkins-ha.yaml
#
# Persistent storage for JENKINS_HOME.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: jenkins-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi
  storageClassName: standard
---
# Single-replica controller; the pod mounts the claim above as JENKINS_HOME.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: jenkins
  labels:
    app: jenkins
spec:
  serviceName: jenkins
  replicas: 1
  selector:
    matchLabels:
      app: jenkins
  template:
    metadata:
      labels:
        app: jenkins
    spec:
      serviceAccountName: jenkins
      containers:
        - name: jenkins
          image: jenkins/jenkins:lts
          ports:
            - containerPort: 8080
              name: web
            - containerPort: 50000
              name: agent
          env:
            - name: JENKINS_HOME
              value: /var/jenkins_home
            - name: JAVA_OPTS
              value: >-
                -Djenkins.install.runSetupWizard=false
                -Dhudson.slaves.WorkspaceDir=/var/jenkins_home/workspace
                -Xms4g
                -Xmx4g
          volumeMounts:
            - name: jenkins-home
              mountPath: /var/jenkins_home
            - name: jenkins-config
              mountPath: /var/jenkins_config
          livenessProbe:
            httpGet:
              path: /login
              port: 8080
            initialDelaySeconds: 120
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /login
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 5
          resources:
            requests:
              cpu: "2"
              memory: "4Gi"
            limits:
              cpu: "4"
              memory: "8Gi"
      volumes:
        - name: jenkins-home
          persistentVolumeClaim:
            claimName: jenkins-pvc
        - name: jenkins-config
          configMap:
            name: jenkins-config
---
# Exposes the web UI on port 80 and the agent port on 50000.
apiVersion: v1
kind: Service
metadata:
  name: jenkins
spec:
  selector:
    app: jenkins
  ports:
    - port: 80
      targetPort: 8080
      name: web
    - port: 50000
      targetPort: 50000
      name: agent
  type: LoadBalancer
---
# NOTE(review): with replicas: 1 above, minAvailable: 1 leaves no pod that may
# be evicted voluntarily, which presumably blocks node drains until the budget
# is relaxed - confirm this is intended.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: jenkins-pdb
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app: jenkins
5. 常见故障诊断
5.1 故障分类
5.2 故障诊断流程
/**
 * Jenkins fault diagnosis script.
 * Run it in the Script Console; the report is printed to the console output.
 *
 * Checks, in order: instance availability, heap and GC statistics, threads and
 * deadlocks, disk space of JENKINS_HOME, build queue, agents, plugins and
 * jobs, then prints recommendations derived from those checks.
 */
import jenkins.model.*
import hudson.model.*
import java.lang.management.*
import java.math.RoundingMode

def diagnose() {
    // Rounds to two decimals without relying on BigDecimal.round(int), which
    // is not available in the Groovy 2.4 runtime.
    def round2 = { Number value ->
        new BigDecimal(value.toString()).setScale(2, RoundingMode.HALF_UP)
    }

    println "=" * 60
    println "Jenkins Fault Diagnosis Report"
    println "=" * 60
    println ""

    // ==================== 1. Startup check ====================
    println "=== Startup Check ==="
    def jenkins = Jenkins.getInstanceOrNull()
    if (jenkins == null) {
        println "CRITICAL: Jenkins instance is null!"
        return
    }
    println "Jenkins Version: ${Jenkins.VERSION}"
    println "JENKINS_HOME: ${jenkins.rootDir.absolutePath}"
    println "URL: ${jenkins.rootUrl}"
    println ""

    // ==================== 2. Memory check ====================
    println "=== Memory Check ==="
    def memoryMXBean = ManagementFactory.getMemoryMXBean()
    def heapUsage = memoryMXBean.heapMemoryUsage
    def heapUsedMB = round2(heapUsage.used / 1024 / 1024)
    // MemoryUsage.max is -1 when the maximum is undefined; fall back to the
    // committed size so the percentage stays meaningful.
    def heapLimit = heapUsage.max > 0 ? heapUsage.max : heapUsage.committed
    def heapMaxMB = round2(heapLimit / 1024 / 1024)
    def heapPercent = heapLimit > 0 ? round2(heapUsage.used * 100 / heapLimit) : 0
    println "Heap Used: ${heapUsedMB} MB / ${heapMaxMB} MB (${heapPercent}%)"
    if (heapPercent > 85) {
        println "WARNING: Heap usage is high!"
    }

    // GC statistics
    println ""
    println "GC Statistics:"
    ManagementFactory.garbageCollectorMXBeans.each { gc ->
        println " ${gc.name}: Count=${gc.collectionCount}, Time=${gc.collectionTime}ms"
    }
    println ""

    // ==================== 3. Thread check ====================
    println "=== Thread Check ==="
    def threadMXBean = ManagementFactory.threadMXBean
    println "Total Threads: ${threadMXBean.threadCount}"
    println "Peak Threads: ${threadMXBean.peakThreadCount}"
    println "Daemon Threads: ${threadMXBean.daemonThreadCount}"

    // Deadlock detection
    def deadlockedThreads = threadMXBean.findDeadlockedThreads()
    if (deadlockedThreads != null && deadlockedThreads.length > 0) {
        println "CRITICAL: Deadlocked threads detected: ${deadlockedThreads.length}"
        deadlockedThreads.each { threadId ->
            def threadInfo = threadMXBean.getThreadInfo(threadId)
            // getThreadInfo returns null when the thread has already ended.
            if (threadInfo != null) {
                println " Thread ${threadId}: ${threadInfo.threadName}"
                println " State: ${threadInfo.threadState}"
                println " Lock: ${threadInfo.lockName}"
            }
        }
    }
    println ""

    // ==================== 4. Disk check ====================
    println "=== Disk Check ==="
    def jenkinsHome = jenkins.rootDir
    def totalSpace = jenkinsHome.totalSpace / 1024 / 1024 / 1024
    def freeSpace = jenkinsHome.freeSpace / 1024 / 1024 / 1024
    def usedSpace = totalSpace - freeSpace
    def diskPercent = totalSpace > 0 ? round2(usedSpace * 100 / totalSpace) : 0
    println "Total: ${round2(totalSpace)} GB"
    println "Used: ${round2(usedSpace)} GB"
    println "Free: ${round2(freeSpace)} GB"
    println "Usage: ${diskPercent}%"
    if (freeSpace < 5) {
        println "CRITICAL: Disk space is critically low!"
    } else if (freeSpace < 20) {
        println "WARNING: Disk space is running low!"
    }
    println ""

    // ==================== 5. Queue check ====================
    println "=== Queue Check ==="
    def queue = jenkins.queue
    def queueItems = queue.items
    println "Queue Size: ${queueItems.size()}"
    if (queueItems.size() > 20) {
        println "WARNING: Queue is backing up!"
        queueItems.take(10).each { item ->
            println " - ${item.task.name} (waiting: ${(System.currentTimeMillis() - item.inQueueSince) / 1000}s)"
        }
    }
    println ""

    // ==================== 6. Agent check ====================
    println "=== Agent Check ==="
    def computers = jenkins.computers
    def onlineCount = 0
    def offlineCount = 0
    def offlineAgents = []
    computers.each { computer ->
        if (computer.offline) {
            offlineCount++
            offlineAgents << [
                name: computer.name,
                cause: computer.offlineCause?.toString() ?: "Unknown"
            ]
        } else {
            onlineCount++
        }
    }
    println "Online: ${onlineCount}, Offline: ${offlineCount}"
    if (offlineCount > 0) {
        println "Offline Agents:"
        offlineAgents.each { agent ->
            println " - ${agent.name}: ${agent.cause}"
        }
    }
    println ""

    // ==================== 7. Plugin check ====================
    println "=== Plugin Check ==="
    def pluginManager = jenkins.pluginManager
    def plugins = pluginManager.plugins
    // Plugins that are not active are reported as failed.
    def failedPlugins = plugins.findAll { !it.active }
    def outdatedPlugins = plugins.findAll { it.hasUpdate() }
    println "Total Plugins: ${plugins.size()}"
    println "Failed Plugins: ${failedPlugins.size()}"
    println "Outdated Plugins: ${outdatedPlugins.size()}"
    if (failedPlugins.size() > 0) {
        println "Failed Plugins:"
        failedPlugins.each { plugin ->
            println " - ${plugin.shortName}: ${plugin.version}"
        }
    }
    println ""

    // ==================== 8. Job check ====================
    println "=== Job Check ==="
    // getAllItems descends into folders; jenkins.items would only see
    // top-level items.
    def jobs = jenkins.getAllItems(Job)
    def buildingJobs = jobs.findAll { it.lastBuild?.building }
    def failedJobs = jobs.findAll { it.lastBuild?.result == Result.FAILURE }
    println "Total Jobs: ${jobs.size()}"
    println "Currently Building: ${buildingJobs.size()}"
    println "Last Build Failed: ${failedJobs.size()}"
    println ""

    // ==================== 9. Recommendations ====================
    println "=== Recommendations ==="
    def recommendations = []
    if (heapPercent > 85) {
        recommendations << "Increase heap memory or investigate memory leaks"
    }
    if (deadlockedThreads != null && deadlockedThreads.length > 0) {
        recommendations << "Investigate and resolve thread deadlocks"
    }
    if (freeSpace < 20) {
        recommendations << "Clean up old builds and workspaces to free disk space"
    }
    if (queueItems.size() > 20) {
        recommendations << "Add more agents or optimize build duration"
    }
    if (offlineCount > onlineCount) {
        recommendations << "Check agent connectivity and resolve offline issues"
    }
    if (failedPlugins.size() > 0) {
        recommendations << "Fix or remove failed plugins"
    }
    if (recommendations.isEmpty()) {
        println "No critical issues detected."
    } else {
        recommendations.each { println " - ${it}" }
    }
    println ""
    println "=" * 60
    println "Diagnosis Complete"
    println "=" * 60
}

diagnose()
6. 故障排查工具
6.1 日志分析工具
#!/bin/bash
# Jenkins日志分析脚本
# ==================== Analyze the Jenkins log ====================
# Prints a summary of a Jenkins log file: number of lines containing ERROR and
# WARNING, the last 10 ERROR lines, the number of OutOfMemoryError lines and
# the last 5 lines mentioning deadlock/blocked/waiting.
#
# Arguments: $1 - log file (default: /var/log/jenkins/jenkins.log)
# Returns:   1 if the log file cannot be read, 0 otherwise.
#
# "grep -c" already prints 0 when nothing matches (while exiting non-zero), so
# each count is captured once and printed once.
analyze_jenkins_log() {
  local log_file="${1:-/var/log/jenkins/jenkins.log}"
  local error_count warning_count oom_count

  echo "=== Jenkins Log Analysis ==="
  echo ""
  if [ ! -r "${log_file}" ]; then
    echo "Log file not readable: ${log_file}"
    return 1
  fi

  # Error count
  error_count=$(grep -c "ERROR" "${log_file}" 2>/dev/null) || true
  echo "Error Summary:"
  echo "${error_count:-0}"

  # Warning count
  warning_count=$(grep -c "WARNING" "${log_file}" 2>/dev/null) || true
  echo ""
  echo "Warning Summary:"
  echo "${warning_count:-0}"

  # Most recent errors
  echo ""
  echo "Recent Errors (last 10):"
  grep "ERROR" "${log_file}" | tail -10

  # OutOfMemoryError check
  oom_count=$(grep -c "OutOfMemoryError" "${log_file}" 2>/dev/null) || true
  echo ""
  echo "OutOfMemoryError Check:"
  echo "${oom_count:-0} occurrences"

  # Thread problems
  echo ""
  echo "Thread Issues:"
  grep -E "(deadlock|blocked|waiting)" "${log_file}" | tail -5
  return 0
}
# ==================== Analyze build logs ====================
# For the five most recently modified entries of a job's builds directory,
# prints the build result found in the log ("Finished: ..." line), the last
# three error-looking lines for failed builds, and the last "Total time:"
# line when present.
#
# Arguments: $1 - job name (directory below JENKINS_HOME/jobs)
# Globals:   JENKINS_HOME (read; defaults to /var/lib/jenkins)
analyze_build_logs() {
  local job_name=$1
  local jenkins_home="${JENKINS_HOME:-/var/lib/jenkins}"
  local job_dir="${jenkins_home}/jobs/${job_name}"
  local build build_log elapsed

  echo "=== Build Log Analysis for ${job_name} ==="
  echo ""

  if [ ! -d "${job_dir}" ]; then
    echo "Job not found: ${job_name}"
    return
  fi

  # Most recent builds first.
  for build in $(ls -t "${job_dir}/builds" | head -5); do
    build_log="${job_dir}/builds/${build}/log"
    [ -f "${build_log}" ] || continue

    echo "Build #${build}:"

    # Build result
    if grep -q "Finished: SUCCESS" "${build_log}"; then
      echo " Result: SUCCESS"
    elif grep -q "Finished: FAILURE" "${build_log}"; then
      echo " Result: FAILURE"
      echo " Error lines:"
      grep -E "(ERROR|FAILED|Exception)" "${build_log}" | tail -3
    elif grep -q "Finished: ABORTED" "${build_log}"; then
      echo " Result: ABORTED"
    else
      echo " Result: RUNNING or UNKNOWN"
    fi

    # Build duration
    elapsed=$(grep "Total time:" "${build_log}" | tail -1)
    if [ -n "${elapsed}" ]; then
      echo " ${elapsed}"
    fi
    echo ""
  done
}
# ==================== Analyze the GC log ====================
# Summarises a GC log: number of lines containing "GC pause" and "Full GC",
# plus the average and maximum of the "<n>.<n> ms" values found on the
# "GC pause" lines.
#
# Arguments: $1 - GC log file (default: /var/log/jenkins/gc.log)
#
# "grep -c" already prints 0 when nothing matches, so each count is captured
# once; the statistics are computed in a single awk pass that reports the
# absence of samples instead of dividing by zero.
analyze_gc_log() {
  local gc_log="${1:-/var/log/jenkins/gc.log}"
  local gc_count full_gc_count

  if [ ! -f "${gc_log}" ]; then
    echo "GC log not found: ${gc_log}"
    return
  fi

  echo "=== GC Log Analysis ==="
  echo ""

  # Number of GC pauses
  gc_count=$(grep -c "GC pause" "${gc_log}" 2>/dev/null) || true
  echo "Total GC Count: ${gc_count:-0}"

  # Number of full GCs
  full_gc_count=$(grep -c "Full GC" "${gc_log}" 2>/dev/null) || true
  echo "Full GC Count: ${full_gc_count:-0}"

  # Average and maximum pause time
  echo ""
  echo "Pause Time Analysis:"
  grep "GC pause" "${gc_log}" \
    | grep -oE '[0-9]+\.[0-9]+ ms' \
    | awk '
        { sum += $1; samples++; if (samples == 1 || $1 + 0 > max + 0) max = $1 }
        END {
          if (samples == 0) { print "No GC pause samples found"; exit }
          print "Average: " sum / samples " ms"
          print "Max: " max
        }'
}
6.2 线程Dump分析
#!/bin/bash
# 获取Jenkins线程Dump
# ==================== Get a thread dump ====================
# Writes a jstack thread dump of the Jenkins process to
# /tmp/jenkins_thread_dump_<timestamp>.txt and prints the path of that file.
#
# When several processes match "jenkins.war" only the first PID reported by
# pgrep is used, because jstack accepts a single PID.
# Returns: 1 when no Jenkins process is found or jstack fails.
get_thread_dump() {
  local pid dump_file

  pid=$(pgrep -f "jenkins.war" | head -n 1)
  if [ -z "${pid}" ]; then
    echo "Jenkins process not found"
    return 1
  fi

  echo "Jenkins PID: ${pid}"
  echo "Getting thread dump..."

  # Take the dump with jstack; an incomplete file is not kept.
  dump_file="/tmp/jenkins_thread_dump_$(date +%Y%m%d_%H%M%S).txt"
  if ! jstack "${pid}" > "${dump_file}"; then
    echo "Failed to get thread dump for PID ${pid}" >&2
    rm -f -- "${dump_file}"
    return 1
  fi
  echo "Thread dump saved to ${dump_file}"
}
# ==================== 分析线程Dump ====================
analyze_thread_dump() {
local dump_file=$1
echo "=== Thread Dump Analysis ==="
echo ""
# 线程状态统计
echo "Thread State Distribution:"
grep "java.lang.Thread.State" "${dump_file}" | \
sort | uniq -c | sort -rn
# 阻塞线程
echo ""
echo "Blocked Threads:"
grep -B 5 "BLOCKED" "${dump_file}" | grep "^\"" | head -10
# 等待线程
echo ""
echo "Waiting Threads:"
grep -B 5 "WAITING" "${dump_file}" | grep "^\"" | head -10
# CPU密集线程
echo ""
echo "Runnable Threads:"
grep -B 5 "RUNNABLE" "${dump_file}" | grep "^\"" | head -10
}
# ==================== Live monitoring ====================
# Refreshes a small dashboard (thread count, CPU and memory usage) for the
# Jenkins process every 5 seconds for as long as that process exists.
#
# Returns: 1 if no Jenkins process is found; 0 once the monitored process has
#          exited. Interrupt with Ctrl-C to stop earlier.
monitor_threads() {
  local pid

  # Only the first matching PID is monitored.
  pid=$(pgrep -f "jenkins.war" | head -n 1)
  if [ -z "${pid}" ]; then
    echo "Jenkins process not found"
    return 1
  fi

  # ps -p fails once the process is gone, which ends the loop.
  while ps -p "${pid}" > /dev/null 2>&1; do
    clear
    echo "=== Jenkins Thread Monitor ==="
    echo "Time: $(date)"
    echo ""
    # Thread count
    echo "Thread Count: $(ps -o nlwp -p "${pid}" | tail -1)"
    # CPU usage
    echo "CPU Usage: $(ps -p "${pid}" -o %cpu | tail -1)%"
    # Memory usage
    echo "Memory Usage: $(ps -p "${pid}" -o %mem | tail -1)%"
    sleep 5
  done

  echo "Jenkins process ${pid} is no longer running"
  return 0
}
7. 灾难恢复计划
7.1 灾难恢复流程
7.2 灾难恢复计划文档
# Disaster recovery plan
# File: disaster_recovery_plan.yaml
disaster_recovery:
  # 1. Response team
  response_team:
    primary:
      - role: "Incident Commander"
        name: "Team Lead"
        contact: "+1-xxx-xxx-xxxx"
      - role: "Technical Lead"
        name: "DevOps Engineer"
        contact: "+1-xxx-xxx-xxxx"
      - role: "Communication Lead"
        name: "Project Manager"
        contact: "+1-xxx-xxx-xxxx"
    escalation:
      - level: 1
        response_time: "15 minutes"
        contacts: ["Primary Team"]
      - level: 2
        response_time: "30 minutes"
        contacts: ["Management", "External Support"]
  # 2. Recovery objectives
  recovery_objectives:
    RTO: "4 hours"   # recovery time objective
    RPO: "1 hour"    # recovery point objective
    priority_order:
      - "Critical: Production deployment jobs"
      - "High: CI/CD pipelines"
      - "Medium: Development jobs"
      - "Low: Historical build data"
  # 3. Recovery procedures
  recovery_procedures:
    partial_failure:
      description: "Single component failure"
      steps:
        - "Identify failed component"
        - "Check monitoring alerts"
        - "Review recent changes"
        - "Apply fix or rollback"
        - "Verify functionality"
      estimated_time: "30 minutes"
    complete_failure:
      description: "Jenkins master failure"
      steps:
        - "Notify response team"
        - "Provision new server"
        - "Restore from latest backup"
        - "Verify configuration"
        - "Start Jenkins service"
        - "Verify all jobs"
        - "Notify users"
      estimated_time: "4 hours"
    data_center_failure:
      description: "Primary data center unavailable"
      steps:
        - "Activate disaster recovery site"
        - "Update DNS to DR site"
        - "Restore from offsite backup"
        - "Verify all integrations"
        - "Notify all stakeholders"
      estimated_time: "8 hours"
  # 4. Communication plan
  communication:
    internal:
      - channel: "Slack #jenkins-incidents"
        template: "incident_notification.md"
      - channel: "Email"
        recipients: ["dev-team@example.com"]
    external:
      - channel: "Status Page"
        url: "https://status.example.com"
      - channel: "Customer Email"
        template: "customer_notification.md"
  # 5. Verification checklist
  verification_checklist:
    - "Jenkins web UI accessible"
    - "All agents connected"
    - "Jobs can be triggered"
    - "SCM integration working"
    - "Notifications working"
    - "Plugins loaded correctly"
    - "Credentials accessible"
    - "Recent builds visible"
  # 6. Post-incident review
  post_incident:
    timeline: "Within 24 hours"
    participants: ["Response Team", "Management"]
    deliverables:
      - "Incident timeline"
      - "Root cause analysis"
      - "Action items"
      - "Updated runbooks"
7.3 恢复验证脚本
/**
 * Jenkins recovery verification script.
 * Run it in the Script Console after a restore. Each check prints PASS, FAIL
 * or WARN; every problem found is collected and listed in the final summary.
 */
import jenkins.model.Jenkins
import hudson.model.Job
import java.math.RoundingMode

def verifyRecovery() {
    def jenkins = Jenkins.getInstance()
    def issues = []
    // Rounds to two decimals without relying on BigDecimal.round(int), which
    // is not available in the Groovy 2.4 runtime.
    def round2 = { Number value ->
        new BigDecimal(value.toString()).setScale(2, RoundingMode.HALF_UP)
    }

    println "=" * 60
    println "Jenkins Recovery Verification"
    println "=" * 60
    println ""

    // ==================== 1. Basic service check ====================
    println "1. Basic Service Check..."
    if (jenkins.rootUrl == null) {
        // Without a configured root URL there is no address to probe.
        issues << "Jenkins root URL is not configured"
        println " [FAIL] Jenkins root URL is not configured"
    } else {
        try {
            def url = new URL("${jenkins.rootUrl}api/json")
            def connection = url.openConnection()
            connection.requestMethod = "GET"
            connection.connectTimeout = 5000
            // Also bound the wait for the response itself.
            connection.readTimeout = 5000
            // NOTE(review): the request is sent without credentials; an
            // instance that denies anonymous read access presumably answers
            // with a non-200 status here - confirm before treating as fatal.
            if (connection.responseCode == 200) {
                println " [PASS] API is responding"
            } else {
                issues << "API returned ${connection.responseCode}"
                println " [FAIL] API returned ${connection.responseCode}"
            }
        } catch (Exception e) {
            issues << "API check failed: ${e.message}"
            println " [FAIL] API check failed: ${e.message}"
        }
    }

    // ==================== 2. Job check ====================
    println "2. Job Check..."
    // getAllItems descends into folders, so folders themselves are not
    // miscounted as jobs without builds.
    def jobs = jenkins.getAllItems(Job)
    def totalJobs = jobs.size()
    def validJobs = jobs.count { it.lastBuild != null }
    println " Total Jobs: ${totalJobs}"
    println " Jobs with builds: ${validJobs}"
    if (validJobs < totalJobs * 0.9) {
        issues << "Some jobs may be corrupted"
    }

    // ==================== 3. Agent check ====================
    println "3. Agent Check..."
    def computers = jenkins.computers
    def onlineAgents = computers.count { !it.offline }
    def totalAgents = computers.size()
    println " Total Agents: ${totalAgents}"
    println " Online Agents: ${onlineAgents}"
    if (onlineAgents < totalAgents * 0.8) {
        issues << "Many agents are offline"
    }

    // ==================== 4. Credential check ====================
    println "4. Credential Check..."
    try {
        def credentialsProvider = jenkins.getExtensionList(
            'com.cloudbees.plugins.credentials.SystemCredentialsProvider'
        )[0]
        def credentials = credentialsProvider.credentials
        println " Credentials count: ${credentials.size()}"
        println " [PASS] Credentials accessible"
    } catch (Exception e) {
        issues << "Credentials check failed: ${e.message}"
        println " [FAIL] Credentials check failed"
    }

    // ==================== 5. Plugin check ====================
    println "5. Plugin Check..."
    def pluginManager = jenkins.pluginManager
    def failedPlugins = pluginManager.plugins.findAll { !it.active }
    if (failedPlugins.isEmpty()) {
        println " [PASS] All plugins active"
    } else {
        issues << "Failed plugins: ${failedPlugins.collect { it.shortName }}"
        println " [FAIL] ${failedPlugins.size()} plugins failed"
    }

    // ==================== 6. SCM check ====================
    println "6. SCM Integration Check..."
    // Ask the plugin manager instead of loading a plugin class by name, which
    // depends on the class loader the script happens to run with.
    def gitPlugin = pluginManager.getPlugin('git')
    if (gitPlugin != null && gitPlugin.active) {
        println " [PASS] Git plugin loaded"
    } else {
        issues << "Git plugin not loaded"
        println " [FAIL] Git plugin not loaded"
    }

    // ==================== 7. Queue check ====================
    println "7. Queue Check..."
    def queueSize = jenkins.queue.items.size()
    if (queueSize < 50) {
        println " [PASS] Queue size normal: ${queueSize}"
    } else {
        issues << "Queue is backing up: ${queueSize} items"
        println " [WARN] Queue size: ${queueSize}"
    }

    // ==================== 8. Disk check ====================
    println "8. Disk Check..."
    def jenkinsHome = jenkins.rootDir
    def freeSpace = round2(jenkinsHome.freeSpace / 1024 / 1024 / 1024)
    if (freeSpace > 10) {
        println " [PASS] Free space: ${freeSpace} GB"
    } else {
        issues << "Low disk space: ${freeSpace} GB"
        println " [WARN] Free space: ${freeSpace} GB"
    }

    // ==================== Summary ====================
    println ""
    println "=" * 60
    println "Verification Summary"
    println "=" * 60
    if (issues.isEmpty()) {
        println "All checks passed. Recovery successful!"
    } else {
        println "Issues found:"
        issues.each { println " - ${it}" }
        println ""
        println "Action required: Review and resolve the above issues"
    }
}

verifyRecovery()
总结
本文深入讲解了Jenkins备份恢复与故障排查,涵盖以下关键知识点:
| 主题 | 核心要点 |
|---|---|
| 备份策略 | 数据分类、备份类型、保留策略 |
| 备份实施 | ThinBackup、自定义脚本、定时配置 |
| 恢复流程 | 完整恢复、部分恢复、验证流程 |
| 高可用 | 主备架构、Kubernetes部署、自动切换 |
| 故障诊断 | 故障分类、诊断脚本、分析流程 |
| 排查工具 | 日志分析、线程Dump、监控工具 |
| 灾难恢复 | 恢复计划、通信流程、验证清单 |
AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。
更多推荐

所有评论(0)