Jenkins备份恢复与故障排查:构建可靠的CI/CD基础设施

1. 备份策略设计

1.1 备份架构

备份策略架构

备份计划

每日增量

每周完整

每月归档

数据类型

配置文件
config.xml

任务定义
jobs/

构建历史
builds/

凭证数据
credentials.xml

插件配置
plugins/

用户数据
users/

备份类型

完整备份

增量备份

差异备份

存储位置

本地存储

网络存储
NFS/S3

异地备份

1.2 备份范围规划

数据类型 重要性 备份频率 保留周期 恢复优先级
config.xml 每次变更 永久 P0
credentials.xml 每次变更 90天 P0
jobs/ 每日增量 30天 P1
plugins/ 每周 90天 P2
builds/ 每日增量 30天 P3
users/ 每日 90天 P2
workspace/ 不备份 - P4

1.3 备份策略矩阵

# 备份策略配置

# Declarative description of what is backed up, how often, and for how long.
# NOTE(review): some values here differ from the planning table in section 1.2
# (e.g. jobs are "hourly" here vs. daily there, build history 14 vs. 30 days,
# credentials.xml permanent vs. 90 days) — confirm which is authoritative.
backup_strategy:
  
  # Critical configuration backup
  critical_config:
    frequency: "on_change"  # back up whenever a change is made
    retention: "permanent"  # keep forever
    encryption: true
    components:
      - "config.xml"
      - "credentials.xml"
      - "jenkins.model.JenkinsLocationConfiguration.xml"
      - "hudson.model.UpdateCenter.xml"
  
  # Job definition backup
  job_definitions:
    frequency: "hourly"
    retention: "30_days"
    encryption: true
    components:
      - "jobs/*/config.xml"
      - "jobs/*/nextBuildNumber"
  
  # Build history backup
  build_history:
    frequency: "daily"
    retention: "14_days"
    encryption: false
    components:
      - "jobs/*/builds/*/build.xml"
      - "jobs/*/builds/*/log"
      - "jobs/*/builds/*/changelog.xml"
  
  # Plugin backup
  plugins:
    frequency: "weekly"
    retention: "90_days"
    encryption: false
    components:
      - "plugins/*.jpi"
      - "plugins/*.jpi.pinned"
      - "plugins/*.jpi.disabled"
  
  # Full backup
  full_backup:
    frequency: "weekly"
    retention: "90_days"
    encryption: true
    compression: true
    components:
      - "entire JENKINS_HOME"

2. 备份方案实施

2.1 使用ThinBackup插件

# ThinBackup插件配置
# 通过JCasC配置
# NOTE(review): plugin settings are usually exposed by JCasC under
# `unclassified:` rather than `jenkins:` — verify the exact section and key
# names against an export from a running instance before applying this.
jenkins:
  thinBackup:
    # Backup directory
    backupPath: "/backup/jenkins"
    
    # Full backup schedule
    fullBackupSchedule: "H 2 * * 0"  # every Sunday during the 02:00 hour
    
    # Differential backup schedule
    differentialBackupSchedule: "H 2 * * 1-6"  # Monday to Saturday during the 02:00 hour
    
    # Backup retention policy
    backupMaxDaysInStorageAge: 90  # keep for 90 days
    backupMaxDaysInStorageAgeFull: 180  # keep full backups for 180 days
    nrMaxStoredFull: 10  # keep at most 10 full backups
    
    # Backup content
    backupBuildResults: true
    backupBuildArchive: true
    backupConfigFiles: true
    backupGlobalConfiguration: true
    backupUserContents: false
    backupNextBuildNumber: true
    
    # Other options
    quietMode: true  # enter quiet mode while backing up
    moveOldBackupsToZip: true  # compress old backups
    cleanupDisabled: false

2.2 自定义备份脚本

#!/bin/bash
# Jenkins backup script
# File: /opt/jenkins/scripts/backup.sh
#
# Stages selected parts of JENKINS_HOME under BACKUP_DIR, archives them,
# optionally encrypts and uploads the archive, and prunes old backups.

# Abort on the first failing command that is not part of a condition.
set -e

# ==================== Configuration ====================
# JENKINS_HOME and BACKUP_DIR may be overridden from the environment.
JENKINS_HOME="${JENKINS_HOME:-/var/lib/jenkins}"
BACKUP_DIR="${BACKUP_DIR:-/backup/jenkins}"
# Timestamp that makes each run's staging directory and archive name unique.
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_NAME="jenkins_backup_${DATE}"
# Staging directory; it is archived as ${BACKUP_NAME}.tar.gz by compress_backup.
BACKUP_PATH="${BACKUP_DIR}/${BACKUP_NAME}"
# Age threshold, in days, used by cleanup_old_backups.
RETENTION_DAYS=30
# Destination prefix for the optional S3 upload.
S3_BUCKET="s3://jenkins-backups"

# 日志函数
# Print a message to stdout prefixed with the current local timestamp.
# Arguments: $1 - message text
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "${stamp}" "$1"
}

# ==================== 准备备份 ====================
# Create the staging directory for this backup run.
# Globals: BACKUP_PATH (read)
prepare_backup() {
    log "Preparing backup..."

    # Staging area that every later step copies into.
    mkdir -p -- "${BACKUP_PATH}"

    # Optional: put Jenkins into quiet mode first.
    # curl -X POST "http://localhost:8080/quietDown"

    # Optional: wait for running builds to finish.
    # wait_for_builds
}

# ==================== 备份配置 ====================
# Copy the core configuration into the staging directory.
# config.xml is mandatory (its copy is not guarded); every other item is
# copied on a best-effort basis and silently skipped when absent.
# Globals: JENKINS_HOME (read), BACKUP_PATH (read)
backup_config() {
    local item

    log "Backing up configuration..."

    # Main configuration file.
    cp "${JENKINS_HOME}/config.xml" "${BACKUP_PATH}/"

    # Optional top-level configuration files.
    for item in \
        credentials.xml \
        hudson.model.UpdateCenter.xml \
        jenkins.model.JenkinsLocationConfiguration.xml; do
        cp "${JENKINS_HOME}/${item}" "${BACKUP_PATH}/" 2>/dev/null || true
    done

    # secrets/ holds the key material (important!); users/ holds per-user config.
    for item in secrets users; do
        cp -r "${JENKINS_HOME}/${item}" "${BACKUP_PATH}/" 2>/dev/null || true
    done
}

# ==================== 备份任务 ====================
# Back up every job definition (config.xml and nextBuildNumber) and, when
# BACKUP_BUILDS=true, build.xml and log of the 10 highest-numbered builds.
#
# The path of each job relative to ${JENKINS_HOME}/jobs is preserved, so jobs
# nested in folders (jobs/<folder>/jobs/<name>) no longer overwrite each other
# the way they did when only the directory basename was used.
# Globals: JENKINS_HOME (read), BACKUP_PATH (read), BACKUP_BUILDS (read)
backup_jobs() {
    local jobs_root="${JENKINS_HOME}/jobs"
    local config job_dir job_rel dest build build_path

    log "Backing up jobs..."

    mkdir -p "${BACKUP_PATH}/jobs"

    # Nothing to copy when the instance has no jobs directory.
    [ -d "${jobs_root}" ] || return 0

    # NUL-delimited traversal keeps names with spaces or backslashes intact.
    while IFS= read -r -d '' config; do
        job_dir=${config%/*}
        job_rel=${job_dir#"${jobs_root}/"}
        dest="${BACKUP_PATH}/jobs/${job_rel}"

        mkdir -p "${dest}"

        # Job configuration and build counter.
        cp "${config}" "${dest}/"
        cp "${job_dir}/nextBuildNumber" "${dest}/" 2>/dev/null || true

        # Recent build history (optional).
        if [ "${BACKUP_BUILDS:-false}" = "true" ] && [ -d "${job_dir}/builds" ]; then
            while IFS= read -r build; do
                mkdir -p "${dest}/builds/${build}"
                cp "${job_dir}/builds/${build}/build.xml" "${dest}/builds/${build}/" 2>/dev/null || true
                cp "${job_dir}/builds/${build}/log" "${dest}/builds/${build}/" 2>/dev/null || true
            done < <(
                # Only numerically named directories are builds; this skips
                # entries such as permalinks or lastSuccessfulBuild.
                for build_path in "${job_dir}/builds"/*/; do
                    build=${build_path%/}
                    build=${build##*/}
                    [[ "${build}" =~ ^[0-9]+$ ]] && printf '%s\n' "${build}"
                done | sort -rn | head -n 10
            )
        fi
    done < <(find "${jobs_root}" -mindepth 2 -name "config.xml" -type f -print0)
}

# ==================== 备份插件 ====================
# Record the installed plugin list and, when BACKUP_PLUGINS=true, copy the
# plugin archives themselves.
# Globals: JENKINS_HOME (read), BACKUP_PATH (read), BACKUP_PLUGINS (read)
backup_plugins() {
    local plugin_src="${JENKINS_HOME}/plugins"
    local plugin_dst="${BACKUP_PATH}/plugins"

    log "Backing up plugins..."

    mkdir -p "${plugin_dst}"

    # Directory listing of the plugins folder, always written.
    ls "${plugin_src}" > "${plugin_dst}/plugin_list.txt"

    # Plugin binaries are optional because they can be large.
    if [ "${BACKUP_PLUGINS:-false}" != "true" ]; then
        return 0
    fi

    cp "${plugin_src}/"*.jpi "${plugin_dst}/" 2>/dev/null || true
    cp "${plugin_src}/"*.jpi.pinned "${plugin_dst}/" 2>/dev/null || true
}

# ==================== 备份视图 ====================
# Copy every regular *.xml file located directly in JENKINS_HOME into the
# staging directory. Despite the name this is not limited to view settings:
# all top-level XML configuration files are included.
#
# File names are read NUL-delimited so that names containing backslashes or
# leading/trailing blanks are copied correctly.
# Globals: JENKINS_HOME (read), BACKUP_PATH (read)
backup_views() {
    local xml

    log "Backing up views..."

    while IFS= read -r -d '' xml; do
        cp -- "${xml}" "${BACKUP_PATH}/"
    done < <(find "${JENKINS_HOME}" -maxdepth 1 -name "*.xml" -type f -print0)
}

# ==================== 创建备份元数据 ====================
# Write backup_metadata.json into the staging directory.
#
# All values are computed before the file is opened, with every path quoted.
# The checksum is taken over the sorted list of per-file SHA-256 lines using
# paths relative to BACKUP_PATH and excluding the metadata file itself, so it
# can be recomputed later from inside an extracted copy of the backup:
#   find . -type f ! -name backup_metadata.json -print0 | LC_ALL=C sort -z \
#     | xargs -0 -r sha256sum | sha256sum
# Globals: JENKINS_HOME (read), BACKUP_PATH (read), BACKUP_TYPE (read)
create_metadata() {
    local meta_file="${BACKUP_PATH}/backup_metadata.json"
    local version size checksum

    log "Creating backup metadata..."

    # First <version> element of config.xml; empty when it cannot be read.
    version=$(sed -n 's:.*<version>\([^<]*\)</version>.*:\1:p' \
        "${JENKINS_HOME}/config.xml" 2>/dev/null | head -n 1) || true

    size=$(du -sh "${BACKUP_PATH}" | cut -f1)

    checksum=$(
        cd "${BACKUP_PATH}" &&
            find . -type f ! -name backup_metadata.json -print0 |
            LC_ALL=C sort -z |
            xargs -0 -r sha256sum |
            sha256sum | cut -d' ' -f1
    )

    cat > "${meta_file}" << EOF
{
    "backup_date": "$(date -Iseconds)",
    "jenkins_version": "${version}",
    "backup_type": "${BACKUP_TYPE:-full}",
    "hostname": "$(hostname)",
    "jenkins_home": "${JENKINS_HOME}",
    "backup_size": "${size}",
    "checksum": "${checksum}"
}
EOF
}

# ==================== 压缩备份 ====================
# Pack the staging directory into ${BACKUP_DIR}/${BACKUP_NAME}.tar.gz and
# remove the staging directory afterwards.
#
# tar's -C option is used instead of `cd`, so the working directory of the
# calling script is left unchanged. The staging directory is removed only
# after the archive was written, using an absolute, guarded path.
# Globals: BACKUP_DIR (read), BACKUP_NAME (read)
# Returns: 0 on success, 1 when the archive could not be created
compress_backup() {
    local archive="${BACKUP_DIR}/${BACKUP_NAME}.tar.gz"

    log "Compressing backup..."

    if ! tar -czf "${archive}" -C "${BACKUP_DIR}" "${BACKUP_NAME}"; then
        log "ERROR: Failed to create archive: ${archive}"
        rm -f -- "${archive}"
        return 1
    fi

    # ${VAR:?} aborts instead of expanding to a dangerous path when empty.
    rm -rf -- "${BACKUP_DIR:?}/${BACKUP_NAME:?}"

    log "Backup created: ${BACKUP_DIR}/${BACKUP_NAME}.tar.gz"
}

# ==================== 加密备份 ====================
# Symmetrically encrypt the archive with GnuPG when ENCRYPT_BACKUP=true.
#
# Fixes over the previous version:
#   * the no-op `mv X.gpg X.gpg` is gone; mv rejects moving a file onto
#     itself, which made every encrypted run abort under `set -e`;
#   * gpg runs with --batch/--pinentry-mode loopback so that
#     --passphrase-file is honoured without a TTY (e.g. from cron);
#   * the plaintext archive is removed only after gpg succeeded.
# Globals: ENCRYPT_BACKUP, ENCRYPTION_KEY_FILE, BACKUP_DIR, BACKUP_NAME (read)
# Returns: 0 when disabled or encrypted successfully, 1 on failure
encrypt_backup() {
    local plain="${BACKUP_DIR}/${BACKUP_NAME}.tar.gz"
    local encrypted="${plain}.gpg"

    if [ "${ENCRYPT_BACKUP:-false}" != "true" ]; then
        return 0
    fi

    log "Encrypting backup..."

    if [ ! -r "${ENCRYPTION_KEY_FILE:-}" ]; then
        log "ERROR: ENCRYPTION_KEY_FILE is not set or not readable"
        return 1
    fi

    # NOTE(review): --pinentry-mode requires GnuPG 2.1 or newer.
    if ! gpg --batch --yes --pinentry-mode loopback \
        --symmetric --cipher-algo AES256 \
        --passphrase-file "${ENCRYPTION_KEY_FILE}" \
        --output "${encrypted}" \
        "${plain}"; then
        log "ERROR: Encryption failed; unencrypted archive kept at ${plain}"
        rm -f -- "${encrypted}"
        return 1
    fi

    rm -f -- "${plain}"

    log "Backup encrypted: ${BACKUP_DIR}/${BACKUP_NAME}.tar.gz.gpg"
}

# ==================== 上传到远程存储 ====================
# Copy the finished archive (plain or .gpg) to remote storage.
# The S3 upload is skipped silently when the aws CLI is not installed.
# Globals: BACKUP_DIR (read), BACKUP_NAME (read), S3_BUCKET (read)
upload_to_remote() {
    log "Uploading to remote storage..."

    # S3 upload, keyed by year/month/day.
    command -v aws &> /dev/null || return 0

    aws s3 cp "${BACKUP_DIR}/${BACKUP_NAME}.tar.gz"* "${S3_BUCKET}/$(date +%Y/%m/%d)/"
    log "Uploaded to S3: ${S3_BUCKET}/$(date +%Y/%m/%d)/"

    # Alternative: rsync to a backup host.
    # rsync -avz "${BACKUP_DIR}/${BACKUP_NAME}.tar.gz"* backup-server:/backup/jenkins/
}

# ==================== 清理旧备份 ====================
# Delete backups older than RETENTION_DAYS, locally and (when the aws CLI is
# available) in S3.
#
# S3 listing lines have the form "<date> <time> <size> <key>". The key is
# rebuilt from everything after the size column, so keys containing spaces are
# handled, and the cutoff date is passed to awk with -v rather than spliced
# into the program text. Only objects whose key matches the backup naming
# scheme are removed, mirroring the pattern used for the local cleanup.
# Globals: BACKUP_DIR (read), RETENTION_DAYS (read), S3_BUCKET (read)
cleanup_old_backups() {
    local cutoff key

    log "Cleaning up old backups..."

    # Local backups.
    find "${BACKUP_DIR}" -type f -name "jenkins_backup_*.tar.gz*" \
        -mtime "+${RETENTION_DAYS}" -delete

    # S3 backups.
    if command -v aws &> /dev/null; then
        # GNU date syntax, as used elsewhere in this script.
        cutoff=$(date -d "-${RETENTION_DAYS} days" +%Y-%m-%d)

        aws s3 ls "${S3_BUCKET}/" --recursive |
            awk -v cutoff="${cutoff}" '
                $1 < cutoff {
                    sub(/^[^ ]+ +[^ ]+ +[0-9]+ /, "")
                    if ($0 ~ /jenkins_backup_.*\.tar\.gz/) print
                }' |
            while IFS= read -r key; do
                aws s3 rm "${S3_BUCKET}/${key}"
            done
    fi
}

# ==================== 验证备份 ====================
# Check that the archive produced by this run can be listed by tar.
#
# When only the encrypted form (.tar.gz.gpg) exists, it is decrypted on the
# fly and piped into tar, so verification also works for encrypted backups
# instead of reporting "file not found".
# Globals: BACKUP_DIR, BACKUP_NAME, ENCRYPTION_KEY_FILE (read)
# Returns: 0 when the archive is readable, 1 otherwise
verify_backup() {
    log "Verifying backup..."

    local backup_file="${BACKUP_DIR}/${BACKUP_NAME}.tar.gz"
    local encrypted_file="${backup_file}.gpg"
    local ok=1

    if [ -f "${backup_file}" ]; then
        # Plain archive: list its contents.
        if tar -tzf "${backup_file}" > /dev/null 2>&1; then
            ok=0
        fi
    elif [ -f "${encrypted_file}" ]; then
        # Encrypted archive: pipefail (scoped to the subshell) makes a gpg
        # failure count even if tar happens to exit cleanly.
        if (
            set -o pipefail
            gpg --batch --quiet --pinentry-mode loopback \
                --passphrase-file "${ENCRYPTION_KEY_FILE:-}" \
                --decrypt "${encrypted_file}" 2> /dev/null |
                tar -tzf - > /dev/null 2>&1
        ); then
            ok=0
        fi
    else
        log "Backup file not found: ${backup_file}"
        return 1
    fi

    if [ "${ok}" -eq 0 ]; then
        log "Backup verification: PASSED"
        return 0
    fi

    log "Backup verification: FAILED"
    return 1
}

# ==================== 主函数 ====================
# Run the complete backup workflow.
#
# The archive is verified immediately after compression — before it is
# encrypted, uploaded, or allowed to trigger retention cleanup — so a corrupt
# archive is never shipped and never causes older good backups to be pruned.
# Verifying at this point also works for encrypted runs, because the plain
# .tar.gz still exists.
#
# The other steps are deliberately called bare: they rely on the script's
# `set -e`, which bash ignores inside functions invoked from `if`, `||`, `&&`.
main() {
    log "Starting Jenkins backup..."

    prepare_backup
    backup_config
    backup_jobs
    backup_plugins
    backup_views
    create_metadata
    compress_backup

    if ! verify_backup; then
        log "ERROR: Backup verification failed; skipping encryption, upload and cleanup"
        return 1
    fi

    encrypt_backup
    upload_to_remote
    cleanup_old_backups

    log "Backup completed successfully!"
}

# 执行备份
main "$@"

2.3 定时备份配置

# crontab configuration
# File: /etc/cron.d/jenkins-backup
# Fields: minute hour day-of-month month day-of-week user command
#
# NOTE(review): backup.sh as shown only records BACKUP_TYPE in
# backup_metadata.json; it does not change what gets copied, so the
# "incremental" run presumably differs from the full run only through the
# BACKUP_BUILDS/BACKUP_PLUGINS switches — confirm this is intended.

# Run the incremental backup every day at 02:00
0 2 * * * jenkins BACKUP_TYPE=incremental /opt/jenkins/scripts/backup.sh >> /var/log/jenkins/backup.log 2>&1

# Run the full backup every Sunday at 03:00
0 3 * * 0 jenkins BACKUP_TYPE=full BACKUP_BUILDS=true BACKUP_PLUGINS=true /opt/jenkins/scripts/backup.sh >> /var/log/jenkins/backup.log 2>&1

# Back up the critical configuration every hour
# NOTE(review): backup-critical.sh is not shown in this document.
0 * * * * jenkins BACKUP_TYPE=critical /opt/jenkins/scripts/backup-critical.sh >> /var/log/jenkins/backup-critical.log 2>&1

3. 恢复流程详解

3.1 恢复流程架构

恢复流程

配置损坏

数据丢失

完全故障

故障发生

评估影响

配置恢复

数据恢复

完整恢复

停止Jenkins

恢复备份

验证数据

启动Jenkins

验证功能

恢复完成

3.2 恢复脚本

#!/bin/bash
# Jenkins恢复脚本
# 文件: /opt/jenkins/scripts/restore.sh

set -e

# ==================== 配置 ====================
JENKINS_HOME="${JENKINS_HOME:-/var/lib/jenkins}"
BACKUP_DIR="${BACKUP_DIR:-/backup/jenkins}"
RESTORE_LOG="/var/log/jenkins/restore.log"

# 日志函数
# Print a timestamped message to stdout and append it to RESTORE_LOG.
# Arguments: $1 - message text
# Globals:   RESTORE_LOG (read)
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "${RESTORE_LOG}"
}

# ==================== 检查备份文件 ====================
# Validate the backup file before anything destructive happens.
# Exits the script with status 1 when the file is missing or unreadable.
#
# Encrypted backups (*.gpg) are decrypted on the fly and piped into tar for
# the integrity check; previously they were always reported as corrupted even
# though restore_backup knows how to decrypt them.
# Arguments: $1 - path to a .tar.gz or .tar.gz.gpg backup
# Globals:   ENCRYPTION_KEY_FILE (read, encrypted backups only)
check_backup() {
    local backup_file=$1

    if [ ! -f "${backup_file}" ]; then
        log "ERROR: Backup file not found: ${backup_file}"
        exit 1
    fi

    # Integrity check.
    log "Verifying backup integrity..."
    if [[ "${backup_file}" == *.gpg ]]; then
        if [ ! -r "${ENCRYPTION_KEY_FILE:-}" ]; then
            log "ERROR: ENCRYPTION_KEY_FILE is not set or not readable"
            exit 1
        fi
        # pipefail (scoped to the subshell) surfaces a gpg failure.
        if ! (
            set -o pipefail
            gpg --batch --quiet --pinentry-mode loopback \
                --passphrase-file "${ENCRYPTION_KEY_FILE}" \
                --decrypt "${backup_file}" 2> /dev/null |
                tar -tzf - > /dev/null 2>&1
        ); then
            log "ERROR: Backup file is corrupted"
            exit 1
        fi
    elif ! tar -tzf "${backup_file}" > /dev/null 2>&1; then
        log "ERROR: Backup file is corrupted"
        exit 1
    fi

    log "Backup verification passed"
}

# ==================== 停止Jenkins ====================
# Stop the jenkins systemd unit if it is currently active.
stop_jenkins() {
    log "Stopping Jenkins..."

    # Nothing to stop when the unit is not active.
    if ! systemctl is-active --quiet jenkins; then
        log "Jenkins is not running"
        return 0
    fi

    systemctl stop jenkins
    log "Jenkins stopped"
}

# ==================== 备份当前状态 ====================
# Snapshot the whole current JENKINS_HOME into BACKUP_DIR before it is
# overwritten, as pre_restore_<timestamp>.tar.gz. The archive stores paths
# relative to JENKINS_HOME ("./...").
# Globals: JENKINS_HOME (read), BACKUP_DIR (read)
backup_current_state() {
    local snapshot

    log "Backing up current state..."

    snapshot="${BACKUP_DIR}/pre_restore_$(date +%Y%m%d_%H%M%S).tar.gz"
    tar -czf "${snapshot}" -C "${JENKINS_HOME}" .

    log "Current state backed up to: ${snapshot}"
}

# ==================== 恢复备份 ====================
# Replace the contents of JENKINS_HOME with the contents of a backup archive.
#
# Changes compared with the previous version:
#   * The archive is decrypted/extracted into a staging directory first, so
#     JENKINS_HOME is only wiped once extraction is known to have worked.
#   * Archives written by backup.sh contain a single top-level
#     jenkins_backup_<date>/ directory while pre_restore_* archives are flat;
#     both layouts are detected so files land directly in JENKINS_HOME.
#   * Hidden files are removed together with everything else.
#   * Permissions no longer mark every file executable and world-readable:
#     secrets/ and credentials.xml are restricted to the owner.
#   * The decrypted archive lives in the staging directory and is deleted.
# Arguments: $1 - path to a .tar.gz or .tar.gz.gpg backup
# Globals:   JENKINS_HOME, BACKUP_DIR, ENCRYPTION_KEY_FILE (read)
# Returns:   0 on success, 1 on failure
restore_backup() {
    local backup_file=$1
    local staging archive extract_dir root candidate

    log "Restoring from backup: ${backup_file}"

    # Stage next to the backups; /tmp is often too small for a full archive.
    if ! staging=$(mktemp -d "${BACKUP_DIR:-/tmp}/restore_staging.XXXXXX"); then
        log "ERROR: Unable to create staging directory"
        return 1
    fi
    archive="${backup_file}"
    extract_dir="${staging}/extract"
    mkdir -p "${extract_dir}"

    # Decrypt (if needed).
    if [[ "${backup_file}" == *.gpg ]]; then
        log "Decrypting backup..."
        archive="${staging}/backup.tar.gz"
        if ! gpg --batch --yes --pinentry-mode loopback \
            --passphrase-file "${ENCRYPTION_KEY_FILE:-}" \
            --output "${archive}" --decrypt "${backup_file}"; then
            log "ERROR: Failed to decrypt backup"
            rm -rf -- "${staging}"
            return 1
        fi
    fi

    # Extract the backup.
    log "Extracting backup..."
    if ! tar -xzf "${archive}" -C "${extract_dir}"; then
        log "ERROR: Failed to extract backup; JENKINS_HOME was not modified"
        rm -rf -- "${staging}"
        return 1
    fi

    # Locate the directory that actually holds config.xml.
    root="${extract_dir}"
    if [ ! -e "${root}/config.xml" ]; then
        for candidate in "${extract_dir}"/*/; do
            if [ -e "${candidate}config.xml" ]; then
                root="${candidate%/}"
                break
            fi
        done
    fi

    # Empty JENKINS_HOME, dotfiles included.
    log "Clearing JENKINS_HOME..."
    find "${JENKINS_HOME:?}" -mindepth 1 -delete

    if ! cp -a "${root}/." "${JENKINS_HOME}/"; then
        log "ERROR: Failed to copy restored files into ${JENKINS_HOME}"
        rm -rf -- "${staging}"
        return 1
    fi
    rm -rf -- "${staging}"

    # Set ownership and permissions.
    log "Setting permissions..."
    chown -R jenkins:jenkins "${JENKINS_HOME}"
    # X: search bit for directories, execute only where it was already set.
    chmod -R u=rwX,go=rX "${JENKINS_HOME}"
    if [ -d "${JENKINS_HOME}/secrets" ]; then
        chmod -R go-rwx "${JENKINS_HOME}/secrets"
    fi
    if [ -f "${JENKINS_HOME}/credentials.xml" ]; then
        chmod 600 "${JENKINS_HOME}/credentials.xml"
    fi

    log "Restore completed"
}

# ==================== 验证恢复 ====================
# Confirm that the files Jenkins needs are present after the restore.
# Globals: JENKINS_HOME (read)
# Returns: 0 when all required files exist, 1 at the first missing one
verify_restore() {
    local relative

    log "Verifying restore..."

    # Key files, relative to JENKINS_HOME.
    for relative in \
        config.xml \
        secrets/master.key \
        secrets/hudson.util.Secret; do
        [ -f "${JENKINS_HOME}/${relative}" ] && continue
        log "ERROR: Required file missing: ${JENKINS_HOME}/${relative}"
        return 1
    done

    log "Restore verification passed"
}

# ==================== 启动Jenkins ====================
# Start the jenkins unit and poll the login page until it answers with
# HTTP 200, checking every 5 seconds for up to 120 seconds.
# Returns: 0 once Jenkins responds, 1 on timeout
start_jenkins() {
    local max_wait=120
    local elapsed=0

    log "Starting Jenkins..."

    systemctl start jenkins

    # Poll until the deadline is reached.
    until [ "${elapsed}" -ge "${max_wait}" ]; do
        if curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/login" | grep -q "200"; then
            log "Jenkins is up and running"
            return 0
        fi

        sleep 5
        elapsed=$((elapsed + 5))
        log "Waiting for Jenkins to start... (${elapsed}s)"
    done

    log "ERROR: Jenkins failed to start within ${max_wait}s"
    return 1
}

# ==================== 验证功能 ====================
# Smoke-test the restored instance through its HTTP API: status of /api/json,
# number of jobs, and number of plugins. Findings are only logged; a failing
# probe does not make this function fail.
verify_functionality() {
    local status jobs_total plugins_total

    log "Verifying Jenkins functionality..."

    # API response code.
    status=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8080/api/json") || true
    case "${status}" in
        200) log "API check: PASSED" ;;
        *) log "WARNING: API check failed with status ${status}" ;;
    esac

    # Job count.
    jobs_total=$(curl -s "http://localhost:8080/api/json?tree=jobs[name]" | jq '.jobs | length') || true
    log "Jobs restored: ${jobs_total}"

    # Plugin count.
    plugins_total=$(curl -s "http://localhost:8080/pluginManager/api/json?depth=1" | jq '.plugins | length') || true
    log "Plugins loaded: ${plugins_total}"
}

# ==================== 主函数 ====================
# Run the complete restore workflow.
# Arguments: $1 - backup file to restore (optional; prompted for when empty)
# Returns:   1 when no backup file was supplied or entered
#
# The prompt now uses `read -r` (no backslash mangling) and an empty answer or
# end-of-input stops the run with a clear error instead of carrying an empty
# file name into the destructive steps. The steps themselves are called bare
# on purpose: they rely on the script's `set -e`.
main() {
    local backup_file=${1:-}

    if [ -z "${backup_file}" ]; then
        # Show the ten most recent backups.
        log "Available backups:"
        ls -lt "${BACKUP_DIR}"/jenkins_backup_*.tar.gz* 2>/dev/null | head -10
        echo ""
        read -r -p "Enter backup file to restore: " backup_file || true
    fi

    if [ -z "${backup_file}" ]; then
        log "ERROR: No backup file specified"
        return 1
    fi

    log "Starting Jenkins restore from: ${backup_file}"

    check_backup "${backup_file}"
    stop_jenkins
    backup_current_state
    restore_backup "${backup_file}"
    verify_restore
    start_jenkins
    verify_functionality

    log "Restore completed successfully!"
}

# 执行恢复
main "$@"

3.3 部分恢复场景

#!/bin/bash
# Jenkins部分恢复脚本

# ==================== 恢复单个任务 ====================
# Restore the config.xml of one job from a backup archive and ask Jenkins to
# reload its configuration.
#
# Changes compared with the previous version:
#   * Both archive layouts are supported: a single top-level directory (as
#     written by backup.sh) or files at the archive root.
#   * The live job directory is no longer moved to jobs/<name>.bak. That left
#     a stray directory inside jobs/ (which Jenkins would presumably load as
#     another job), nested on a second run, and detached the build history.
#     Only the current config.xml is saved, as a timestamped copy in the job
#     directory.
#   * Failures return 1 instead of falling through with status 0.
# Arguments: $1 - job name, $2 - backup archive (.tar.gz)
# Globals:   JENKINS_HOME (read)
restore_single_job() {
    local job_name=$1
    local backup_file=$2
    local temp_dir root candidate job_src job_dst

    log "Restoring job: ${job_name}"

    if [ -z "${job_name}" ] || [ -z "${JENKINS_HOME:-}" ]; then
        log "ERROR: Job name and JENKINS_HOME must be set"
        return 1
    fi

    # Temporary extraction directory.
    temp_dir=$(mktemp -d) || return 1

    # Extract the backup.
    if ! tar -xzf "${backup_file}" -C "${temp_dir}"; then
        log "ERROR: Failed to extract backup: ${backup_file}"
        rm -rf -- "${temp_dir}"
        return 1
    fi

    # Find the directory that holds jobs/ (archive root or its only child).
    root="${temp_dir}"
    if [ ! -d "${root}/jobs" ]; then
        for candidate in "${temp_dir}"/*/; do
            if [ -d "${candidate}jobs" ]; then
                root="${candidate%/}"
                break
            fi
        done
    fi

    job_src="${root}/jobs/${job_name}"
    job_dst="${JENKINS_HOME}/jobs/${job_name}"

    if [ ! -f "${job_src}/config.xml" ]; then
        log "ERROR: Job ${job_name} not found in backup"
        rm -rf -- "${temp_dir}"
        return 1
    fi

    mkdir -p "${job_dst}"

    # Keep the configuration that is about to be replaced.
    if [ -f "${job_dst}/config.xml" ]; then
        cp -p "${job_dst}/config.xml" \
            "${job_dst}/config.xml.pre_restore_$(date +%Y%m%d_%H%M%S)"
    fi

    # Restore the configuration.
    if ! cp "${job_src}/config.xml" "${job_dst}/"; then
        log "ERROR: Failed to copy config.xml for job ${job_name}"
        rm -rf -- "${temp_dir}"
        return 1
    fi

    # Clean up the temporary directory.
    rm -rf -- "${temp_dir}"

    # Reload configuration from disk.
    # NOTE(review): a secured instance presumably needs credentials here.
    if ! curl -fsS -X POST "http://localhost:8080/reload" > /dev/null; then
        log "WARNING: Reload request failed; reload the configuration manually"
    fi

    log "Job ${job_name} restored successfully"
}

# ==================== 恢复凭证 ====================
# Restore credentials.xml and the secrets/ directory from a backup archive,
# stopping Jenkins while the files are replaced.
#
# Changes compared with the previous version:
#   * The archive is extracted and inspected before Jenkins is stopped, so an
#     unreadable archive or one without credential data causes no downtime.
#   * Both archive layouts are supported: a single top-level directory (as
#     written by backup.sh) or files at the archive root.
#   * secrets/ gets owner-only access without marking every file executable.
# Arguments: $1 - backup archive (.tar.gz)
# Globals:   JENKINS_HOME (read)
# Returns:   0 on success, 1 when nothing could be restored
restore_credentials() {
    local backup_file=$1
    local temp_dir root candidate

    log "Restoring credentials..."

    # Temporary extraction directory.
    temp_dir=$(mktemp -d) || return 1

    # Extract the backup.
    if ! tar -xzf "${backup_file}" -C "${temp_dir}"; then
        log "ERROR: Failed to extract backup: ${backup_file}"
        rm -rf -- "${temp_dir}"
        return 1
    fi

    # Find the directory holding the credential data.
    root="${temp_dir}"
    if [ ! -f "${root}/credentials.xml" ] && [ ! -d "${root}/secrets" ]; then
        for candidate in "${temp_dir}"/*/; do
            if [ -f "${candidate}credentials.xml" ] || [ -d "${candidate}secrets" ]; then
                root="${candidate%/}"
                break
            fi
        done
    fi

    if [ ! -f "${root}/credentials.xml" ] && [ ! -d "${root}/secrets" ]; then
        log "ERROR: No credentials.xml or secrets directory found in backup"
        rm -rf -- "${temp_dir}"
        return 1
    fi

    # Stop Jenkins while the files are swapped.
    systemctl stop jenkins

    # Restore the credentials file.
    if [ -f "${root}/credentials.xml" ]; then
        cp "${root}/credentials.xml" "${JENKINS_HOME}/"
        chown jenkins:jenkins "${JENKINS_HOME}/credentials.xml"
        chmod 600 "${JENKINS_HOME}/credentials.xml"
    fi

    # Restore the key material.
    if [ -d "${root}/secrets" ]; then
        cp -r "${root}/secrets" "${JENKINS_HOME}/"
        chown -R jenkins:jenkins "${JENKINS_HOME}/secrets"
        # Directories 700, regular files 600.
        chmod -R u=rwX,go= "${JENKINS_HOME}/secrets"
    fi

    # Clean up and start Jenkins again.
    rm -rf -- "${temp_dir}"
    systemctl start jenkins

    log "Credentials restored"
}

# ==================== 恢复插件 ====================
# Restore plugin archives (*.jpi and *.jpi.pinned) from a backup and restart
# Jenkins so they are loaded.
#
# Changes compared with the previous version:
#   * Both archive layouts are supported: a single top-level directory (as
#     written by backup.sh) or files at the archive root.
#   * Jenkins is restarted only when at least one plugin file was restored.
#     backup.sh stores plugin binaries only when BACKUP_PLUGINS=true, so many
#     archives contain just plugin_list.txt.
# Arguments: $1 - backup archive (.tar.gz)
# Globals:   JENKINS_HOME (read)
# Returns:   0 on success, 1 when nothing was restored
restore_plugins() {
    local backup_file=$1
    local temp_dir root candidate plugin
    local restored=0

    log "Restoring plugins..."

    # Temporary extraction directory.
    temp_dir=$(mktemp -d) || return 1

    # Extract the backup.
    if ! tar -xzf "${backup_file}" -C "${temp_dir}"; then
        log "ERROR: Failed to extract backup: ${backup_file}"
        rm -rf -- "${temp_dir}"
        return 1
    fi

    # Find the directory holding plugins/.
    root="${temp_dir}"
    if [ ! -d "${root}/plugins" ]; then
        for candidate in "${temp_dir}"/*/; do
            if [ -d "${candidate}plugins" ]; then
                root="${candidate%/}"
                break
            fi
        done
    fi

    # Restore the plugin files; unmatched globs are skipped by the -f test.
    mkdir -p "${JENKINS_HOME}/plugins"
    for plugin in "${root}/plugins/"*.jpi "${root}/plugins/"*.jpi.pinned; do
        [ -f "${plugin}" ] || continue
        if cp -- "${plugin}" "${JENKINS_HOME}/plugins/"; then
            restored=$((restored + 1))
        fi
    done

    # Clean up.
    rm -rf -- "${temp_dir}"

    if [ "${restored}" -eq 0 ]; then
        log "ERROR: No plugin files found in backup; Jenkins was not restarted"
        return 1
    fi

    chown -R jenkins:jenkins "${JENKINS_HOME}/plugins"

    # Restart Jenkins.
    systemctl restart jenkins

    log "Plugins restored"
}

4. 高可用架构

4.1 高可用架构设计

Jenkins高可用架构

Agent池

共享存储

Master集群

负载均衡层

Agent 1

HAProxy/Nginx

Master 1
Active

Master 2
Standby

NFS/GlusterFS
JENKINS_HOME

数据库
PostgreSQL

Agent 2

Agent N

健康检查

4.2 主备切换配置

# Keepalived配置
# 文件: /etc/keepalived/keepalived.conf

# Health check: run check_jenkins.sh every 5 seconds; 3 consecutive failures
# mark the check as failed, 2 consecutive successes mark it healthy again.
vrrp_script check_jenkins {
    script "/usr/local/bin/check_jenkins.sh"
    interval 5
    fall 3
    rise 2
}

# VRRP instance that owns the virtual IP of the Jenkins service.
vrrp_instance VI_1 {
    state MASTER  # set to BACKUP on the standby node
    interface eth0
    virtual_router_id 51
    priority 100  # set to 90 on the standby node
    advert_int 1
    
    # NOTE(review): keepalived reportedly uses only the first 8 characters of
    # auth_pass; "jenkins-ha" is longer — confirm and replace this sample value.
    authentication {
        auth_type PASS
        auth_pass jenkins-ha
    }
    
    # Address that follows whichever node is currently MASTER.
    virtual_ipaddress {
        192.168.1.100/24
    }
    
    # Tie this instance to the health check defined above.
    track_script {
        check_jenkins
    }
    
    # Hooks executed on state transitions.
    notify_master "/usr/local/bin/jenkins_master.sh"
    notify_backup "/usr/local/bin/jenkins_backup.sh"
}

---
# 健康检查脚本
# 文件: /usr/local/bin/check_jenkins.sh

#!/bin/bash
# Health check used by keepalived: exit 0 when Jenkins is healthy, 1 otherwise.
#
# The HTTP probe is bounded by curl timeouts. keepalived runs this check every
# 5 seconds, so a hung Jenkins must produce a prompt failure instead of a
# curl call that blocks indefinitely.

JENKINS_URL="http://localhost:8080"

# Process check.
if ! pgrep -f "jenkins.war" > /dev/null; then
    exit 1
fi

# HTTP check; on timeout or connection failure curl prints 000.
HTTP_CODE=$(curl -s -o /dev/null --connect-timeout 2 --max-time 4 \
    -w "%{http_code}" "${JENKINS_URL}/login")
if [ "${HTTP_CODE}" != "200" ]; then
    exit 1
fi

exit 0

---
# 切换为Master脚本
# 文件: /usr/local/bin/jenkins_master.sh

#!/bin/bash
# keepalived notify_master hook: promote this node to MASTER.
#
# The shared JENKINS_HOME is mounted BEFORE Jenkins is (re)started. Starting
# Jenkins first would let it run against the empty local directory that the
# NFS mount later hides. If the mount fails, Jenkins is not started at all.

JENKINS_MOUNT="/var/lib/jenkins"
mounted_now=0

logger "Jenkins HA: Switching to MASTER"

# Mount shared storage unless it is already mounted.
if ! mountpoint -q "${JENKINS_MOUNT}"; then
    if ! mount -t nfs nfs-server:/jenkins "${JENKINS_MOUNT}"; then
        logger "Jenkins HA: failed to mount shared storage; Jenkins not started"
        exit 1
    fi
    mounted_now=1
fi

# Start Jenkins; restart it when the storage was mounted underneath a
# running instance.
if [ "${mounted_now}" -eq 1 ]; then
    systemctl restart jenkins
elif ! systemctl is-active --quiet jenkins; then
    systemctl start jenkins
fi

# Send a notification (only when a webhook is configured).
if [ -n "${SLACK_WEBHOOK_URL:-}" ]; then
    curl -sS --max-time 10 -X POST -H 'Content-Type: application/json' \
        -d '{"text":"Jenkins HA: Switched to MASTER on '"$(hostname)"'"}' \
        "${SLACK_WEBHOOK_URL}" ||
        logger "Jenkins HA: failed to send MASTER notification"
fi

---
# 切换为Backup脚本
# 文件: /usr/local/bin/jenkins_backup.sh

#!/bin/bash
# keepalived notify_backup hook: demote this node to BACKUP.
#
# Jenkins is stopped first so nothing holds files open on the shared storage;
# the unmount is attempted only when the path is actually mounted, and the
# notification is sent only when a webhook is configured.

JENKINS_MOUNT="/var/lib/jenkins"

logger "Jenkins HA: Switching to BACKUP"

# Stop Jenkins.
systemctl stop jenkins

# Unmount the shared storage.
if mountpoint -q "${JENKINS_MOUNT}"; then
    umount "${JENKINS_MOUNT}" ||
        logger "Jenkins HA: failed to unmount ${JENKINS_MOUNT}"
fi

# Send a notification.
if [ -n "${SLACK_WEBHOOK_URL:-}" ]; then
    curl -sS --max-time 10 -X POST -H 'Content-Type: application/json' \
        -d '{"text":"Jenkins HA: Switched to BACKUP on '"$(hostname)"'"}' \
        "${SLACK_WEBHOOK_URL}" ||
        logger "Jenkins HA: failed to send BACKUP notification"
fi

4.3 Kubernetes高可用部署

# Kubernetes高可用Jenkins部署
# 文件: jenkins-ha.yaml

# Persistent volume claim that backs JENKINS_HOME (referenced by the
# StatefulSet below as claimName: jenkins-pvc).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: jenkins-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi
  storageClassName: standard

---
# Jenkins controller workload.
# NOTE(review): replicas is 1, so "high availability" here presumably means
# the pod is rescheduled after a failure, not that two controllers run at once.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: jenkins
  labels:
    app: jenkins
spec:
  serviceName: jenkins
  replicas: 1
  selector:
    matchLabels:
      app: jenkins
  template:
    metadata:
      labels:
        app: jenkins
    spec:
      # The "jenkins" ServiceAccount is not defined in this manifest.
      serviceAccountName: jenkins
      containers:
      - name: jenkins
        image: jenkins/jenkins:lts
        ports:
        # Web UI / API
        - containerPort: 8080
          name: web
        # Inbound agent connections
        - containerPort: 50000
          name: agent
        env:
        - name: JENKINS_HOME
          value: /var/jenkins_home
        # JVM options: skip the setup wizard and use a fixed 4 GiB heap.
        # NOTE(review): confirm that hudson.slaves.WorkspaceDir is a property
        # the Jenkins version in use actually honours.
        - name: JAVA_OPTS
          value: >-
            -Djenkins.install.runSetupWizard=false
            -Dhudson.slaves.WorkspaceDir=/var/jenkins_home/workspace
            -Xms4g
            -Xmx4g
        volumeMounts:
        - name: jenkins-home
          mountPath: /var/jenkins_home
        - name: jenkins-config
          mountPath: /var/jenkins_config
        # Both probes request /login on port 8080.
        livenessProbe:
          httpGet:
            path: /login
            port: 8080
          initialDelaySeconds: 120
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /login
            port: 8080
          initialDelaySeconds: 60
          periodSeconds: 5
        # The 4Gi memory request equals the configured JVM heap size (-Xmx4g).
        resources:
          requests:
            cpu: "2"
            memory: "4Gi"
          limits:
            cpu: "4"
            memory: "8Gi"
      volumes:
      - name: jenkins-home
        persistentVolumeClaim:
          claimName: jenkins-pvc
      # The "jenkins-config" ConfigMap is not defined in this manifest.
      - name: jenkins-config
        configMap:
          name: jenkins-config

---
# Exposes the Jenkins pod: service port 80 forwards to the web port 8080,
# and port 50000 is passed through to the agent port.
apiVersion: v1
kind: Service
metadata:
  name: jenkins
spec:
  selector:
    app: jenkins
  ports:
  - port: 80
    targetPort: 8080
    name: web
  - port: 50000
    targetPort: 50000
    name: agent
  type: LoadBalancer

---
# Disruption budget for pods labelled app=jenkins.
# NOTE(review): the StatefulSet above runs a single replica, so
# minAvailable: 1 presumably blocks every voluntary eviction of that pod
# (e.g. during a node drain) — confirm this is intended.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: jenkins-pdb
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app: jenkins

5. 常见故障诊断

5.1 故障分类

故障类型

数据故障

配置损坏

构建丢失

凭证失效

网络故障

Agent连接失败

Webhook失败

SCM连接超时

运行时故障

内存溢出

线程死锁

磁盘满

启动故障

服务无法启动

端口占用

权限问题

5.2 故障诊断流程

/**
 * Jenkins故障诊断脚本
 * 在Script Console中运行
 */

import jenkins.model.*
import hudson.model.*
import java.lang.management.*

/**
 * Prints a health report for the running Jenkins controller to the Script
 * Console: startup info, heap and GC, threads and deadlocks, disk space of
 * JENKINS_HOME, build queue, agents, plugins and jobs, followed by a list of
 * recommendations derived from those checks.
 *
 * Changes compared with the previous version:
 *  - the instance is fetched with getInstanceOrNull() so the null check is
 *    meaningful;
 *  - heap percentage is skipped when the JVM reports no heap maximum
 *    (MemoryUsage.max is -1 in that case), and the disk percentage is
 *    guarded against a zero total;
 *  - thread info is accessed null-safely (a thread may end between calls);
 *  - jobs are collected with getAllItems(Job), so jobs inside folders are
 *    counted as well.
 */
def diagnose() {
    println "=" * 60
    println "Jenkins Fault Diagnosis Report"
    println "=" * 60
    println ""
    
    // ==================== 1. Startup check ====================
    println "=== Startup Check ==="
    
    def jenkins = Jenkins.getInstanceOrNull()
    
    if (jenkins == null) {
        println "CRITICAL: Jenkins instance is null!"
        return
    }
    
    println "Jenkins Version: ${Jenkins.VERSION}"
    println "JENKINS_HOME: ${jenkins.rootDir.absolutePath}"
    println "URL: ${jenkins.rootUrl}"
    println ""
    
    // ==================== 2. Memory check ====================
    println "=== Memory Check ==="
    
    def memoryMXBean = ManagementFactory.getMemoryMXBean()
    def heapUsage = memoryMXBean.heapMemoryUsage
    
    def heapUsedMB = heapUsage.used / 1024 / 1024
    def heapMaxMB = heapUsage.max / 1024 / 1024
    // null means "no maximum reported"; a percentage would be meaningless.
    def heapPercent = heapUsage.max > 0 ? (heapUsage.used * 100 / heapUsage.max).round(2) : null
    
    if (heapPercent != null) {
        println "Heap Used: ${heapUsedMB} MB / ${heapMaxMB} MB (${heapPercent}%)"
    } else {
        println "Heap Used: ${heapUsedMB} MB (maximum heap size is undefined)"
    }
    
    def heapHigh = heapPercent != null && heapPercent > 85
    if (heapHigh) {
        println "WARNING: Heap usage is high!"
    }
    
    // GC statistics
    println ""
    println "GC Statistics:"
    ManagementFactory.garbageCollectorMXBeans.each { gc ->
        println "  ${gc.name}: Count=${gc.collectionCount}, Time=${gc.collectionTime}ms"
    }
    println ""
    
    // ==================== 3. Thread check ====================
    println "=== Thread Check ==="
    
    def threadMXBean = ManagementFactory.threadMXBean
    println "Total Threads: ${threadMXBean.threadCount}"
    println "Peak Threads: ${threadMXBean.peakThreadCount}"
    println "Daemon Threads: ${threadMXBean.daemonThreadCount}"
    
    // Deadlock detection
    def deadlockedThreads = threadMXBean.findDeadlockedThreads()
    def hasDeadlock = deadlockedThreads != null && deadlockedThreads.length > 0
    if (hasDeadlock) {
        println "CRITICAL: Deadlocked threads detected: ${deadlockedThreads.length}"
        deadlockedThreads.each { threadId ->
            def threadInfo = threadMXBean.getThreadInfo(threadId)
            println "  Thread ${threadId}: ${threadInfo?.threadName}"
            println "    State: ${threadInfo?.threadState}"
            println "    Lock: ${threadInfo?.lockName}"
        }
    }
    println ""
    
    // ==================== 4. Disk check ====================
    println "=== Disk Check ==="
    
    def jenkinsHome = jenkins.rootDir
    def totalSpace = jenkinsHome.totalSpace / 1024 / 1024 / 1024
    def freeSpace = jenkinsHome.freeSpace / 1024 / 1024 / 1024
    def usedSpace = totalSpace - freeSpace
    // File.getTotalSpace() returns 0 when the partition cannot be determined.
    def diskPercent = totalSpace > 0 ? (usedSpace * 100 / totalSpace).round(2) : 0
    
    println "Total: ${totalSpace.round(2)} GB"
    println "Used: ${usedSpace.round(2)} GB"
    println "Free: ${freeSpace.round(2)} GB"
    println "Usage: ${diskPercent}%"
    
    if (freeSpace < 5) {
        println "CRITICAL: Disk space is critically low!"
    } else if (freeSpace < 20) {
        println "WARNING: Disk space is running low!"
    }
    println ""
    
    // ==================== 5. Queue check ====================
    println "=== Queue Check ==="
    
    def queue = jenkins.queue
    def queueItems = queue.items
    
    println "Queue Size: ${queueItems.size()}"
    
    if (queueItems.size() > 20) {
        println "WARNING: Queue is backing up!"
        
        // Show the first ten waiting items with their waiting time.
        queueItems.take(10).each { item ->
            println "  - ${item.task.name} (waiting: ${(System.currentTimeMillis() - item.inQueueSince) / 1000}s)"
        }
    }
    println ""
    
    // ==================== 6. Agent check ====================
    println "=== Agent Check ==="
    
    def computers = jenkins.computers
    def onlineCount = 0
    def offlineCount = 0
    def offlineAgents = []
    
    computers.each { computer ->
        if (computer.offline) {
            offlineCount++
            offlineAgents << [
                name: computer.name,
                cause: computer.offlineCause?.toString() ?: "Unknown"
            ]
        } else {
            onlineCount++
        }
    }
    
    println "Online: ${onlineCount}, Offline: ${offlineCount}"
    
    if (offlineCount > 0) {
        println "Offline Agents:"
        offlineAgents.each { agent ->
            println "  - ${agent.name}: ${agent.cause}"
        }
    }
    println ""
    
    // ==================== 7. Plugin check ====================
    println "=== Plugin Check ==="
    
    def pluginManager = jenkins.pluginManager
    def plugins = pluginManager.plugins
    
    // "Failed" here means "not active".
    // NOTE(review): this presumably also covers deliberately disabled plugins.
    def failedPlugins = plugins.findAll { !it.active }
    def outdatedPlugins = plugins.findAll { it.hasUpdate() }
    
    println "Total Plugins: ${plugins.size()}"
    println "Failed Plugins: ${failedPlugins.size()}"
    println "Outdated Plugins: ${outdatedPlugins.size()}"
    
    if (failedPlugins.size() > 0) {
        println "Failed Plugins:"
        failedPlugins.each { plugin ->
            println "  - ${plugin.shortName}: ${plugin.version}"
        }
    }
    println ""
    
    // ==================== 8. Job check ====================
    println "=== Job Check ==="
    
    // getAllItems descends into folders; jenkins.items only lists the top level.
    def jobs = jenkins.getAllItems(Job.class)
    def buildingJobs = jobs.findAll { it.lastBuild?.building }
    def failedJobs = jobs.findAll { it.lastBuild?.result == Result.FAILURE }
    
    println "Total Jobs: ${jobs.size()}"
    println "Currently Building: ${buildingJobs.size()}"
    println "Last Build Failed: ${failedJobs.size()}"
    println ""
    
    // ==================== 9. Recommendations ====================
    println "=== Recommendations ==="
    
    def recommendations = []
    
    if (heapHigh) {
        recommendations << "Increase heap memory or investigate memory leaks"
    }
    
    if (hasDeadlock) {
        recommendations << "Investigate and resolve thread deadlocks"
    }
    
    if (freeSpace < 20) {
        recommendations << "Clean up old builds and workspaces to free disk space"
    }
    
    if (queueItems.size() > 20) {
        recommendations << "Add more agents or optimize build duration"
    }
    
    if (offlineCount > onlineCount) {
        recommendations << "Check agent connectivity and resolve offline issues"
    }
    
    if (failedPlugins.size() > 0) {
        recommendations << "Fix or remove failed plugins"
    }
    
    if (recommendations.isEmpty()) {
        println "No critical issues detected."
    } else {
        recommendations.each { println "  - ${it}" }
    }
    
    println ""
    println "=" * 60
    println "Diagnosis Complete"
    println "=" * 60
}

diagnose()

6. 故障排查工具

6.1 日志分析工具

#!/bin/bash
# Jenkins日志分析脚本

# ==================== 分析Jenkins日志 ====================
analyze_jenkins_log() {
    local log_file="${1:-/var/log/jenkins/jenkins.log}"
    
    echo "=== Jenkins Log Analysis ==="
    echo ""
    
    # 统计错误
    echo "Error Summary:"
    grep -c "ERROR" "${log_file}" 2>/dev/null || echo "0"
    
    # 统计警告
    echo ""
    echo "Warning Summary:"
    grep -c "WARNING" "${log_file}" 2>/dev/null || echo "0"
    
    # 最近错误
    echo ""
    echo "Recent Errors (last 10):"
    grep "ERROR" "${log_file}" | tail -10
    
    # OutOfMemoryError检查
    echo ""
    echo "OutOfMemoryError Check:"
    grep -c "OutOfMemoryError" "${log_file}" 2>/dev/null || echo "0 occurrences"
    
    # 线程问题检查
    echo ""
    echo "Thread Issues:"
    grep -E "(deadlock|blocked|waiting)" "${log_file}" | tail -5
}

# ==================== 分析构建日志 ====================
# Report the result of the five most recent builds of a job, based on the
# "Finished: ..." marker in each build log.
#
# Builds are selected by build number (numerically named directories under
# builds/, highest first) instead of parsing `ls -t`. Entries such as
# lastSuccessfulBuild or permalinks are therefore no longer reported as
# builds, and odd file names cannot break the loop.
# Arguments: $1 - job name
# Globals:   JENKINS_HOME (read, default /var/lib/jenkins)
analyze_build_logs() {
    local job_name=$1
    local jenkins_home="${JENKINS_HOME:-/var/lib/jenkins}"
    local job_dir build build_path log_file duration

    echo "=== Build Log Analysis for ${job_name} ==="
    echo ""

    job_dir="${jenkins_home}/jobs/${job_name}"

    if [ ! -d "${job_dir}" ]; then
        echo "Job not found: ${job_name}"
        return
    fi

    # Walk the most recent builds.
    while IFS= read -r build; do
        log_file="${job_dir}/builds/${build}/log"

        if [ -f "${log_file}" ]; then
            echo "Build #${build}:"

            # Build result
            if grep -q "Finished: SUCCESS" "${log_file}"; then
                echo "  Result: SUCCESS"
            elif grep -q "Finished: FAILURE" "${log_file}"; then
                echo "  Result: FAILURE"
                echo "  Error lines:"
                grep -E "(ERROR|FAILED|Exception)" "${log_file}" | tail -3
            elif grep -q "Finished: ABORTED" "${log_file}"; then
                echo "  Result: ABORTED"
            else
                echo "  Result: RUNNING or UNKNOWN"
            fi

            # Build duration, when the log has a "Total time:" line.
            duration=$(grep "Total time:" "${log_file}" | tail -1)
            if [ -n "${duration}" ]; then
                echo "  ${duration}"
            fi

            echo ""
        fi
    done < <(
        for build_path in "${job_dir}/builds"/*/; do
            build=${build_path%/}
            build=${build##*/}
            [[ "${build}" =~ ^[0-9]+$ ]] && printf '%s\n' "${build}"
        done | sort -rn | head -n 5
    )
}

# ==================== 分析GC日志 ====================
# Summarise a GC log: number of "GC pause" and "Full GC" lines plus the
# average and maximum pause time (values followed by " ms").
#
# Fixes: the counters no longer become "0<newline>0" when nothing matches
# (grep -c already prints 0), and the statistics are computed in one awk pass
# that reports the absence of samples instead of dividing by zero.
# Arguments: $1 - GC log file (default /var/log/jenkins/gc.log)
analyze_gc_log() {
    local gc_log="${1:-/var/log/jenkins/gc.log}"
    local gc_count full_gc_count

    if [ ! -f "${gc_log}" ]; then
        echo "GC log not found: ${gc_log}"
        return
    fi

    echo "=== GC Log Analysis ==="
    echo ""

    # Number of GC pauses
    gc_count=$(grep -c "GC pause" "${gc_log}" 2>/dev/null) || true
    echo "Total GC Count: ${gc_count:-0}"

    # Number of full GCs
    full_gc_count=$(grep -c "Full GC" "${gc_log}" 2>/dev/null) || true
    echo "Full GC Count: ${full_gc_count:-0}"

    # Average and maximum pause time
    echo ""
    echo "Pause Time Analysis:"
    grep "GC pause" "${gc_log}" |
        grep -oE '[0-9]+\.[0-9]+ ms' |
        awk '
            {
                sum += $1
                # Keep the original text of the largest value for printing.
                if (count == 0 || $1 + 0 > max) { max = $1 + 0; max_text = $1 }
                count++
            }
            END {
                if (count == 0) {
                    print "No GC pause times found"
                } else {
                    print "Average: " (sum / count) " ms"
                    print "Max: " max_text
                }
            }'
}

6.2 线程Dump分析

#!/bin/bash
# 获取Jenkins线程Dump

# ==================== 获取线程Dump ====================
# Capture a thread dump of the Jenkins JVM with jstack.
#
# Only the first PID returned by pgrep is used (several matches would
# otherwise be passed to jstack as a single argument), the output file name is
# computed once so the message names the real file, and a failed jstack run is
# reported instead of leaving an empty dump behind.
# Globals: THREAD_DUMP_DIR (read, optional; output directory, default /tmp)
# Returns: 0 on success, 1 when Jenkins is not running or jstack fails
get_thread_dump() {
    local pid dump_file

    pid=$(pgrep -f "jenkins.war" | head -n 1)

    if [ -z "${pid}" ]; then
        echo "Jenkins process not found"
        return 1
    fi

    echo "Jenkins PID: ${pid}"
    echo "Getting thread dump..."

    dump_file="${THREAD_DUMP_DIR:-/tmp}/jenkins_thread_dump_$(date +%Y%m%d_%H%M%S).txt"

    # Use jstack to take the dump.
    if ! jstack "${pid}" > "${dump_file}"; then
        echo "Failed to get thread dump for PID ${pid}" >&2
        rm -f -- "${dump_file}"
        return 1
    fi

    echo "Thread dump saved to ${dump_file}"
}

# ==================== 分析线程Dump ====================
# Summarise a jstack thread dump: count of threads per state, then for the
# patterns BLOCKED, WAITING and RUNNABLE the first ten thread header lines
# (lines starting with a double quote) found within five lines above a match.
# Arguments: $1 - thread dump file
analyze_thread_dump() {
    local dump_file=$1
    local section

    echo "=== Thread Dump Analysis ==="
    echo ""

    # Thread state statistics
    echo "Thread State Distribution:"
    grep "java.lang.Thread.State" "${dump_file}" | \
        sort | uniq -c | sort -rn

    # Each entry is "<heading>:<state pattern>".
    for section in \
        "Blocked Threads:BLOCKED" \
        "Waiting Threads:WAITING" \
        "Runnable Threads:RUNNABLE"; do
        echo ""
        echo "${section%%:*}:"
        grep -B 5 "${section##*:}" "${dump_file}" | grep "^\"" | head -10
    done
}

# ==================== 实时监控 ====================
# Refresh thread count, CPU and memory usage of the Jenkins process every
# 5 seconds until the process exits (or the user interrupts with Ctrl-C).
#
# Fixes: when no Jenkins process is found the function reports it and returns
# instead of looping forever over `ps` errors; only the first matching PID is
# used; the loop ends once the monitored process is gone.
# Returns: 1 when Jenkins is not running, 0 after the process has exited
monitor_threads() {
    local pid

    pid=$(pgrep -f "jenkins.war" | head -n 1)

    if [ -z "${pid}" ]; then
        echo "Jenkins process not found"
        return 1
    fi

    while ps -p "${pid}" > /dev/null 2>&1; do
        clear
        echo "=== Jenkins Thread Monitor ==="
        echo "Time: $(date)"
        echo ""

        # Thread count
        echo "Thread Count: $(ps -o nlwp -p "${pid}" | tail -1)"

        # CPU usage
        echo "CPU Usage: $(ps -p "${pid}" -o %cpu | tail -1)%"

        # Memory usage
        echo "Memory Usage: $(ps -p "${pid}" -o %mem | tail -1)%"

        sleep 5
    done

    echo "Jenkins process ${pid} is no longer running"
}

7. 灾难恢复计划

7.1 灾难恢复流程

灾难恢复流程

部分故障

完全故障

数据中心故障

灾难发生

评估影响

局部恢复

完整恢复

异地恢复

恢复服务

恢复备份

切换到灾备站点

验证功能

通知用户

恢复完成

7.2 灾难恢复计划文档

# 灾难恢复计划
# 文件: disaster_recovery_plan.yaml

# Disaster recovery plan: who responds, the recovery targets, the procedure
# per failure class, how to communicate, and how to verify and review.
disaster_recovery:
  
  # 1. Response team
  response_team:
    primary:
      - role: "Incident Commander"
        name: "Team Lead"
        contact: "+1-xxx-xxx-xxxx"
      - role: "Technical Lead"
        name: "DevOps Engineer"
        contact: "+1-xxx-xxx-xxxx"
      - role: "Communication Lead"
        name: "Project Manager"
        contact: "+1-xxx-xxx-xxxx"
    
    escalation:
      - level: 1
        response_time: "15 minutes"
        contacts: ["Primary Team"]
      - level: 2
        response_time: "30 minutes"
        contacts: ["Management", "External Support"]
  
  # 2. Recovery objectives
  recovery_objectives:
    RTO: "4 hours"  # recovery time objective
    RPO: "1 hour"   # recovery point objective
    
    priority_order:
      - "Critical: Production deployment jobs"
      - "High: CI/CD pipelines"
      - "Medium: Development jobs"
      - "Low: Historical build data"
  
  # 3. Recovery procedures
  # NOTE(review): data_center_failure estimates 8 hours while the RTO above is
  # 4 hours — confirm whether the RTO is meant to cover that scenario.
  recovery_procedures:
    
    partial_failure:
      description: "Single component failure"
      steps:
        - "Identify failed component"
        - "Check monitoring alerts"
        - "Review recent changes"
        - "Apply fix or rollback"
        - "Verify functionality"
      estimated_time: "30 minutes"
    
    complete_failure:
      description: "Jenkins master failure"
      steps:
        - "Notify response team"
        - "Provision new server"
        - "Restore from latest backup"
        - "Verify configuration"
        - "Start Jenkins service"
        - "Verify all jobs"
        - "Notify users"
      estimated_time: "4 hours"
    
    data_center_failure:
      description: "Primary data center unavailable"
      steps:
        - "Activate disaster recovery site"
        - "Update DNS to DR site"
        - "Restore from offsite backup"
        - "Verify all integrations"
        - "Notify all stakeholders"
      estimated_time: "8 hours"
  
  # 4. Communication plan
  communication:
    internal:
      - channel: "Slack #jenkins-incidents"
        template: "incident_notification.md"
      - channel: "Email"
        recipients: ["dev-team@example.com"]
    
    external:
      - channel: "Status Page"
        url: "https://status.example.com"
      - channel: "Customer Email"
        template: "customer_notification.md"
  
  # 5. Verification checklist
  verification_checklist:
    - "Jenkins web UI accessible"
    - "All agents connected"
    - "Jobs can be triggered"
    - "SCM integration working"
    - "Notifications working"
    - "Plugins loaded correctly"
    - "Credentials accessible"
    - "Recent builds visible"
  
  # 6. Post-incident review
  post_incident:
    timeline: "Within 24 hours"
    participants: ["Response Team", "Management"]
    deliverables:
      - "Incident timeline"
      - "Root cause analysis"
      - "Action items"
      - "Updated runbooks"

7.3 恢复验证脚本

/**
 * Jenkins recovery verification script.
 *
 * Runs a series of health checks against the current Jenkins instance and
 * prints a report. Intended for the Jenkins script console; the explicit
 * imports below keep it usable where the console's automatic imports are
 * not applied.
 */

import hudson.model.Job
import jenkins.model.Jenkins

/**
 * Checks the restored Jenkins instance and prints a PASS/WARN/FAIL line per
 * check followed by a summary of every issue found.
 *
 * Checks performed: HTTP API reachability, jobs, agents, credentials,
 * plugins, Git plugin availability, build queue length and free disk space.
 * A failing check is recorded and reported; it never aborts the remaining
 * checks.
 */
def verifyRecovery() {
    def jenkins = Jenkins.getInstance()
    def issues = []

    println "=" * 60
    println "Jenkins Recovery Verification"
    println "=" * 60
    println ""

    // ==================== 1. Basic service check ====================
    println "1. Basic Service Check..."

    def rootUrl = jenkins.rootUrl
    if (rootUrl == null) {
        // Without this guard the URL would be built from the text "null" and
        // fail with an unhelpful "no protocol" message.
        issues << "API check failed: Jenkins root URL is not configured"
        println "   [FAIL] API check failed: Jenkins root URL is not configured"
    } else {
        def connection = null
        try {
            connection = new URL("${rootUrl}api/json").openConnection()
            connection.requestMethod = "GET"
            connection.connectTimeout = 5000
            // Bound the wait for a response as well, so a hung server cannot
            // block the whole verification run.
            connection.readTimeout = 5000

            def status = connection.responseCode
            if (status == 200) {
                println "   [PASS] API is responding"
            } else if (status in [401, 403]) {
                // This request is unauthenticated; a secured instance answers
                // 401/403, which still shows the web tier is serving requests.
                println "   [PASS] API is responding (HTTP ${status}: authentication required)"
            } else {
                issues << "API returned ${status}"
                println "   [FAIL] API returned ${status}"
            }
        } catch (Exception e) {
            issues << "API check failed: ${e.message}"
            println "   [FAIL] API check failed: ${e.message}"
        } finally {
            if (connection instanceof HttpURLConnection) {
                connection.disconnect()
            }
        }
    }

    // ==================== 2. Job check ====================
    println "2. Job Check..."

    // getAllItems(Job) walks into folders and returns only jobs, so folders
    // are not counted as jobs without builds and nested jobs are included.
    def jobs = jenkins.getAllItems(Job.class)
    def totalJobs = jobs.size()
    def validJobs = jobs.count { it.lastBuild != null }

    println "   Total Jobs: ${totalJobs}"
    println "   Jobs with builds: ${validJobs}"

    // Heuristic: jobs that were never built also count as lacking builds.
    if (validJobs < totalJobs * 0.9) {
        issues << "Some jobs may be corrupted"
        println "   [WARN] More than 10% of jobs have no builds"
    }

    // ==================== 3. Agent check ====================
    println "3. Agent Check..."

    def computers = jenkins.computers
    def onlineAgents = computers.count { !it.offline }
    def totalAgents = computers.size()

    println "   Total Agents: ${totalAgents}"
    println "   Online Agents: ${onlineAgents}"

    if (onlineAgents < totalAgents * 0.8) {
        issues << "Many agents are offline"
        println "   [WARN] More than 20% of agents are offline"
    }

    // ==================== 4. Credential check ====================
    println "4. Credential Check..."

    try {
        // Looked up by class name so the script still parses when the
        // credentials plugin is not installed.
        def providers = jenkins.getExtensionList(
            'com.cloudbees.plugins.credentials.SystemCredentialsProvider'
        )
        if (providers.isEmpty()) {
            throw new IllegalStateException("SystemCredentialsProvider extension not found")
        }

        def credentials = providers[0].credentials
        println "   Credentials count: ${credentials.size()}"
        println "   [PASS] Credentials accessible"
    } catch (Exception e) {
        issues << "Credentials check failed: ${e.message}"
        println "   [FAIL] Credentials check failed"
    }

    // ==================== 5. Plugin check ====================
    println "5. Plugin Check..."

    def pluginManager = jenkins.pluginManager
    // Plugins that failed to load are tracked in failedPlugins and do not
    // appear in plugins, so both collections are inspected.
    def failedToLoad = pluginManager.failedPlugins.collect { it.name }
    def inactive = pluginManager.plugins.findAll { !it.active }.collect { it.shortName }
    def failedPlugins = (failedToLoad + inactive).unique()

    if (failedPlugins.isEmpty()) {
        println "   [PASS] All plugins active"
    } else {
        issues << "Failed plugins: ${failedPlugins}"
        println "   [FAIL] ${failedPlugins.size()} plugins failed"
    }

    // ==================== 6. SCM check ====================
    println "6. SCM Integration Check..."

    // Check the Git plugin. The class is resolved through the plugin
    // manager's class loader, which can see plugin classes.
    try {
        pluginManager.uberClassLoader.loadClass('hudson.plugins.git.GitSCM')
        println "   [PASS] Git plugin loaded"
    } catch (ClassNotFoundException | LinkageError e) {
        issues << "Git plugin not loaded"
        println "   [FAIL] Git plugin not loaded"
    }

    // ==================== 7. Queue check ====================
    println "7. Queue Check..."

    // Read the queue length once so the test and the report agree.
    def queueSize = jenkins.queue.items.size()
    if (queueSize < 50) {
        println "   [PASS] Queue size normal: ${queueSize}"
    } else {
        issues << "Queue is backing up: ${queueSize} items"
        println "   [WARN] Queue size: ${queueSize}"
    }

    // ==================== 8. Disk check ====================
    println "8. Disk Check..."

    def freeSpace = jenkins.rootDir.freeSpace / (1024 * 1024 * 1024)
    // Formatted with String.format rather than BigDecimal.round(int), which
    // is only available from Groovy 2.5 onwards.
    def freeSpaceText = String.format(Locale.ROOT, '%.2f', freeSpace)

    if (freeSpace > 10) {
        println "   [PASS] Free space: ${freeSpaceText} GB"
    } else {
        issues << "Low disk space: ${freeSpaceText} GB"
        println "   [WARN] Free space: ${freeSpaceText} GB"
    }

    // ==================== Summary ====================
    println ""
    println "=" * 60
    println "Verification Summary"
    println "=" * 60

    if (issues.isEmpty()) {
        println "All checks passed. Recovery successful!"
    } else {
        println "Issues found:"
        issues.each { println "  - ${it}" }
        println ""
        println "Action required: Review and resolve the above issues"
    }
}

verifyRecovery()

总结

本文深入讲解了Jenkins备份恢复与故障排查,涵盖以下关键知识点:

主题 核心要点
备份策略 数据分类、备份类型、保留策略
备份实施 ThinBackup、自定义脚本、定时配置
恢复流程 完整恢复、部分恢复、验证流程
高可用 主备架构、Kubernetes部署、自动切换
故障诊断 故障分类、诊断脚本、分析流程
排查工具 日志分析、线程Dump、监控工具
灾难恢复 恢复计划、通信流程、验证清单
Logo

AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。

更多推荐