智能体核心技术的七大模块:(七)安全与对齐
·
智能体核心技术七大模块之(七)
安全与对齐——构建可信、可控的AI智能体
本文是七大模块的第七部分,也是收尾之作,深入探讨智能体的安全与对齐模块。随着智能体能力的增强,确保其行为符合人类价值观、遵守道德规范、不产生有害输出变得至关重要。安全与对齐模块如同智能体的“道德护栏”,在输入、输出、工具调用等各个环节进行审查与干预,防止恶意利用和意外后果。
1. 核心概念与设计目标
1.1 什么是安全与对齐
- 安全:防止智能体产生有害、违法、歧视性、隐私泄露等不良输出,抵御对抗攻击(如越狱提示、提示注入)。
- 对齐:确保智能体的目标和行为与人类的意图和价值观一致,即使在没有明确指令的情况下也能做出符合伦理的决策。
1.2 设计目标
- 多层次防御:在输入、推理、输出、工具调用等多个阶段实施检查。
- 可扩展性:支持多种安全策略,易于添加新的检查器(如基于规则、基于模型、基于外部API)。
- 可配置性:允许开发者根据应用场景调整严格程度。
- 透明性:记录安全事件,便于审计和调试。
- 最小侵入性:对原有智能体架构改动小,易于集成。
2. 系统架构与UML建模
2.1 核心类设计
2.2 安全检查流程时序图
3. 详细设计
3.1 核心数据结构
SafetyContext:包含安全检查所需的所有上下文信息,由Agent在调用时填充。
@dataclass
class SafetyContext:
input_text: str = ""
output_text: str = ""
tool_name: str = ""
tool_params: dict = field(default_factory=dict)
user_id: str = ""
session_id: str = ""
metadata: dict = field(default_factory=dict)
SafetyVerdict:检查结果。
@dataclass
class SafetyVerdict:
approved: bool = True
reason: str = ""
suggested_action: str = "allow" # "allow", "block", "replace", "log"
replacement: str = ""
3.2 安全检查器基类
from abc import ABC, abstractmethod
class SafetyChecker(ABC):
@property
@abstractmethod
def name(self) -> str:
pass
@abstractmethod
def check(self, context: SafetyContext) -> SafetyVerdict:
pass
3.3 输入安全检查器示例(基于规则)
import re
from typing import List
class RuleBasedInputChecker(SafetyChecker):
"""基于规则的黑名单检查,检测恶意输入"""
def __init__(self, forbidden_patterns: List[str] = None):
self.forbidden_patterns = forbidden_patterns or [
r"ignore previous instructions",
r"bypass.*safety",
r"hack",
r"暗网",
# 更多模式...
]
self.compiled = [re.compile(p, re.IGNORECASE) for p in self.forbidden_patterns]
@property
def name(self) -> str:
return "RuleBasedInputChecker"
def check(self, context: SafetyContext) -> SafetyVerdict:
text = context.input_text
for pattern in self.compiled:
if pattern.search(text):
return SafetyVerdict(
approved=False,
reason=f"Detected forbidden pattern: {pattern.pattern}",
suggested_action="block"
)
return SafetyVerdict(approved=True)
3.4 输出安全检查器示例(基于LLM)
class LLMOutputChecker(SafetyChecker):
"""使用LLM评估输出是否安全"""
def __init__(self, llm: BaseLLM, safety_prompt: str = None):
self.llm = llm
self.safety_prompt = safety_prompt or """
你是一个内容安全审核员。请判断以下文本是否包含任何有害、违法、歧视、暴力或色情内容。
只输出 "safe" 或 "unsafe"。
文本:{text}
"""
@property
def name(self) -> str:
return "LLMOutputChecker"
def check(self, context: SafetyContext) -> SafetyVerdict:
prompt = self.safety_prompt.format(text=context.output_text)
response = self.llm.generate(prompt).strip().lower()
if "unsafe" in response:
return SafetyVerdict(
approved=False,
reason="LLM flagged as unsafe",
suggested_action="block"
)
return SafetyVerdict(approved=True)
3.5 工具参数安全检查器
class ToolParameterChecker(SafetyChecker):
"""检查工具调用参数是否合法(如防止SQL注入、路径遍历等)"""
def __init__(self):
self.sensitive_patterns = {
"database": [r"DROP\s+TABLE", r"DELETE\s+FROM"],
"file": [r"\.\./", r"/etc/passwd"],
}
@property
def name(self) -> str:
return "ToolParameterChecker"
def check(self, context: SafetyContext) -> SafetyVerdict:
tool = context.tool_name
params = context.tool_params
# 根据工具类型检查参数
if tool == "database":
query = params.get("query", "")
for pattern in self.sensitive_patterns["database"]:
if re.search(pattern, query, re.IGNORECASE):
return SafetyVerdict(
approved=False,
reason=f"Database query contains forbidden pattern: {pattern}",
suggested_action="block"
)
elif tool == "file":
path = params.get("path", "")
for pattern in self.sensitive_patterns["file"]:
if re.search(pattern, path):
return SafetyVerdict(
approved=False,
reason=f"File path contains traversal pattern",
suggested_action="block"
)
# 更多工具检查...
return SafetyVerdict(approved=True)
3.6 安全管理器
from typing import List, Dict
class SafetyManager:
def __init__(self):
self.input_checkers: List[SafetyChecker] = []
self.output_checkers: List[SafetyChecker] = []
self.tool_checkers: List[SafetyChecker] = []
def register_checker(self, checker: SafetyChecker, stage: str):
"""stage: 'input', 'output', 'tool'"""
if stage == 'input':
self.input_checkers.append(checker)
elif stage == 'output':
self.output_checkers.append(checker)
elif stage == 'tool':
self.tool_checkers.append(checker)
else:
raise ValueError(f"Unknown stage: {stage}")
def check_input(self, context: SafetyContext) -> SafetyVerdict:
for checker in self.input_checkers:
verdict = checker.check(context)
if not verdict.approved:
return verdict
return SafetyVerdict(approved=True)
def check_output(self, context: SafetyContext) -> SafetyVerdict:
for checker in self.output_checkers:
verdict = checker.check(context)
if not verdict.approved:
return verdict
return SafetyVerdict(approved=True)
def check_tool(self, context: SafetyContext) -> SafetyVerdict:
for checker in self.tool_checkers:
verdict = checker.check(context)
if not verdict.approved:
return verdict
return SafetyVerdict(approved=True)
3.7 修改Agent类以集成安全模块
在Agent类的run方法中,在关键点调用安全管理器。
class Agent:
def __init__(self, nlu_engine, planner, llm, reasoning_strategy=None):
# ... 原有初始化
self.safety_manager = SafetyManager()
# 可以默认注册一些检查器
self._register_default_safety_checkers()
def _register_default_safety_checkers(self):
# 注册默认检查器
self.safety_manager.register_checker(RuleBasedInputChecker(), 'input')
self.safety_manager.register_checker(ToolParameterChecker(), 'tool')
# 输出检查器需要LLM,可以稍后添加
def register_safety_checker(self, checker: SafetyChecker, stage: str):
self.safety_manager.register_checker(checker, stage)
def run(self, user_input: str) -> str:
# 1. 输入安全检查
input_context = SafetyContext(input_text=user_input, user_id=self.user_id, session_id=self.session_id)
input_verdict = self.safety_manager.check_input(input_context)
if not input_verdict.approved:
self.state.add_system_message(f"输入被安全模块拒绝:{input_verdict.reason}")
return "抱歉,您的输入包含不安全内容,已被拦截。"
# 2. 感知
parsed = self.perception.parse(user_input)
self.state.add_user_message(user_input)
# 3. 规划
plan = self.planner.create_plan(parsed, self.tool_registry.list_tools(), self.state)
self.state.set_plan(plan)
# 4. ReAct循环
iteration = 0
final_answer = None
while iteration < self.max_iterations:
# ... 获取上下文、推理得到action
if action.type == 'final':
final_answer = action.content
# 输出安全检查
output_context = SafetyContext(output_text=final_answer, user_id=self.user_id, session_id=self.session_id)
output_verdict = self.safety_manager.check_output(output_context)
if not output_verdict.approved:
if output_verdict.suggested_action == 'replace' and output_verdict.replacement:
final_answer = output_verdict.replacement
else:
final_answer = "抱歉,我无法回答这个问题。"
self.state.add_system_message(f"输出被安全模块拒绝:{output_verdict.reason}")
self.state.add_assistant_message(final_answer)
break
elif action.type == 'tool':
# 工具调用前安全检查
tool_context = SafetyContext(tool_name=action.tool, tool_params=action.tool_params,
user_id=self.user_id, session_id=self.session_id)
tool_verdict = self.safety_manager.check_tool(tool_context)
if not tool_verdict.approved:
# 记录错误,重新规划或返回错误
self.state.add_system_message(f"工具调用被安全模块拒绝:{tool_verdict.reason}")
# 可以选择让推理重新思考
continue
# 执行工具
obs = self.execution_engine.execute(action, self.state)
# ... 处理观察
# ... 其余代码
4. 项目文件结构
在原有项目基础上,新增安全模块:
agent_core/
├── agent/
│ ├── core/
│ │ ├── __init__.py
│ │ ├── safety/
│ │ │ ├── __init__.py
│ │ │ ├── base.py # SafetyChecker, SafetyContext, SafetyVerdict
│ │ │ ├── input_checkers.py # RuleBasedInputChecker 等
│ │ │ ├── output_checkers.py # LLMOutputChecker 等
│ │ │ ├── tool_checkers.py # ToolParameterChecker 等
│ │ │ └── manager.py # SafetyManager
│ │ ├── agent.py # 修改,集成SafetyManager
│ │ └── ... (其他不变)
│ ├── tools/... (不变)
│ ├── nlu/... (不变)
│ ├── planners/... (不变)
│ ├── llm/... (不变)
│ └── utils/... (不变)
├── examples/
│ └── safety_demo.py # 新示例
└── ...
5. 源代码完整实现
5.1 安全基类 (agent/core/safety/base.py)
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
from typing import Optional
@dataclass
class SafetyContext:
input_text: str = ""
output_text: str = ""
tool_name: str = ""
tool_params: dict = field(default_factory=dict)
user_id: str = ""
session_id: str = ""
metadata: dict = field(default_factory=dict)
@dataclass
class SafetyVerdict:
approved: bool = True
reason: str = ""
suggested_action: str = "allow" # allow, block, replace, log
replacement: str = ""
class SafetyChecker(ABC):
@property
@abstractmethod
def name(self) -> str:
pass
@abstractmethod
def check(self, context: SafetyContext) -> SafetyVerdict:
pass
5.2 输入检查器 (agent/core/safety/input_checkers.py)
import re
from typing import List
from .base import SafetyChecker, SafetyContext, SafetyVerdict
class RuleBasedInputChecker(SafetyChecker):
def __init__(self, forbidden_patterns: List[str] = None):
self.forbidden_patterns = forbidden_patterns or [
r"ignore previous instructions",
r"bypass.*safety",
r"hack",
r"暗网",
r"暴力",
r"色情",
]
self.compiled = [re.compile(p, re.IGNORECASE) for p in self.forbidden_patterns]
@property
def name(self) -> str:
return "RuleBasedInputChecker"
def check(self, context: SafetyContext) -> SafetyVerdict:
text = context.input_text
for pattern in self.compiled:
if pattern.search(text):
return SafetyVerdict(
approved=False,
reason=f"Detected forbidden pattern: {pattern.pattern}",
suggested_action="block"
)
return SafetyVerdict(approved=True)
5.3 输出检查器 (agent/core/safety/output_checkers.py)
from .base import SafetyChecker, SafetyContext, SafetyVerdict
from agent.llm.base import BaseLLM
class LLMOutputChecker(SafetyChecker):
def __init__(self, llm: BaseLLM, safety_prompt: str = None):
self.llm = llm
self.safety_prompt = safety_prompt or """
你是一个内容安全审核员。请判断以下文本是否包含任何有害、违法、歧视、暴力或色情内容。
只输出 "safe" 或 "unsafe"。
文本:{text}
"""
@property
def name(self) -> str:
return "LLMOutputChecker"
def check(self, context: SafetyContext) -> SafetyVerdict:
prompt = self.safety_prompt.format(text=context.output_text)
response = self.llm.generate(prompt).strip().lower()
if "unsafe" in response:
return SafetyVerdict(
approved=False,
reason="LLM flagged as unsafe",
suggested_action="block"
)
return SafetyVerdict(approved=True)
5.4 工具检查器 (agent/core/safety/tool_checkers.py)
import re
from .base import SafetyChecker, SafetyContext, SafetyVerdict
class ToolParameterChecker(SafetyChecker):
def __init__(self):
self.sensitive_patterns = {
"database": [r"DROP\s+TABLE", r"DELETE\s+FROM", r"TRUNCATE"],
"file": [r"\.\./", r"/etc/passwd", r"/root/"],
"shell": [r";", r"&&", r"\|\|", r"`"],
}
@property
def name(self) -> str:
return "ToolParameterChecker"
def check(self, context: SafetyContext) -> SafetyVerdict:
tool = context.tool_name
params = context.tool_params
params_str = str(params) # 简单转为字符串检查
# 通用检查:所有工具参数中禁止包含危险shell字符
for pattern in self.sensitive_patterns["shell"]:
if re.search(pattern, params_str):
return SafetyVerdict(
approved=False,
reason=f"Parameter contains shell injection risk: {pattern}",
suggested_action="block"
)
# 特定工具检查
if tool == "database":
query = params.get("query", "")
for pattern in self.sensitive_patterns["database"]:
if re.search(pattern, query, re.IGNORECASE):
return SafetyVerdict(
approved=False,
reason=f"Database query contains dangerous pattern: {pattern}",
suggested_action="block"
)
elif tool == "file":
path = params.get("path", "")
for pattern in self.sensitive_patterns["file"]:
if re.search(pattern, path):
return SafetyVerdict(
approved=False,
reason=f"File path contains traversal pattern",
suggested_action="block"
)
# 可扩展更多工具
return SafetyVerdict(approved=True)
5.5 安全管理器 (agent/core/safety/manager.py)
from typing import List
from .base import SafetyChecker, SafetyContext, SafetyVerdict
class SafetyManager:
def __init__(self):
self.input_checkers: List[SafetyChecker] = []
self.output_checkers: List[SafetyChecker] = []
self.tool_checkers: List[SafetyChecker] = []
def register_checker(self, checker: SafetyChecker, stage: str):
if stage == 'input':
self.input_checkers.append(checker)
elif stage == 'output':
self.output_checkers.append(checker)
elif stage == 'tool':
self.tool_checkers.append(checker)
else:
raise ValueError(f"Unknown stage: {stage}")
def check_input(self, context: SafetyContext) -> SafetyVerdict:
for checker in self.input_checkers:
verdict = checker.check(context)
if not verdict.approved:
return verdict
return SafetyVerdict(approved=True)
def check_output(self, context: SafetyContext) -> SafetyVerdict:
for checker in self.output_checkers:
verdict = checker.check(context)
if not verdict.approved:
return verdict
return SafetyVerdict(approved=True)
def check_tool(self, context: SafetyContext) -> SafetyVerdict:
for checker in self.tool_checkers:
verdict = checker.check(context)
if not verdict.approved:
return verdict
return SafetyVerdict(approved=True)
5.6 修改Agent类(片段)
在agent/core/agent.py中添加导入和初始化,并修改run方法。由于篇幅,仅展示修改部分,完整Agent类参考之前。
from agent.core.safety.manager import SafetyManager
from agent.core.safety.base import SafetyContext
# 可默认注册一些检查器
from agent.core.safety.input_checkers import RuleBasedInputChecker
from agent.core.safety.tool_checkers import ToolParameterChecker
class Agent:
def __init__(self, nlu_engine, planner, llm, reasoning_strategy=None):
# ... 原有初始化
self.safety_manager = SafetyManager()
self._register_default_safety_checkers()
self.user_id = "anonymous" # 可配置
self.session_id = str(uuid.uuid4()) # 生成唯一会话ID
def _register_default_safety_checkers(self):
self.safety_manager.register_checker(RuleBasedInputChecker(), 'input')
self.safety_manager.register_checker(ToolParameterChecker(), 'tool')
# 输出检查器可选,可稍后注册
def register_safety_checker(self, checker: SafetyChecker, stage: str):
self.safety_manager.register_checker(checker, stage)
def run(self, user_input: str) -> str:
# 1. 输入安全检查
input_ctx = SafetyContext(input_text=user_input, user_id=self.user_id, session_id=self.session_id)
input_verdict = self.safety_manager.check_input(input_ctx)
if not input_verdict.approved:
self.state.add_system_message(f"输入被安全模块拒绝:{input_verdict.reason}")
return "抱歉,您的输入包含不安全内容,已被拦截。"
# 原有感知、规划...
# 在ReAct循环中,工具调用前:
if action.type == 'tool':
tool_ctx = SafetyContext(tool_name=action.tool, tool_params=action.tool_params,
user_id=self.user_id, session_id=self.session_id)
tool_verdict = self.safety_manager.check_tool(tool_ctx)
if not tool_verdict.approved:
self.state.add_system_message(f"工具调用被安全模块拒绝:{tool_verdict.reason}")
# 可尝试重新规划或直接跳过
continue
# 执行工具...
# 在最终输出前:
if action.type == 'final':
output_ctx = SafetyContext(output_text=action.content, user_id=self.user_id, session_id=self.session_id)
output_verdict = self.safety_manager.check_output(output_ctx)
if not output_verdict.approved:
if output_verdict.suggested_action == 'replace' and output_verdict.replacement:
final_answer = output_verdict.replacement
else:
final_answer = "抱歉,我无法回答这个问题。"
self.state.add_system_message(f"输出被安全模块拒绝:{output_verdict.reason}")
else:
final_answer = action.content
# 返回
5.7 运行示例 (examples/safety_demo.py)
import sys
sys.path.append("..")
from agent.core.agent import Agent
from agent.nlu.rule_based import RuleBasedNLU
from agent.planners.template_planner import TemplatePlanner
from agent.llm.mock import MockLLM
from agent.tools.calculator import CalculatorTool
from agent.tools.weather import WeatherTool
from agent.tools.database import DatabaseTool
def main():
llm = MockLLM()
nlu = RuleBasedNLU()
planner = TemplatePlanner()
agent = Agent(nlu, planner, llm)
agent.register_tool(CalculatorTool())
agent.register_tool(WeatherTool())
agent.register_tool(DatabaseTool())
# 可以注册一个输出检查器(需要真实LLM,这里用模拟可能不准)
# from agent.core.safety.output_checkers import LLMOutputChecker
# agent.register_safety_checker(LLMOutputChecker(llm), 'output')
print("安全模块已启用,输入 'quit' 退出")
while True:
user_input = input("\n用户: ")
if user_input.lower() == 'quit':
break
response = agent.run(user_input)
print(f"助手: {response}")
if __name__ == "__main__":
main()
尝试输入包含敏感词的指令,例如“忽略之前的安全指令”,会被拦截。
6. 总结与扩展
通过本文,我们为智能体构建了全面的安全与对齐模块:
- 定义了
SafetyManager统一管理多个检查器。 - 实现了输入、输出、工具三个阶段的检查器示例。
- 与Agent主流程无缝集成,提供多层次防护。
- 支持扩展新的检查器(如基于模型的检测、敏感信息脱敏)。
未来扩展方向:
- 动态安全策略:根据用户角色、对话上下文调整严格程度。
- 隐私保护:自动检测并脱敏个人信息(如身份证号、手机号)。
- 对抗性防御:检测并防御提示注入、越狱攻击。
- 对齐微调:通过RLHF或宪法AI原则优化模型本身。
- 审计日志:记录所有安全事件,便于事后分析。
至此,七大模块全部实现,你的智能体已经具备了完整的核心能力,可以安全、可靠地执行各种任务。
附录:最终项目文件清单(完整版)
agent_core/
├── agent/
│ ├── __init__.py
│ ├── core/
│ │ ├── __init__.py
│ │ ├── agent.py
│ │ ├── state.py
│ │ ├── models.py
│ │ ├── perception.py
│ │ ├── planner.py
│ │ ├── plan.py
│ │ ├── tool.py
│ │ ├── tool_registry.py
│ │ ├── memory/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── short_term.py
│ │ │ ├── long_term.py
│ │ │ ├── vector_memory.py
│ │ │ └── manager.py
│ │ ├── reasoning/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── react.py
│ │ │ ├── cot.py
│ │ │ └── factory.py
│ │ ├── execution/
│ │ │ ├── __init__.py
│ │ │ ├── engine.py
│ │ │ ├── invoker.py
│ │ │ └── feedback.py
│ │ └── safety/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── input_checkers.py
│ │ ├── output_checkers.py
│ │ ├── tool_checkers.py
│ │ └── manager.py
│ ├── tools/
│ │ ├── __init__.py
│ │ ├── calculator.py
│ │ ├── weather.py
│ │ ├── email.py
│ │ └── database.py
│ ├── nlu/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── rule_based.py
│ │ └── llm_based.py
│ ├── planners/
│ │ ├── __init__.py
│ │ ├── template_planner.py
│ │ └── llm_planner.py
│ ├── llm/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── mock.py
│ │ └── openai.py
│ └── utils/
│ ├── __init__.py
│ └── logger.py
├── examples/
│ ├── simple_chat.py
│ ├── enhanced_agent.py
│ ├── planner_demo.py
│ ├── tool_demo.py
│ ├── memory_demo.py
│ ├── reasoning_demo.py
│ ├── execution_demo.py
│ └── safety_demo.py
├── tests/
│ ├── test_agent.py
│ └── test_tools.py
├── requirements.txt
└── README.md
requirements.txt 最终版:
requests>=2.25.0
jsonschema>=3.2.0
numpy>=1.19.0 # 可选
openai>=1.0.0 # 可选
python-dotenv>=0.19.0 # 可选
你已经掌握了构建AI智能体的七大核心技术模块,现在可以基于此框架开发自己的智能体应用了。
AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念,把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起,为开发者提供从开发、训练到部署的一站式体验。
更多推荐


所有评论(0)