《流畅的Python》读书笔记04(补充01): 字典和集合 - Python 3 中字典与集合类型全解析

andylin02

186人浏览 · 2026-05-19 20:21:03

andylin02 · 2026-05-19 20:21:03 发布

一、类型架构总览

二、核心特性对比表

类型	可变性	有序性	键/元素要求	主要用途	时间复杂度（查找）
dict	可变	Python 3.7+保持插入顺序	键必须可散列	键值对映射存储	O(1)
set	可变	无序	元素必须可散列	去重、集合运算	O(1)
frozenset	不可变	无序	元素必须可散列	作为字典键或集合元素	O(1)
defaultdict	可变	同dict	键必须可散列	为缺失键提供默认值	O(1)
OrderedDict	可变	严格保持插入顺序	键必须可散列	需要顺序敏感的映射	O(1)
Counter	可变	同dict	键必须可散列	元素频率统计	O(1)
ChainMap	可变	按映射顺序	键必须可散列	多层映射视图	O(k)*

注：ChainMap查找时间复杂度为O(k)，k为链中映射数量[ref_1]

三、底层实现原理图

散列表（Hash Table）结构

散列表结构示意图：

索引:   0      1      2      3      4      5      6
      +-----+-----+-----+-----+-----+-----+-----+
      |     |     |     |     |     |     |     |
      | 空  | 键值对| 冲突链| 空  | 键值对| 冲突链| 空  |
      |     |     |     |     |     |     |     |
      +-----+-----+-----+-----+-----+-----+-----+
              |           |
              |           +--> 冲突解决（开放寻址或链表）
              |
              v
        键: "name"
        值: "Alice"
        哈希: hash("name") % 7 = 1

字典和集合都基于散列表实现，这决定了它们的关键特性：

O(1)平均时间复杂度的查找、插入和删除
内存开销较大（需要预留空槽减少冲突）
键/元素必须是可散列对象

四、使用场景与实例代码

1. 基础字典操作

# 字典创建方式对比 
# 方式1：字面量（性能最佳）
dict1 = {'name': 'Alice', 'age': 25, 'city': 'New York'}

# 方式2：dict()构造器
dict2 = dict(name='Bob', age=30)

# 方式3：zip两个序列
keys = ['a', 'b', 'c']
values = [1, 2, 3]
dict3 = dict(zip(keys, values))

# 方式4：字典推导式
squares = {x: x**2 for x in range(1, 6)}
print(squares)  # {1: 1, 2: 4, 3: 9, 4: 16, 5: 25}

# 字典推导式流程图 
# 输入 → 遍历可迭代对象 → 条件过滤 → 构造键值对 → 输出字典

2. 集合操作与运算

# 集合创建与基本操作
set1 = {1, 2, 3, 4, 5}
set2 = {4, 5, 6, 7, 8}

# 集合运算
print("并集:", set1 | set2)           # {1, 2, 3, 4, 5, 6, 7, 8}
print("交集:", set1 & set2)           # {4, 5}
print("差集:", set1 - set2)           # {1, 2, 3}
print("对称差集:", set1 ^ set2)       # {1, 2, 3, 6, 7, 8}

# 集合成员检测（O(1)时间复杂度）
if 3 in set1:
    print("3在集合中")

# 集合推导式
even_squares = {x**2 for x in range(10) if x % 2 == 0}
print(even_squares)  # {0, 4, 16, 36, 64}

3. 字典变种应用实例

from collections import defaultdict, OrderedDict, Counter, ChainMap

# defaultdict：自动处理缺失键 
def defaultdict_example():
    # 按首字母分组单词
    words = ['apple', 'banana', 'apricot', 'blueberry', 'cherry']
    grouped = defaultdict(list)
    
    for word in words:
        grouped[word[0]].append(word)
    
    print("分组结果:", dict(grouped))
    # {'a': ['apple', 'apricot'], 'b': ['banana', 'blueberry'], 'c': ['cherry']}

# OrderedDict：顺序敏感操作
def ordereddict_example():
    od = OrderedDict()
    od['z'] = 1
    od['a'] = 2
    od['b'] = 3
    
    print("原始顺序:", list(od.keys()))  # ['z', 'a', 'b']
    
    # 移动元素到末尾
    od.move_to_end('z')
    print("移动后:", list(od.keys()))    # ['a', 'b', 'z']
    
    # FIFO队列行为
    oldest = od.popitem(last=False)
    print("移除最早项:", oldest)        # ('a', 2)

# Counter：频率统计 
def counter_example():
    text = "abracadabra"
    freq = Counter(text)
    
    print("字符频率:", freq)
    # Counter({'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1})
    
    print("最常见2个:", freq.most_common(2))
    # [('a', 5), ('b', 2)]
    
    # Counter支持算术运算
    another = Counter("abcz")
    print("合并计数:", freq + another)
    # Counter({'a': 6, 'b': 3, 'c': 2, 'd': 1, 'r': 2, 'z': 1})

# ChainMap：多层配置管理 
def chainmap_example():
    # 模拟配置优先级：命令行参数 > 用户配置 > 默认配置
    defaults = {'color': 'red', 'size': 'medium'}
    user_prefs = {'color': 'blue', 'theme': 'dark'}
    command_line = {'size': 'large', 'verbose': True}
    
    config = ChainMap(command_line, user_prefs, defaults)
    
    print("颜色配置:", config['color'])     # 'blue'（用户配置优先）
    print("尺寸配置:", config['size'])      # 'large'（命令行参数优先）
    print("主题配置:", config['theme'])     # 'dark'（用户配置）
    print("详细模式:", config.get('verbose', False))  # True

# 执行示例
defaultdict_example()
ordereddict_example()
counter_example()
chainmap_example()

4. 可散列性验证

# 验证不同数据类型的可散列性 
def check_hashable():
    # 可散列的类型
    hashable_types = [
        "string",          # str
        42,                # int
        3.14,              # float
        (1, 2, 3),         # 元组（元素全部可散列）
        frozenset([1, 2]), # frozenset
    ]
    
    # 不可散列的类型
    unhashable_types = [
        [1, 2, 3],         # list
        {1, 2, 3},         # set
        {'a': 1},          # dict
        (1, [2, 3]),       # 包含不可散列元素的元组
    ]
    
    print("可散列类型测试:")
    for item in hashable_types:
        try:
            hash_val = hash(item)
            print(f"  {type(item).__name__}: {item} -> 散列值: {hash_val}")
        except TypeError as e:
            print(f"  {type(item).__name__}: {item} -> 错误: {e}")
    
    print("\n不可散列类型测试:")
    for item in unhashable_types:
        try:
            hash_val = hash(item)
            print(f"  {type(item).__name__}: {item} -> 散列值: {hash_val}")
        except TypeError as e:
            print(f"  {type(item).__name__}: {item} -> 错误: {e}")

check_hashable()

五、性能优化建议

1. 字典操作性能对比

import timeit

def performance_comparison():
    # 测试不同方式处理缺失键的性能
    setup = """
from collections import defaultdict
data = [('a', 1), ('b', 2), ('a', 3), ('c', 4), ('b', 5)]
"""
    
    # 方法1：使用setdefault
    stmt1 = """
result = {}
for key, value in data:
    result.setdefault(key, []).append(value)
"""
    
    # 方法2：使用defaultdict
    stmt2 = """
result = defaultdict(list)
for key, value in data:
    result[key].append(value)
"""
    
    # 方法3：手动检查
    stmt3 = """
result = {}
for key, value in data:
    if key not in result:
        result[key] = []
    result[key].append(value)
"""
    
    t1 = timeit.timeit(stmt1, setup, number=100000)
    t2 = timeit.timeit(stmt2, setup, number=100000)
    t3 = timeit.timeit(stmt3, setup, number=100000)
    
    print(f"setdefault: {t1:.4f}秒")
    print(f"defaultdict: {t2:.4f}秒")
    print(f"手动检查: {t3:.4f}秒")
    print(f"defaultdict比手动检查快 {((t3-t2)/t3*100):.1f}%")

performance_comparison()

2. 内存使用优化

# 使用__slots__减少字典内存开销
class OptimizedUser:
    __slots__ = ('name', 'age', 'email')  # 固定属性，不使用__dict__
    
    def __init__(self, name, age, email):
        self.name = name
        self.age = age
        self.email = email

class RegularUser:
    def __init__(self, name, age, email):
        self.name = name
        self.age = age
        self.email = email

# 测试内存使用
import sys

opt_user = OptimizedUser("Alice", 25, "alice@example.com")
reg_user = RegularUser("Bob", 30, "bob@example.com")

print(f"优化类实例大小: {sys.getsizeof(opt_user)} 字节")
print(f"普通类实例大小: {sys.getsizeof(reg_user)} 字节")
print(f"普通类__dict__大小: {sys.getsizeof(reg_user.__dict__)} 字节")

六、应用场景决策流程图

七、综合实例：构建倒排索引

from collections import defaultdict, Counter
import re

def build_inverted_index(documents):
    """
    构建文档倒排索引
    返回格式：{单词: {文档ID: [位置列表]}}
    """
    index = defaultdict(lambda: defaultdict(list))
    
    for doc_id, text in enumerate(documents):
        # 分词并清理
        words = re.findall(r'\b\w+\b', text.lower())
        
        # 记录每个单词的位置
        for position, word in enumerate(words):
            index[word][doc_id].append(position)
    
    return index

def search_in_index(index, query):
    """
    在倒排索引中搜索查询词
    """
    results = defaultdict(list)
    query_words = re.findall(r'\b\w+\b', query.lower())
    
    for word in query_words:
        if word in index:
            for doc_id, positions in index[word].items():
                results[doc_id].extend([(word, pos) for pos in positions])
    
    # 按文档中匹配数量排序
    sorted_results = sorted(
        results.items(),
        key=lambda x: len(x[1]),
        reverse=True
    )
    
    return sorted_results

# 示例文档
documents = [
    "Python is a powerful programming language",
    "Python supports multiple programming paradigms",
    "Dictionary and set are important Python data structures",
    "Python dictionaries use hash tables for fast lookup"
]

# 构建索引
inverted_index = build_inverted_index(documents)

# 搜索
query = "Python dictionary hash"
search_results = search_in_index(inverted_index, query)

print("搜索查询:", query)
print("\n搜索结果:")
for doc_id, matches in search_results:
    print(f"文档 {doc_id}: {documents[doc_id]}")
    print(f"  匹配项: {matches}")
    print(f"  匹配数: {len(matches)}")
    print()

这个综合示例展示了如何结合使用defaultdict构建复杂的数据结构，利用字典和集合的高效查找特性实现全文搜索功能。倒排索引是搜索引擎的核心数据结构，充分体现了字典在构建映射关系方面的优势。

AtomGit开源社区

AtomGit 是由开放原子开源基金会联合 CSDN 等生态伙伴共同推出的新一代开源与人工智能协作平台。平台坚持“开放、中立、公益”的理念，把代码托管、模型共享、数据集托管、智能体开发体验和算力服务整合在一起，为开发者提供从开发、训练到部署的一站式体验。

更多推荐

艾络迅 × 荣耀：联合推出Meteer AI跳舞机器人玩具，智能科技重新定义儿童陪伴

AtomGit开源社区

面试必问：医学知识库 RAG 怎么设计？这次彻底讲透

医学知识库RAG设计需重点关注准确性、可追溯性和权限控制。医疗场景更适合RAG，因其知识具有明确来源、版本和时效性。关键设计点包括：谨慎文档切片（按章节/小节）、混合召回（关键词+向量）、带来源回答（引用片段+文档+版本）、权限前置处理（召回阶段过滤）。避免常见误区如切片过碎、无来源标记、知识更新不及时。实战中需结合指南检索、科室权限，确保回答准确可追溯。医疗RAG核心在于将知识正确、可追溯地提供