collections模块

collections 模块提供了 Python 通用内置容器（dict、list、set、tuple）的替代选择，以及专门化的容器数据类型。这些类型在特定场景下比普通内置类型更高效或更方便。

Counter：计数器

Counter 是字典的子类，用于计数可哈希对象：

from collections import Counter

# 统计字符频率
text = "hello world"
counts = Counter(text)
print counts              # Counter({'l': 3, 'o': 2, 'h': 1, 'e': 1, ' ': 1, 'w': 1, 'r': 1, 'd': 1})
print counts['l']         # 3
print counts.most_common(3)   # [('l', 3), ('o', 2), ('h', 1)] —— 最常见的 3 个

# 统计单词
words = "the quick brown fox jumps over the lazy dog".split()
word_counts = Counter(words)
print word_counts['the']      # 2

Counter 的运算：

c1 = Counter(a=3, b=1)
c2 = Counter(a=1, b=2)

print c1 + c2             # Counter({'a': 4, 'b': 3}) —— 相加
print c1 - c2             # Counter({'a': 2}) —— 相减（只保留正数）
print c1 & c2             # Counter({'a': 1, 'b': 1}) —— 取最小
print c1 | c2             # Counter({'a': 3, 'b': 2}) —— 取最大

defaultdict：默认字典

defaultdict 在访问不存在的键时自动创建默认值，避免 KeyError：

from collections import defaultdict

# 普通字典需要手动处理
groups = {}
for key, value in [('a', 1), ('b', 2), ('a', 3)]:
    if key not in groups:
        groups[key] = []
    groups[key].append(value)

# defaultdict 自动创建列表
groups = defaultdict(list)
for key, value in [('a', 1), ('b', 2), ('a', 3)]:
    groups[key].append(value)

print dict(groups)        # {'a': [1, 3], 'b': [2]}

常用默认值工厂：

from collections import defaultdict

# 默认整数（计数）
counts = defaultdict(int)
counts['a'] += 1
print counts['a']         # 1
print counts['b']         # 0（自动创建）

# 默认集合
tags = defaultdict(set)
tags['python'].add('programming')
tags['python'].add('language')
print dict(tags)          # {'python': set(['language', 'programming'])}

# 默认字典
nested = defaultdict(dict)
nested['user']['name'] = 'Alice'
print dict(nested)        # {'user': {'name': 'Alice'}}

OrderedDict：有序字典

OrderedDict 保持键的插入顺序：

from collections import OrderedDict

od = OrderedDict()
od['first'] = 1
od['second'] = 2
od['third'] = 3

for key, value in od.items():
    print key, value
# first 1
# second 2
# third 3

普通 dict 在 Python 2.7 中不保证顺序（虽然实现上通常有序，但不应依赖）。

实际应用：LRU 缓存

from collections import OrderedDict

class LRUCache(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.cache = OrderedDict()
    
    def get(self, key):
        if key not in self.cache:
            return -1
        # 移动到末尾（最近使用）
        value = self.cache.pop(key)
        self.cache[key] = value
        return value
    
    def put(self, key, value):
        if key in self.cache:
            self.cache.pop(key)
        elif len(self.cache) >= self.capacity:
            # 移除最旧的
            self.cache.popitem(last=False)
        self.cache[key] = value

namedtuple：命名元组

namedtuple 创建带有命名字段的元组子类，让元组像对象一样访问：

from collections import namedtuple

Point = namedtuple('Point', ['x', 'y'])
p = Point(11, y=22)

print p.x             # 11
print p.y             # 22
print p[0]            # 11 —— 仍然是元组，支持索引
print p               # Point(x=11, y=22)

与普通类的对比：

# namedtuple：不可变，内存小，性能好
Point = namedtuple('Point', ['x', 'y'])
p = Point(1, 2)

# 普通类：可变，功能多
class PointClass(object):
    def __init__(self, x, y):
        self.x = x
        self.y = y

pc = PointClass(1, 2)

namedtuple 适合存储纯数据（Data Transfer Object），不可变且内存占用小。

实际应用：

from collections import namedtuple

# 定义数据结构
Employee = namedtuple('Employee', ['name', 'id', 'department'])

employees = [
    Employee('Alice', 'E001', 'Engineering'),
    Employee('Bob', 'E002', 'Sales'),
]

for emp in employees:
    print "%s (%s) works in %s" % (emp.name, emp.id, emp.department)

deque：双端队列

deque（double-ended queue）支持两端高效添加和移除元素：

from collections import deque

d = deque([1, 2, 3])
d.append(4)               # 右端添加
d.appendleft(0)         # 左端添加
print list(d)           # [0, 1, 2, 3, 4]

print d.pop()           # 4 —— 右端移除
print d.popleft()       # 0 —— 左端移除

与列表的性能对比：

操作	list	deque
append	O(1)	O(1)
appendleft	O(n)	O(1)
pop	O(1)	O(1)
popleft	O(n)	O(1)

列表在头部插入/删除需要移动所有元素，是 O(n)。deque 两端都是 O(1)。

实际应用：滑动窗口

from collections import deque

def moving_average(iterable, n=3):
    it = iter(iterable)
    d = deque(maxlen=n)
    
    for x in it:
        d.append(x)
        if len(d) == n:
            yield sum(d) / float(n)

for avg in moving_average([40, 30, 50, 46, 39, 44]):
    print avg
# 40.0, 42.0, 45.0, 43.0

maxlen 限制队列大小，超出时自动从另一端移除。

选择指南

需求	推荐类型
计数	`Counter`
自动创建默认值	`defaultdict`
保持插入顺序	`OrderedDict`
纯数据对象	`namedtuple`
两端频繁操作	`deque`
一般键值存储	普通 `dict`