collections模块
collections 模块提供了 Python 通用内置容器(dict、list、set、tuple)的替代选择,以及专门化的容器数据类型。这些类型在特定场景下比普通内置类型更高效或更方便。
Counter:计数器
Counter 是字典的子类,用于计数可哈希对象:
from collections import Counter
# 统计字符频率
text = "hello world"
counts = Counter(text)
print counts # Counter({'l': 3, 'o': 2, 'h': 1, 'e': 1, ' ': 1, 'w': 1, 'r': 1, 'd': 1})
print counts['l'] # 3
print counts.most_common(3) # [('l', 3), ('o', 2), ('h', 1)] —— 最常见的 3 个
# 统计单词
words = "the quick brown fox jumps over the lazy dog".split()
word_counts = Counter(words)
print word_counts['the'] # 2
Counter 的运算:
c1 = Counter(a=3, b=1)
c2 = Counter(a=1, b=2)
print c1 + c2 # Counter({'a': 4, 'b': 3}) —— 相加
print c1 - c2 # Counter({'a': 2}) —— 相减(只保留正数)
print c1 & c2 # Counter({'a': 1, 'b': 1}) —— 取最小
print c1 | c2 # Counter({'a': 3, 'b': 2}) —— 取最大
defaultdict:默认字典
defaultdict 在访问不存在的键时自动创建默认值,避免 KeyError:
from collections import defaultdict
# 普通字典需要手动处理
groups = {}
for key, value in [('a', 1), ('b', 2), ('a', 3)]:
if key not in groups:
groups[key] = []
groups[key].append(value)
# defaultdict 自动创建列表
groups = defaultdict(list)
for key, value in [('a', 1), ('b', 2), ('a', 3)]:
groups[key].append(value)
print dict(groups) # {'a': [1, 3], 'b': [2]}
常用默认值工厂:
from collections import defaultdict
# 默认整数(计数)
counts = defaultdict(int)
counts['a'] += 1
print counts['a'] # 1
print counts['b'] # 0(自动创建)
# 默认集合
tags = defaultdict(set)
tags['python'].add('programming')
tags['python'].add('language')
print dict(tags) # {'python': set(['language', 'programming'])}
# 默认字典
nested = defaultdict(dict)
nested['user']['name'] = 'Alice'
print dict(nested) # {'user': {'name': 'Alice'}}
OrderedDict:有序字典
OrderedDict 保持键的插入顺序:
from collections import OrderedDict
od = OrderedDict()
od['first'] = 1
od['second'] = 2
od['third'] = 3
for key, value in od.items():
print key, value
# first 1
# second 2
# third 3
普通 dict 在 Python 2.7 中不保证顺序(虽然实现上通常有序,但不应依赖)。
实际应用:LRU 缓存
from collections import OrderedDict
class LRUCache(object):
def __init__(self, capacity):
self.capacity = capacity
self.cache = OrderedDict()
def get(self, key):
if key not in self.cache:
return -1
# 移动到末尾(最近使用)
value = self.cache.pop(key)
self.cache[key] = value
return value
def put(self, key, value):
if key in self.cache:
self.cache.pop(key)
elif len(self.cache) >= self.capacity:
# 移除最旧的
self.cache.popitem(last=False)
self.cache[key] = value
namedtuple:命名元组
namedtuple 创建带有命名字段的元组子类,让元组像对象一样访问:
from collections import namedtuple
Point = namedtuple('Point', ['x', 'y'])
p = Point(11, y=22)
print p.x # 11
print p.y # 22
print p[0] # 11 —— 仍然是元组,支持索引
print p # Point(x=11, y=22)
与普通类的对比:
# namedtuple:不可变,内存小,性能好
Point = namedtuple('Point', ['x', 'y'])
p = Point(1, 2)
# 普通类:可变,功能多
class PointClass(object):
def __init__(self, x, y):
self.x = x
self.y = y
pc = PointClass(1, 2)
namedtuple 适合存储纯数据(Data Transfer Object),不可变且内存占用小。
实际应用:
from collections import namedtuple
# 定义数据结构
Employee = namedtuple('Employee', ['name', 'id', 'department'])
employees = [
Employee('Alice', 'E001', 'Engineering'),
Employee('Bob', 'E002', 'Sales'),
]
for emp in employees:
print "%s (%s) works in %s" % (emp.name, emp.id, emp.department)
deque:双端队列
deque(double-ended queue)支持两端高效添加和移除元素:
from collections import deque
d = deque([1, 2, 3])
d.append(4) # 右端添加
d.appendleft(0) # 左端添加
print list(d) # [0, 1, 2, 3, 4]
print d.pop() # 4 —— 右端移除
print d.popleft() # 0 —— 左端移除
与列表的性能对比:
| 操作 | list | deque |
|---|---|---|
| append | O(1) | O(1) |
| appendleft | O(n) | O(1) |
| pop | O(1) | O(1) |
| popleft | O(n) | O(1) |
列表在头部插入/删除需要移动所有元素,是 O(n)。deque 两端都是 O(1)。
实际应用:滑动窗口
from collections import deque
def moving_average(iterable, n=3):
it = iter(iterable)
d = deque(maxlen=n)
for x in it:
d.append(x)
if len(d) == n:
yield sum(d) / float(n)
for avg in moving_average([40, 30, 50, 46, 39, 44]):
print avg
# 40.0, 42.0, 45.0, 43.0
maxlen 限制队列大小,超出时自动从另一端移除。
选择指南
| 需求 | 推荐类型 |
|---|---|
| 计数 | Counter |
| 自动创建默认值 | defaultdict |
| 保持插入顺序 | OrderedDict |
| 纯数据对象 | namedtuple |
| 两端频繁操作 | deque |
| 一般键值存储 | 普通 dict |