itertools模块
itertools 是 Python 标准库中处理迭代器的工具箱。它提供了高效的迭代器构建块,用于组合、过滤、分组、映射迭代器。这些工具都是惰性求值的,适合处理大数据流。
无限迭代器
count:从指定值开始无限计数
import itertools
for i in itertools.count(10, 2): # 从 10 开始,步长 2
if i > 20:
break
print i,
# 输出:10 12 14 16 18 20
cycle:无限循环遍历一个可迭代对象
counter = 0
for color in itertools.cycle(["red", "green", "blue"]):
print color,
counter += 1
if counter >= 6:
break
# 输出:red green blue red green blue
repeat:重复一个值无限次或指定次数
print list(itertools.repeat("A", 5)) # ['A', 'A', 'A', 'A', 'A']
# 与 map 结合
print list(map(pow, range(10), itertools.repeat(2)))
# [0, 1, 4, 9, 16, 25, 36, 49, 64, 81] —— 0~9 的平方
有限迭代器
chain:连接多个可迭代对象
print list(itertools.chain([1, 2], [3, 4], [5, 6]))
# [1, 2, 3, 4, 5, 6]
# 从嵌套结构扁平化
print list(itertools.chain.from_iterable([[1, 2], [3, 4]]))
# [1, 2, 3, 4]
islice:切片迭代器(惰性)
# 前 5 个
print list(itertools.islice(itertools.count(), 5))
# [0, 1, 2, 3, 4]
# 从第 5 到第 10 个
print list(itertools.islice(itertools.count(), 5, 10))
# [5, 6, 7, 8, 9]
# 步长 2
print list(itertools.islice(itertools.count(), 0, 10, 2))
# [0, 2, 4, 6, 8]
tee:复制迭代器为多个独立迭代器
it1, it2 = itertools.tee([1, 2, 3, 4], 2)
print list(it1) # [1, 2, 3, 4]
print list(it2) # [1, 2, 3, 4]
注意:tee 会在内部缓存“某个副本已经取出、而其他副本尚未取出”的元素;如果一个副本远远领先于另一个,缓存会不断增长并占用大量内存。另外,调用 tee 之后不应再直接使用原来的迭代器,否则各副本可能漏掉元素。
groupby:按 key 函数分组(要求输入已按 key 排序)
data = [("A", 1), ("A", 2), ("B", 3), ("B", 4), ("A", 5)]
# 错误!groupby 要求相同 key 连续
# 正确:先排序
data = sorted(data, key=lambda x: x[0])
for key, group in itertools.groupby(data, key=lambda x: x[0]):
print key, list(group)
# A [(A, 1), (A, 2)]
# B [(B, 3), (B, 4)]
# A [(A, 5)]
组合迭代器
product:笛卡尔积
print list(itertools.product([1, 2], ["A", "B"]))
# [(1, 'A'), (1, 'B'), (2, 'A'), (2, 'B')]
# 重复笛卡尔积
print list(itertools.product([1, 2], repeat=2))
# [(1, 1), (1, 2), (2, 1), (2, 2)]
permutations:排列
print list(itertools.permutations([1, 2, 3], 2))
# [(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]
combinations:组合(无序)
print list(itertools.combinations([1, 2, 3, 4], 2))
# [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
combinations_with_replacement:可重复组合
print list(itertools.combinations_with_replacement([1, 2, 3], 2))
# [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]
过滤与映射
compress:按选择器过滤
print list(itertools.compress(["A", "B", "C", "D"], [1, 0, 1, 0]))
# ['A', 'C']
dropwhile:跳过开头连续满足条件的元素,从第一个不满足条件的元素起全部保留
print list(itertools.dropwhile(lambda x: x < 5, [1, 3, 5, 7, 2, 4]))
# [5, 7, 2, 4] —— 5 开始保留,后面的 2, 4 也保留
takewhile:保留开头连续满足条件的元素,遇到第一个不满足条件的元素就停止
print list(itertools.takewhile(lambda x: x < 5, [1, 3, 5, 7, 2, 4]))
# [1, 3] —— 遇到 5 停止,后面的不处理
ifilterfalse(Python 3 中改名为 filterfalse):保留不满足条件的元素(与内置 filter 相反)
print list(itertools.filterfalse(lambda x: x % 2 == 0, [1, 2, 3, 4, 5]))
# [1, 3, 5]
实际应用
滑动窗口:
def window(iterable, n):
    """Yield successive overlapping windows of length ``n`` as tuples.

    Args:
        iterable: Any iterable; it is consumed lazily, one item per window.
        n: Window length; must be at least 1.

    Yields:
        Tuples of exactly ``n`` consecutive items. Nothing is yielded when
        the input holds fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised when iteration starts).
    """
    # A zero-length window used to produce () followed by 1-tuples; reject it
    # up front, matching what islice already does for negative sizes.
    if n < 1:
        raise ValueError("window size n must be at least 1")
    it = iter(iterable)
    win = tuple(itertools.islice(it, n))
    if len(win) < n:
        # Input too short for even a single full window.
        return
    yield win
    for x in it:
        # Slide by one: drop the oldest item, append the newest.
        win = win[1:] + (x,)
        yield win
print list(window([1, 2, 3, 4, 5], 3))
# [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
扁平化嵌套列表:
def flatten(nested):
    """Lazily yield the leaves of arbitrarily nested lists, left to right.

    Only ``list`` instances are descended into; every other value (tuples and
    strings included) is yielded unchanged.
    """
    # Explicit stack of iterators: the top entry is the list being walked.
    pending = [iter(nested)]
    while pending:
        for element in pending[-1]:
            if isinstance(element, list):
                # Descend; the outer iterator resumes once this one is done.
                pending.append(iter(element))
                break
            yield element
        else:
            # The innermost iterator is exhausted; return to its parent.
            pending.pop()
# 如果只有一层嵌套,可以直接用 itertools.chain.from_iterable;任意深度的嵌套则用上面的 flatten
print list(flatten([1, [2, [3, 4]], 5]))
# [1, 2, 3, 4, 5]
轮询调度:
def round_robin(*iterables):
    """Yield items from several iterables in turn until all are exhausted.

    One item is taken from each iterable per round, in argument order; an
    iterable that runs out is dropped and the rotation continues with the
    rest. For example ``round_robin("ABC", "D", "EF")`` yields
    A, D, E, B, F, C.

    Uses the built-in ``next()`` instead of the Python-2-only ``.next``
    bound method, so the function behaves the same on Python 2.6+ and 3,
    and no longer shadows the ``next`` built-in with a loop variable.
    """
    active = [iter(it) for it in iterables]
    while active:
        survivors = []
        for it in active:
            try:
                value = next(it)
            except StopIteration:
                # This source is finished; leave it out of later rounds.
                continue
            survivors.append(it)
            yield value
        active = survivors
print list(round_robin("ABC", "D", "EF"))
# ['A', 'D', 'E', 'B', 'F', 'C']
性能对比
import timeit
# Time three ways of summing the squares of 0..9999, 1000 runs each.
# List comprehension: builds the full list before sum() consumes it.
t1 = timeit.timeit("sum([x * x for x in range(10000)])", number=1000)
# Generator expression: feeds sum() one value at a time.
t2 = timeit.timeit("sum(x * x for x in range(10000))", number=1000)
# itertools.imap with a lambda; the second positional argument is the setup
# statement, which imports itertools for the timed code.
t3 = timeit.timeit("sum(itertools.imap(lambda x: x * x, range(10000)))",
"import itertools", number=1000)
# NOTE(review): the original note claimed the list version is slowest and the
# other two are close; that ordering is unverified and depends on interpreter
# and machine -- read the printed numbers rather than assuming it.
print t1, t2, t3 # total seconds for each variant
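itertools 的函数用 C 实现,迭代本身的开销很小;当传入的可调用对象也是 C 实现的(如 pow、operator.mul)时,通常比纯 Python 的生成器表达式更快。但像上面这样传入 lambda 时,每个元素仍要执行一次 Python 函数调用,优势会被抵消,具体快慢请以实测结果为准。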