python 高效的list 去重方式

list 去重是编程中经常用到的,python 的去重方式很灵活,下面介绍多种去重方式,并比较得出最高效的方式。

python 高效的list 去重方式

直接贴出最后结果

1
2
3
list(set(seq))
# or
{}.fromkeys(seq).keys()

完整的比较测试

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
from random import shuffle, randint
import re
from sets import Set

def f1(seq): # Raymond Hettinger
    # not order preserving
    set = {}
    map(set.__setitem__, seq, [])
    return set.keys()

    
def f2(seq):   # *********
    # order preserving
    checked = []
    for e in seq:
        if e not in checked:
            checked.append(e)
    return checked

def f3(seq):
    # Not order preserving
    keys = {}
    for e in seq:
        keys[e] = 1
    return keys.keys()

def f4(seq): # ********** order preserving
    noDupes = []
    [noDupes.append(i) for i in seq if not noDupes.count(i)]
    return noDupes

def f5(seq, idfun=None): # Alex Martelli ******* order preserving
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        # in old Python versions:
        # if seen.has_key(marker)
        # but in new ones:
        if marker in seen: continue
        seen[marker] = 1
        result.append(item)
    return result

def f5b(seq, idfun=None): # Alex Martelli ******* order preserving
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        # in old Python versions:
        # if seen.has_key(marker)
        # but in new ones:
        if marker not in seen:
            seen[marker] = 1
            result.append(item)
            
    return result

def f6(seq):
    # Not order preserving
    return list(Set(seq))

def f7(seq):
    # Not order preserving
    return list(set(seq))

def f8(seq): # Dave Kirby
    # Order preserving
    seen = set()
    return [x for x in seq if x not in seen and not seen.add(x)]

def f9(seq):
    # Not order preserving
    return {}.fromkeys(seq).keys()

def f10(seq, idfun=None): # Andrew Dalke
    # Order preserving
    return list(_f10(seq, idfun))

def _f10(seq, idfun=None):
    seen = set()
    if idfun is None:
        for x in seq:
            if x in seen:
                continue
            seen.add(x)
            yield x
    else:
        for x in seq:
            x = idfun(x)
            if x in seen:
                continue
            seen.add(x)
            yield x
            
            
def f11(seq): # f10 but simpler
    # Order preserving
    return list(_f10(seq))

def _f11(seq):
    seen = set()
    for x in seq:
        if x in seen:
            continue
        seen.add(x)
        yield x
            
import time

def timing(f, n, a):
    print f.__name__,
    r = range(n)
    t1 = time.clock()
    for i in r:
        f(a); f(a); f(a); f(a); f(a); f(a); f(a); f(a); f(a); f(a)
    t2 = time.clock()
    print round(t2-t1, 3)
    

def getRandomString(length=10, loweronly=1, numbersonly=0,
                    lettersonly=0):
    """ return a very random string """
    _letters = 'abcdefghijklmnopqrstuvwxyz'
    if numbersonly:
        l = list('0123456789')
    elif lettersonly:
        l = list(_letters + _letters.upper())
    else:
        lowercase = _letters+'0123456789'*2
        l = list(lowercase + lowercase.upper())
    shuffle(l)
    s = ''.join(l)
    if len(s) < length:
        s = s + getRandomString(loweronly=1)
    s = s[:length]
    if loweronly:
        return s.lower()
    else:
        return s

testdata = {}
for i in range(35):
    k = getRandomString(5, lettersonly=1)
    v = getRandomString(100 )
    testdata[k] = v
    
testdata = [int(x) for x in list('21354612')]
testdata += list('abcceeaa5efm')
class X:
    def __init__(self, n):
        self.foo = n
    def __repr__(self):
        return "<foo %r>"%self.foo
    def __cmp__(self, e):
        return cmp(self.foo, e.foo)
        
testdata = []
for i in range(10000):
    testdata.append(getRandomString(3, loweronly=True))
#testdata = ['f','g','c','d','b','a','a']

order_preserving = f2, f4, f5, f5b, f8, f10, f11
order_preserving = f5, f5b, f8, f10, f11

not_order_preserving = f1, f3, f6, f7, f9
testfuncs = order_preserving + not_order_preserving

for f in testfuncs:
    if f in order_preserving:
        print "*",
    timing(f, 100, testdata)

在我机器上运行结果

1
2
3
4
5
6
7
8
9
10
* f5 3.51
* f5b 3.49
* f8 2.02
* f10 2.57
* f11 2.54
f1 1.54
f3 1.25
f6 1.36
f7 0.89
f9 0.89

总结

下面两种最高效

1
2
3
4
5
6
7
def f7(seq):
    # Not order preserving
    return list(set(seq))

def f9(seq):
    # Not order preserving
    return {}.fromkeys(seq).keys()

去除文件\r\n

1
2
3
with open("file.txt") as rf, open("out.txt", "w") as wf:
    for line in rf:
        wf.write("%s\r\n" % line.strip())

本文网址: https://pylist.com/topic/75.html 转摘请注明来源

Suggested Topics

Python List 按键高效排序方法

Python含有许多古老的排序规则,这些规则在你创建定制的排序方法时会占用很多时间,而这些排序方法运行时也会拖延程序实际的运行速度。...

python 解析电子书的信息

epub 书是可供人们下载的开放性资源格式的电子图书。epub 文件通常与类似亚马逊Kindle 这样的电子阅读器不兼容。...

python 使用 magic 从文件内容判断文件类型

使用 python-magic 库可以轻松识别文件的类型,python-magic是libmagic文件类型识别库的python接口。libmagic通过根据预定义的文件类型列表检查它们的头文件来识别文件类型。 ...

一个简单高效的LRU 缓存,golang 实现

LRU(Least recently used,最近最少使用)是根据数据的历史访问记录来进行淘汰数据,其核心思想是“如果数据最近被访问过,那么将来被访问的几率也更高”。...

Leave a Comment