python 高效的list 去重方式

list 去重是编程中经常用到的,python 的去重方式很灵活,下面介绍多种去重方式,并比较得出最高效的方式。

python 高效的list 去重方式

直接贴出最后结果

1
2
3
list(set(seq))
# or
{}.fromkeys(seq).keys()

完整的比较测试

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
from random import shuffle, randint
import re
from sets import Set

def f1(seq): # Raymond Hettinger
    # not order preserving
    set = {}
    map(set.__setitem__, seq, [])
    return set.keys()

    
def f2(seq):   # *********
    # order preserving
    checked = []
    for e in seq:
        if e not in checked:
            checked.append(e)
    return checked

def f3(seq):
    # Not order preserving
    keys = {}
    for e in seq:
        keys[e] = 1
    return keys.keys()

def f4(seq): # ********** order preserving
    noDupes = []
    [noDupes.append(i) for i in seq if not noDupes.count(i)]
    return noDupes

def f5(seq, idfun=None): # Alex Martelli ******* order preserving
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        # in old Python versions:
        # if seen.has_key(marker)
        # but in new ones:
        if marker in seen: continue
        seen[marker] = 1
        result.append(item)
    return result

def f5b(seq, idfun=None): # Alex Martelli ******* order preserving
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        # in old Python versions:
        # if seen.has_key(marker)
        # but in new ones:
        if marker not in seen:
            seen[marker] = 1
            result.append(item)
            
    return result

def f6(seq):
    # Not order preserving
    return list(Set(seq))

def f7(seq):
    # Not order preserving
    return list(set(seq))

def f8(seq): # Dave Kirby
    # Order preserving
    seen = set()
    return [x for x in seq if x not in seen and not seen.add(x)]

def f9(seq):
    # Not order preserving
    return {}.fromkeys(seq).keys()

def f10(seq, idfun=None): # Andrew Dalke
    # Order preserving
    return list(_f10(seq, idfun))

def _f10(seq, idfun=None):
    seen = set()
    if idfun is None:
        for x in seq:
            if x in seen:
                continue
            seen.add(x)
            yield x
    else:
        for x in seq:
            x = idfun(x)
            if x in seen:
                continue
            seen.add(x)
            yield x
            
            
def f11(seq): # f10 but simpler
    # Order preserving
    return list(_f10(seq))

def _f11(seq):
    seen = set()
    for x in seq:
        if x in seen:
            continue
        seen.add(x)
        yield x
            
import time

def timing(f, n, a):
    print f.__name__,
    r = range(n)
    t1 = time.clock()
    for i in r:
        f(a); f(a); f(a); f(a); f(a); f(a); f(a); f(a); f(a); f(a)
    t2 = time.clock()
    print round(t2-t1, 3)
    

def getRandomString(length=10, loweronly=1, numbersonly=0,
                    lettersonly=0):
    """ return a very random string """
    _letters = 'abcdefghijklmnopqrstuvwxyz'
    if numbersonly:
        l = list('0123456789')
    elif lettersonly:
        l = list(_letters + _letters.upper())
    else:
        lowercase = _letters+'0123456789'*2
        l = list(lowercase + lowercase.upper())
    shuffle(l)
    s = ''.join(l)
    if len(s) < length:
        s = s + getRandomString(loweronly=1)
    s = s[:length]
    if loweronly:
        return s.lower()
    else:
        return s

testdata = {}
for i in range(35):
    k = getRandomString(5, lettersonly=1)
    v = getRandomString(100 )
    testdata[k] = v
    
testdata = [int(x) for x in list('21354612')]
testdata += list('abcceeaa5efm')
class X:
    def __init__(self, n):
        self.foo = n
    def __repr__(self):
        return "<foo %r>"%self.foo
    def __cmp__(self, e):
        return cmp(self.foo, e.foo)
        
testdata = []
for i in range(10000):
    testdata.append(getRandomString(3, loweronly=True))
#testdata = ['f','g','c','d','b','a','a']

order_preserving = f2, f4, f5, f5b, f8, f10, f11
order_preserving = f5, f5b, f8, f10, f11

not_order_preserving = f1, f3, f6, f7, f9
testfuncs = order_preserving + not_order_preserving

for f in testfuncs:
    if f in order_preserving:
        print "*",
    timing(f, 100, testdata)

在我机器上运行结果

1
2
3
4
5
6
7
8
9
10
* f5 3.51
* f5b 3.49
* f8 2.02
* f10 2.57
* f11 2.54
f1 1.54
f3 1.25
f6 1.36
f7 0.89
f9 0.89

总结

下面两种最高效

1
2
3
4
5
6
7
def f7(seq):
    # Not order preserving
    return list(set(seq))

def f9(seq):
    # Not order preserving
    return {}.fromkeys(seq).keys()

去除文件\r\n

1
2
3
with open("file.txt") as rf, open("out.txt", "w") as wf:
    for line in rf:
        wf.write("%s\r\n" % line.strip())

本文网址: https://pylist.com/topic/75.html 转摘请注明来源

Suggested Topics

Python List 按键高效排序方法

Python含有许多古老的排序规则,这些规则在你创建定制的排序方法时会占用很多时间,而这些排序方法运行时也会拖延程序实际的运行速度。...

3行 Python 代码解简单的一元一次方程

一元一次方程:只含有一个未知数(即“元”),并且未知数的最高次数为1(即“次”)的整式方程叫做一元一次方程(英文名:`linear equation with one unknown`)。...

python 处理命令行参数

Python 完全支持创建在命令行运行的程序,也支持通过命令行参数和短长样式来指定各种选项。...

在 Ubuntu 16.04.6 LTS 系统上安装 Python 3.6.3

自己的阿里云一个 VPS 用的是系统 Ubuntu 16.04.6 LTS,自带的python版本是 `2.7.12` 与 `3.5.2`,有时候要用到 python `3.6`,又不想卸掉原来版本。下面介绍安装 python 3.6.3 的过程,因为版本较旧,遇到一些坑,这里记录一下。...

Leave a Comment