bktree.py
author adam@hupp.org
Thu Nov 01 20:49:00 2007 -0400 (10 months ago)
changeset 8 b80fee613e45
parent 6314ef7bb37b3
permissions -rw-r--r--
- comment updates
- final touches to hs version
1 """
2
3 This module implements Burkhard-Keller Trees (bk-tree). bk-trees
4 allow fast lookup of words that lie within a specified distance of a
5 query word. For example, this might be used by a spell checker to
6 find near matches to a mispelled word.
7
8 The implementation is based on the description in this article:
9
10 http://blog.notdot.net/archives/30-Damn-Cool-Algorithms,-Part-1-BK-Trees.html
11
12 Licensed under the PSF license: http://www.python.org/psf/license/
13
14 - Adam Hupp <adam@hupp.org>
15
16 """
17 from itertools import imap, ifilter
18
19
20 class BKTree:
21 def __init__(self, distfn, words):
22 """
23 Create a new BK-tree from the given distance function and
24 words.
25
26 Arguments:
27
28 distfn: a binary function that returns the distance between
29 two words. Return value is a non-negative integer. the
30 distance function must be a metric space.
31
32 words: an iterable. produces values that can be passed to
33 distfn
34
35 """
36 self.distfn = distfn
37
38 it = iter(words)
39 root = it.next()
40 self.tree = (root, {})
41
42 for i in it:
43 self._add_word(self.tree, i)
44
45 def _add_word(self, parent, word):
46 pword, children = parent
47 d = self.distfn(word, pword)
48 if d in children:
49 self._add_word(children[d], word)
50 else:
51 children[d] = (word, {})
52
53 def query(self, word, n):
54 """
55 Return all words in the tree that are within a distance of `n'
56 from `word`.
57
58 Arguments:
59
60 word: a word to query on
61
62 n: a non-negative integer that specifies the allowed distance
63 from the query word.
64
65 Return value is a list of tuples (distance, word), sorted in
66 ascending order of distance.
67
68 """
69 def rec(parent):
70 pword, children = parent
71 d = self.distfn(word, pword)
72 results = []
73 if d <= n:
74 results.append( (d, pword) )
75
76 for i in range(d-n, d+n+1):
77 child = children.get(i)
78 if child is not None:
79 results.extend(rec(child))
80 return results
81
82 # sort by distance
83 return sorted(rec(self.tree))
84
85
86
87 def brute_query(word, words, distfn, n):
88 """A brute force distance query
89
90 Arguments:
91
92 word: the word to query for
93
94 words: a iterable that produces words to test
95
96 distfn: a binary function that returns the distance between a
97 `word' and an item in `words'.
98
99 n: an integer that specifies the distance of a matching word
100
101 """
102 return [i for i in words
103 if distfn(i, word) <= n]
104
105
106 def maxdepth(tree, count=0):
107 _, children = t
108 if len(children):
109 return max(maxdepth(i, c+1) for i in children.values())
110 else:
111 return c
112
113
114 # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#Python
115 def levenshtein(s, t):
116 m, n = len(s), len(t)
117 d = [range(n+1)]
118 d += [[i] for i in range(1,m+1)]
119 for i in range(0,m):
120 for j in range(0,n):
121 cost = 1
122 if s[i] == t[j]: cost = 0
123
124 d[i+1].append( min(d[i][j+1]+1, # deletion
125 d[i+1][j]+1, #insertion
126 d[i][j]+cost) #substitution
127 )
128 return d[m][n]
129
130
131 def dict_words(dictfile="/usr/share/dict/american-english"):
132 "Return an iterator that produces words in the given dictionary."
133 return ifilter(len,
134 imap(str.strip,
135 open(dictfile)))
136
137
138 def timeof(fn, *args):
139 import time
140 t = time.time()
141 res = fn(*args)
142 print "time: ", (time.time() - t)
143 return res
144
145
146
147 if __name__ == "__main__":
148
149 tree = BKTree(levenshtein,
150 dict_words('/usr/share/dict/american-english-large'))
151
152 print tree.query("ricoshet", 2)
153
154 # dist = 1
155 # for i in ["book", "cat", "backlash", "scandal"]:
156 # w = set(tree.query(i, dist)) - set([i])
157 # print "words within %d of %s: %r" % (dist, i, w)