BKTree.hs
author adam@hupp.org
Fri Nov 02 09:01:31 2007 -0400 (10 months ago)
changeset 13 f5790a47b803
parent 126fb16e83b736
permissions -rw-r--r--
fix misspelling
1 {--
2
3 An implementation of Burkhard-Keller trees. These allow fast lookup
4 of words within a certain distance of a query word.
5
6 The implementation is based on the description in this article:
7
8 http://blog.notdot.net/archives/30-Damn-Cool-Algorithms,-Part-1-BK-Trees.html
9
10
11 - Adam Hupp <adam@hupp.org>
12
13 --}
14
15 module BKTree ( queryTree,
16 mkTree,
17 levenshtein,
18 main
19 ) where
20
21
22 import List
23 import qualified Data.Map as Map
24
25 type Children a = Map.Map Int (TreeNode a)
26
27 data TreeNode a = TreeNode a (Children a) deriving Show
28
29 data Tree a = Tree (DistFunc a) (TreeNode a)
30
31 type DistFunc a = a -> a -> Int
32
33
34
35
36 addWord :: DistFunc a -> TreeNode a -> a -> TreeNode a
37 addWord distfn (TreeNode word kids) w =
38 let distance = distfn word w
39 newSubTree = if Map.member distance kids then
40 addWord distfn (kids Map.! distance) w
41 else
42 TreeNode w Map.empty
43 childUpdate = Map.insert distance newSubTree kids
44 in
45 TreeNode word childUpdate
46
47
48
49
50 queryTreeNode :: DistFunc a -> a -> Int -> TreeNode a -> [(Int, a)]
51 queryTreeNode distfunc qword n (TreeNode word kids) =
52 let dist = distfunc word qword
53 range = [dist-n..dist+n+1]
54 kidsInRange = mapSelect kids range
55 childResults = concatMap (queryTreeNode distfunc qword n) kidsInRange
56 in if dist <= n then
57 (dist, word) : childResults
58 else
59 childResults
60
61 compareBy :: Ord b => (a -> b) -> a -> a -> Ordering
62 compareBy by lhs rhs = compare (by lhs) (by rhs)
63
64 mapSelect :: Ord k => Map.Map k v -> [k] -> [v]
65 mapSelect mp (x:xs) =
66 if Map.member x mp then
67 (mp Map.! x ) : mapSelect mp xs
68 else mapSelect mp xs
69 mapSelect _ [] = []
70
71 mkTree :: DistFunc a -> [a] -> Tree a
72 mkTree distfn (w:ws) =
73 let root = TreeNode w Map.empty
74 tree = foldl (addWord distfn) root ws
75 in
76 Tree distfn tree
77
78
79 queryTree :: Tree a -> a -> Int -> [(Int,a)]
80 queryTree (Tree distfn root) qword n =
81 sortBy (compareBy fst) $ queryTreeNode distfn qword n root
82
83 main = do dictFile <- readFile "/usr/share/dict/american-english"
84 words <- return $ lines dictFile
85 tree <- return $ mkTree levenshtein words
86 putStrLn (show (queryTree tree "the" 2))
87
88 -- http://www.cse.unsw.edu.au/~dons/code/lambdabot/Lib/Util.hs
89 -- | Levenshtein edit-distance algorithm
90 -- Translated from an Erlang version by Fredrik Svensson and Adam Lindberg
91 --
92 levenshtein :: String -> String -> Int
93 levenshtein [] [] = 0
94 levenshtein s [] = length s
95 levenshtein [] s = length s
96 levenshtein s t = lvn s t [0..length t] 1
97
98 lvn :: String -> String -> [Int] -> Int -> Int
99 lvn [] _ dl _ = last dl
100 lvn (s:ss) t dl n = lvn ss t (lvn' t dl s [n] n) (n + 1)
101
102 lvn' :: String -> [Int] -> Char -> [Int] -> Int -> [Int]
103 lvn' [] _ _ ndl _ = ndl
104 lvn' (t:ts) (dlh:dlt) c ndl ld | length dlt > 0 = lvn' ts dlt c (ndl ++ [m]) m
105 where
106 m = foldl1 min [ld + 1, head dlt + 1, dlh + (dif t c)]
107 lvn' _ _ _ _ _ = error "levenshtein, ran out of numbers"
108
109 dif :: Char -> Char -> Int
110 dif = (fromEnum .) . (/=)