http://www.biais.org/blog/index.php/2007/01/31/25-spelling-correction-using-the-python-natural-language-toolkit-nltk
Outputs:
birdd - Did you mean "birds" ? (or "bird")
oklaoma - Did you mean "oklahoma" ?
emphasise - Did you mean "emphasize" ? (or "emphasizes", "emphasizing")
bird - This word seems OK
carot - I can't found similar word in my learned db
Here is the class:
# Spelling suggestions ("Did you mean ...?") built on a lossy word hash.
#
# Words are learned from a list of sentences and grouped into buckets by
# DidYouMean.specialhash(). A token whose hash falls into a known bucket is
# answered with that bucket's words, most frequent first.
#
# The code is written to run unchanged on both Python 2 and Python 3.

import operator
import sys
from collections import defaultdict

try:
    from nltk_lite.stem.porter import Porter
    from nltk_lite.corpora import brown
    from nltk_lite import tokenize
except ImportError:
    # nltk_lite only backs the default stemmer and the default corpus, so
    # defer the failure to the point where one of them is actually requested.
    Porter = brown = tokenize = None


def sortby(nlist, n, reverse=False):
    """Sort the list ``nlist`` in place by item ``n`` of each element.

    ``reverse`` is forwarded to ``list.sort``. Returns ``None``.
    """
    nlist.sort(key=operator.itemgetter(n), reverse=reverse)


class mydict(dict):
    """A dict whose missing keys read as 0, so ``d[key] += 1`` just works."""

    def __missing__(self, key):
        return 0


class DidYouMean:
    """Suggest known words for a token that hashes like them.

    ``self.learned`` maps a hash (see ``specialhash``) to a ``mydict`` of
    ``{lower-cased word: number of occurrences}``.
    """

    def __init__(self, stemmer=None):
        """Create an empty suggester.

        ``stemmer`` is any object with a ``stem(word)`` method; when omitted,
        ``nltk_lite``'s ``Porter`` stemmer is used.

        Raises ``ImportError`` if no stemmer is given and ``nltk_lite`` is
        not installed.
        """
        if stemmer is None:
            if Porter is None:
                raise ImportError(
                    "nltk_lite is required for the default Porter stemmer")
            stemmer = Porter()
        self.stemmer = stemmer
        # Start empty so test() is usable (and simply finds nothing) before
        # learn() has been called.
        self.learned = defaultdict(mydict)

    def specialhash(self, s):
        """Return the bucket key for ``s``.

        The word is lower-cased, "z" becomes "s", "h" is dropped, each
        doubled letter is collapsed with one replace pass per letter, and
        the result is passed through the stemmer.
        """
        s = s.lower()
        s = s.replace("z", "s")
        s = s.replace("h", "")
        for letter in "abcdefghijklmnopqrstuvwxyz":
            s = s.replace(letter + letter, letter)
        s = self.stemmer.stem(s)
        return s

    def test(self, token):
        """Return a human-readable verdict for ``token``.

        The result is one of: the word seems OK, a "Did you mean" suggestion
        (most frequent learned word first, alternatives in parentheses), or
        a message saying that nothing similar was learned.
        """
        hashed = self.specialhash(token)
        if hashed not in self.learned:
            return "I can't found similar word in my learned db"
        bucket = self.learned[hashed]
        # Learned words are stored lower-cased, so the lookup must be
        # lower-cased too; otherwise "Bird" would be reported as misspelled.
        if token.lower() in bucket:
            return 'This word seems OK'
        # Copy to a list: dict views cannot be sorted in place on Python 3.
        words = list(bucket.items())
        sortby(words, 1, reverse=True)
        if len(words) == 1:
            return 'Did you mean "%s" ?' % words[0][0]
        return 'Did you mean "%s" ? (or %s)' % (
            words[0][0],
            ", ".join(['"' + word + '"' for word, _count in words[1:]]))

    def learn(self, listofsentences=None, n=2000):
        """Rebuild the learned word table from at most ``n`` sentences.

        ``listofsentences`` is iterated sentence by sentence, and each
        sentence word by word (each word must be a string). When it is
        omitted or an empty list, ``brown.raw()`` from ``nltk_lite`` is used.
        Anything learned previously is discarded.

        Raises ``ImportError`` if the default corpus is needed and
        ``nltk_lite`` is not installed.
        """
        self.learned = defaultdict(mydict)
        if listofsentences is None or listofsentences == []:
            if brown is None:
                raise ImportError(
                    "nltk_lite is required to learn from the Brown corpus")
            listofsentences = brown.raw()
        for i, sent in enumerate(listofsentences):
            if i >= n:  # Limit to the first n sentences of the corpus
                break
            for word in sent:
                self.learned[self.specialhash(word)][word.lower()] += 1


def demo():
    """Learn from the default corpus and print a verdict for sample words."""
    d = DidYouMean()
    d.learn()
    # choice of words to be relevant related to the brown corpus
    for i in "birdd, oklaoma, emphasise, bird, carot".split(", "):
        # A single-argument print() emits the same line on Python 2 and 3.
        print("%s - %s" % (i, d.test(i)))


if __name__ == "__main__":
    demo()