Spelling correction using the Python Natural Language Toolkit (nltk)
http://www.biais.org/blog/index.php/2007/01/31/25-spelling-correction-using-the-python-natural-language-toolkit-nltk
Outputs:
birdd - Did you mean "birds" ? (or "bird")
oklaoma - Did you mean "oklahoma" ?
emphasise - Did you mean "emphasize" ? (or "emphasizes", "emphasizing")
bird - This word seems OK
carot - I can't found similar word in my learned db
Here is the class:
import operator
import sys
from collections import defaultdict

try:
    from nltk_lite import tokenize
    from nltk_lite.corpora import brown
    from nltk_lite.stem.porter import Porter
except ImportError:
    # nltk_lite only supplies the default stemmer and the default training
    # corpus.  Keep the module importable without it so the class can still
    # be used with a caller-supplied stemmer and caller-supplied sentences;
    # an ImportError is raised later, at the point a default is really needed.
    tokenize = brown = Porter = None


def sortby(nlist, n, reverse=0):
    """Sort ``nlist`` in place by item ``n`` of each element.

    ``reverse`` is forwarded to ``list.sort``; a true value sorts descending.
    Returns None, like ``list.sort``.
    """
    nlist.sort(key=operator.itemgetter(n), reverse=reverse)


class mydict(dict):
    """A dict whose missing keys read as 0, so ``d[key] += 1`` counts."""

    def __missing__(self, key):
        return 0


class DidYouMean:
    """Suggest spelling corrections from words seen in a training corpus.

    Words are bucketed by ``specialhash``; a word whose hash matches a bucket
    but which is not itself in the bucket is treated as a likely misspelling
    of the words in that bucket.
    """

    def __init__(self, stemmer=None):
        """Create a corrector with an empty vocabulary.

        stemmer -- any object with a ``stem(word)`` method.  Defaults to
                   nltk_lite's ``Porter()``.

        Raises ImportError if no stemmer is given and nltk_lite is missing.
        """
        if stemmer is None:
            if Porter is None:
                raise ImportError(
                    "nltk_lite is required for the default Porter stemmer")
            stemmer = Porter()
        self.stemmer = stemmer
        # Maps specialhash(word) -> {lower-cased word: occurrence count}.
        # Created here (not only in learn) so test() works before learn().
        self.learned = defaultdict(mydict)

    def specialhash(self, s):
        """Return a fuzzy key shared by ``s`` and its common misspellings.

        The word is lower-cased, "z" becomes "s", every "h" is dropped, each
        doubled letter is collapsed once per letter (a single replace pass,
        so "aaa" becomes "aa"), and the result is stemmed.
        """
        s = s.lower()
        s = s.replace("z", "s")
        s = s.replace("h", "")
        for letter in "abcdefghijklmnopqrstuvwxyz":
            s = s.replace(letter + letter, letter)
        return self.stemmer.stem(s)

    def test(self, token):
        """Return a human-readable verdict string for ``token``.

        The verdict is one of: the word is known, one or more suggestions
        (most frequent first), or nothing similar was learned.
        """
        hashed = self.specialhash(token)
        if hashed in self.learned:
            # list(...) so the in-place sort also works on Python 3 dict views.
            words = list(self.learned[hashed].items())
            sortby(words, 1, reverse=1)
            if words:
                # Learned words are stored lower-cased, so compare the
                # lower-cased token; otherwise "Bird" would be flagged.
                if token.lower() in [word for word, _ in words]:
                    return 'This word seems OK'
                if len(words) == 1:
                    return 'Did you mean "%s" ?' % words[0][0]
                alternatives = ", ".join(
                    ['"' + word + '"' for word, _ in words[1:]])
                return 'Did you mean "%s" ? (or %s)' % (words[0][0],
                                                       alternatives)
        return "I can't found similar word in my learned db"

    def learn(self, listofsentences=None, n=2000):
        """(Re)build the vocabulary from at most the first ``n`` sentences.

        listofsentences -- iterable of sentences, each an iterable of word
                           strings.  ``None`` or ``[]`` means nltk_lite's
                           ``brown.raw()`` (presumably tokenised sentences).
        n               -- maximum number of sentences to read.

        Any previously learned vocabulary is discarded.
        Raises ImportError if the default corpus is needed but nltk_lite is
        not installed.
        """
        self.learned = defaultdict(mydict)
        # None replaces the old mutable ``[]`` default; an explicit [] keeps
        # its original meaning of "use the Brown corpus".
        if listofsentences is None or listofsentences == []:
            if brown is None:
                raise ImportError(
                    "nltk_lite is required for the default Brown corpus")
            listofsentences = brown.raw()
        for i, sent in enumerate(listofsentences):
            if i >= n:  # Limit to the first n sentences of the corpus
                break
            for word in sent:
                self.learned[self.specialhash(word)][word.lower()] += 1


def demo():
    """Train on the Brown corpus and print verdicts for a few sample words."""
    d = DidYouMean()
    d.learn()
    # choice of words to be relevant related to the brown corpus
    for i in "birdd, oklaoma, emphasise, bird, carot".split(", "):
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("%s - %s" % (i, d.test(i)))


if __name__ == "__main__":
    demo()