Never been to DZone Snippets before?

Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

Spelling correction using the Python Natural Language Toolkit (nltk) (See related posts)

A Google-style "Did you mean" spelling suggester. More details here:
http://www.biais.org/blog/index.php/2007/01/31/25-spelling-correction-using-the-python-natural-language-toolkit-nltk


Outputs:
   1  
   2  birdd - Did you mean "birds" ? (or "bird")
   3  oklaoma - Did you mean "oklahoma" ?
   4  emphasise - Did you mean "emphasize" ? (or "emphasizes", "emphasizing")
   5  bird - This word seems OK
   6  carot - I can't found similar word in my learned db


Here is the class:
   1  
   2  from nltk_lite.stem.porter import Porter
   3  from nltk_lite.corpora import brown
   4  from nltk_lite import tokenize
   5   
   6  import sys
   7  from collections import defaultdict
   8  import operator
   9   
  10  def sortby(nlist ,n, reverse=0):
  11      nlist.sort(key=operator.itemgetter(n), reverse=reverse)
  12   
  13  class mydict(dict):
  14      def __missing__(self, key):
  15          return 0
  16   
  17  class DidYouMean:
  18      def __init__(self):
  19          self.stemmer = Porter()
  20   
  21      def specialhash(self, s):
  22          s = s.lower()
  23          s = s.replace("z", "s")
  24          s = s.replace("h", "")
  25          for i in [chr(ord("a") + i) for i in range(26)]:
  26              s = s.replace(i+i, i)
  27          s = self.stemmer.stem(s)
  28          return s
  29   
  30      def test(self, token):
  31          hashed = self.specialhash(token)
  32          if hashed in self.learned:
  33              words = self.learned[hashed].items()
  34              sortby(words, 1, reverse=1)
  35              if token in [i[0] for i in words]:
  36                  return 'This word seems OK'
  37              else:
  38                  if len(words) == 1:
  39                      return 'Did you mean "%s" ?' % words[0][0]
  40                  else:
  41                      return 'Did you mean "%s" ? (or %s)' \
  42                             % (words[0][0], ", ".join(['"'+i[0]+'"' \
  43                                                        for i in words[1:]]))
  44          return "I can't found similar word in my learned db"
  45   
  46      def learn(self, listofsentences=[], n=2000):
  47          self.learned = defaultdict(mydict)
  48          if listofsentences == []:
  49              listofsentences = brown.raw()
  50          for i, sent in enumerate(listofsentences):
  51              if i >= n: # Limit to the first nth sentences of the corpus
  52                  break
  53              for word in sent:
  54                  self.learned[self.specialhash(word)][word.lower()] += 1
  55   
  56  def demo():
  57      d = DidYouMean()
  58      d.learn()
  59      # choice of words to be relevant related to the brown corpus
  60      for i in "birdd, oklaoma, emphasise, bird, carot".split(", "):
  61          print i, "-", d.test(i)
  62   
  63  if __name__ == "__main__":
  64      demo()

You need to create an account or log in to post comments to this site.


Click here to browse all 5545 code snippets

Related Posts