Fast stop word detection in Ruby
List of stop words obtained from http://www.dcs.gla.ac.uk/idom/ir_resources/linguistic_utils/stop_words
# Detect stop words QUICKLY
# Uses a bloom filter instead of searching literally through a list of stopwords
# for > 3x speed increase
#
# using bloom filter:   2.580000 0.030000 2.610000 ( 2.698829)
# using literal search: 7.850000 0.120000 7.970000 ( 8.181684)
#
# Inputs (paths are relative to the working directory / fixed system path):
#   stopwords              - one stop word per line
#   /usr/share/dict/words  - whitespace-separated dictionary used as test input

require 'bloominsimple'
require 'digest/sha1'
require 'benchmark' # provides Benchmark.measure, used for the timing run at the end
require 'pp'

# Create a simple bloom filter that uses a SHA1 hash (more effective than
# BloominSimple's default hashing). The block normalises each word (downcase +
# strip) and unpacks the first 12 bytes of its digest into three 32-bit
# little-endian integers ("VVV").
b = BloominSimple.new(50000) do |word|
  Digest::SHA1.digest(word.downcase.strip).unpack("VVV")
end

# Add stopwords to the bloom filter, and keep a normalised copy in a plain
# array for the literal-search comparison. File.foreach closes the file once
# iteration finishes. The raw line (trailing newline included) is handed to the
# filter because the hashing block above does its own downcase/strip.
stopwords = []
File.foreach('stopwords') do |line|
  b.add(line)
  stopwords << line.downcase.strip
end

# Read in a whole dictionary of regular words (File.read opens, reads and
# closes the file in one call).
words = File.read('/usr/share/dict/words').split.map { |word| word.downcase.strip }

# Define two ways to detect stopwords for comparison..
# The array lookup is deliberately a linear search: it is the baseline the
# bloom filter is being measured against.
using_filter = lambda { |word| b.includes?(word) }
using_array  = lambda { |word| stopwords.include?(word.downcase.strip) }
techniques = [using_filter, using_array]

# Run stopword comparisons with both techniques
filter_results, array_results = techniques.map do |technique|
  words.map { |word| technique[word] }
end

# See how effective the bloom filter has been compared to the literal search:
# print GOOD when both techniques agree on every word, otherwise list each
# word on which they disagree.
if filter_results == array_results
  puts "GOOD"
else
  words.zip(filter_results, array_results).each do |word, by_filter, by_array|
    puts word if by_filter != by_array
  end
end

# Now do speed benchmarks..
techniques.each do |technique|
  puts Benchmark.measure { words.each { |word| technique[word] } }
end