<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DZone Snippets: indexing code</title>
    <link>http://snippets.dzone.com/posts</link>
    <pubDate>Thu, 24 Jul 2008 06:17:09 GMT</pubDate>
    <description>DZone Snippets: indexing code</description>
    <item>
      <title>Fast stop word detection in Ruby</title>
      <link>http://snippets.dzone.com/posts/show/4236</link>
      <description>Requires &lt;a href="http://snippets.dzone.com/posts/show/4235"&gt;BloominSimple&lt;/a&gt; (a pure Ruby Bloom filter class).&lt;br /&gt;&lt;br /&gt;List of stop words obtained from &lt;a href="http://www.dcs.gla.ac.uk/idom/ir_resources/linguistic_utils/stop_words"&gt;http://www.dcs.gla.ac.uk/idom/ir_resources/linguistic_utils/stop_words&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;code&gt;# Detect stop words QUICKLY&lt;br /&gt;# Uses a bloom filter instead of searching literally through a list of stopwords&lt;br /&gt;# for &gt; 3x speed increase&lt;br /&gt;# &lt;br /&gt;#    using bloom filter: 2.580000   0.030000   2.610000 (  2.698829)&lt;br /&gt;#  using literal search: 7.850000   0.120000   7.970000 (  8.181684)&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;require 'bloominsimple'&lt;br /&gt;require 'digest/sha1'&lt;br /&gt;require 'pp'&lt;br /&gt;&lt;br /&gt;# Create a simple bloom filter that uses a SHA1 hash (more effective than BloominSimple's default hashing)&lt;br /&gt;b = BloominSimple.new(50000) do |word|&lt;br /&gt;  Digest::SHA1.digest(word.downcase.strip).unpack("VVV")&lt;br /&gt;end&lt;br /&gt;&lt;br /&gt;# Add stopwords to the bloom filter!&lt;br /&gt;stopwords = []&lt;br /&gt;File.open('stopwords').each { |a| b.add(a); stopwords &lt;&lt; a.downcase.strip }&lt;br /&gt;&lt;br /&gt;# Read in a whole dictionary of regular words&lt;br /&gt;words = File.open('/usr/share/dict/words').read.split.collect{|a| a.downcase.strip }&lt;br /&gt;&lt;br /&gt;# Define two ways to detect stopwords for comparison..
&lt;br /&gt;using_filter = lambda { |word| b.includes?(word) }&lt;br /&gt;using_array = lambda { |word| stopwords.include?(word.downcase.strip) }&lt;br /&gt;techniques = [using_filter, using_array]&lt;br /&gt;&lt;br /&gt;# Run stopword comparisons with both techniques&lt;br /&gt;t = techniques.collect { |l| words.collect { |a| l[a] } }&lt;br /&gt;&lt;br /&gt;# See how effective the bloom filter has been compared to the literal search&lt;br /&gt;if t[0] == t[1]&lt;br /&gt;  puts "GOOD"&lt;br /&gt;else&lt;br /&gt;  words.zip(t[0],t[1]).each do |x|&lt;br /&gt;    puts x.first if x[1] != x[2]&lt;br /&gt;  end&lt;br /&gt;end&lt;br /&gt;&lt;br /&gt;# Now do speed benchmarks..&lt;br /&gt;techniques.each { |l| puts Benchmark.measure { words.each { |a| l[a] } } }&lt;/code&gt;</description>
      <pubDate>Mon, 02 Jul 2007 03:10:16 GMT</pubDate>
      <guid>http://snippets.dzone.com/posts/show/4236</guid>
      <author>peter (Peter Cooper)</author>
    </item>
  </channel>
</rss>
