BloominSimple: Ultra-easy, pure Ruby Bloom filter library
1 2 # NAME: BloominSimple 3 # AUTHOR: Peter Cooper 4 # LICENSE: MIT ( http://www.opensource.org/licenses/mit-license.php ) 5 # COPYRIGHT: (c) 2007 Peter Cooper 6 # DESCRIPTION: Very basic, pure Ruby Bloom filter. Uses my BitField, pure Ruby 7 # bit field library (http://snippets.dzone.com/posts/show/4234). 8 # Supports custom hashing (default is 3). 9 # 10 # Create a Bloom filter that uses default hashing with 1Mbit wide bitfield 11 # bf = BloominSimple.new(1_000_000) 12 # 13 # Add items to it 14 # File.open('/usr/share/dict/words').each { |a| bf.add(a) } 15 # 16 # Check for existence of items in the filter 17 # bf.includes?("people") # => true 18 # bf.includes?("kwyjibo") # => false 19 # 20 # Add better hashing (c'est easy!) 21 # require 'digest/sha1' 22 # b = BloominSimple.new(1_000_000) do |item| 23 # Digest::SHA1.digest(item.downcase.strip).unpack("VVVV") 24 # end 25 # 26 # More 27 # %w{wonderful ball stereo jester flag shshshshsh nooooooo newyorkcity}.each do |a| 28 # puts "#{sprintf("%15s", a)}: #{b.includes?(a)}" 29 # end 30 # 31 # # => wonderful: true 32 # # => ball: true 33 # # => stereo: true 34 # # => jester: true 35 # # => flag: true 36 # # => shshshshsh: false 37 # # => nooooooo: false 38 # # => newyorkcity: false 39 40 require 'benchmark' 41 require 'bitfield' 42 43 class BloominSimple 44 attr_reader :bitfield, :hasher 45 46 def initialize(bitsize, &block) 47 @bitfield = BitField.new(bitsize) 48 @size = bitsize 49 @hasher = block || lambda do |word| 50 word = word.downcase.strip 51 [h1 = word.sum, h2 = word.hash, h2 + h1 ** 3] 52 end 53 end 54 55 def add(item) 56 @hasher[item].each { |hi| @bitfield[hi % @size] = 1 } 57 end 58 59 def includes?(item) 60 @hasher[item].each { |hi| return false unless @bitfield[hi % @size] == 1 } and true 61 end 62 end