Never been to DZone Snippets before?

Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

About this user

Peter Cooperx http://www.petercooper.co.uk/

« Newer Snippets
Older Snippets »
Showing 1-2 of 2 total  RSS 

BloominSimple: Ultra-easy, pure Ruby Bloom filter library

Requires BitField.

#        NAME: BloominSimple
#      AUTHOR: Peter Cooper
#     LICENSE: MIT ( http://www.opensource.org/licenses/mit-license.php )
#   COPYRIGHT: (c) 2007 Peter Cooper
# DESCRIPTION: Very basic, pure Ruby Bloom filter. Uses my BitField, pure Ruby
#              bit field library (http://snippets.dzone.com/posts/show/4234).
#              Supports custom hashing (default is 3).
#
#              Create a Bloom filter that uses default hashing with 1Mbit wide bitfield
#                bf = BloominSimple.new(1_000_000)
#              
#              Add items to it
#                File.open('/usr/share/dict/words').each { |a| bf.add(a) }
#
#              Check for existence of items in the filter
#                bf.includes?("people")     # => true
#                bf.includes?("kwyjibo")    # => false
#
#              Add better hashing (c'est easy!)
#                require 'digest/sha1'
#                b = BloominSimple.new(1_000_000) do |item|
#                  Digest::SHA1.digest(item.downcase.strip).unpack("VVVV")
#                end
#
#              More
#                %w{wonderful ball stereo jester flag shshshshsh nooooooo newyorkcity}.each do |a|
#                  puts "#{sprintf("%15s", a)}: #{b.includes?(a)}"
#                end
#
#                 #  =>   wonderful: true
#                 #  =>        ball: true
#                 #  =>      stereo: true
#                 #  =>      jester: true
#                 #  =>        flag: true
#                 #  =>  shshshshsh: false
#                 #  =>    nooooooo: false
#                 #  => newyorkcity: false

require 'benchmark'
require 'bitfield'

class BloominSimple
  attr_reader :bitfield, :hasher
  
  def initialize(bitsize, &block)
    @bitfield = BitField.new(bitsize)
    @size = bitsize
    @hasher = block || lambda do |word|
      word = word.downcase.strip
      [h1 = word.sum, h2 = word.hash, h2 + h1 ** 3]
    end
  end
  
  def add(item)
    @hasher[item].each { |hi| @bitfield[hi % @size] = 1 }
  end
  
  def includes?(item)
    @hasher[item].each { |hi| return false unless @bitfield[hi % @size] == 1 } and true
  end
end

BitField: A fast(ish), pure Ruby bit field "type"

#        NAME: BitField
#      AUTHOR: Peter Cooper
#     LICENSE: MIT ( http://www.opensource.org/licenses/mit-license.php )
#   COPYRIGHT: (c) 2007 Peter Cooper (http://www.petercooper.co.uk/)
#     VERSION: v4
#     HISTORY: v4 (fixed bug where setting 0 bits to 0 caused a set to 1)
#              v3 (supports dynamic bitwidths for array elements.. now doing 32 bit widths default)
#              v2 (now uses 1 << y, rather than 2 ** y .. it's 21.8 times faster!)
#              v1 (first release)
#
# DESCRIPTION: Basic, pure Ruby bit field. Pretty fast (for what it is) and memory efficient.
#              I've written a pretty intensive test suite for it and it passes great. 
#              Works well for Bloom filters (the reason I wrote it).
#
#              Create a bit field 1000 bits wide
#                bf = BitField.new(1000)
#
#              Setting and reading bits
#                bf[100] = 1
#                bf[100]    .. => 1
#                bf[100] = 0
#
#              More
#                bf.to_s = "10101000101010101"  (example)
#                bf.total_set         .. => 10  (example - 10 bits are set to "1")

class BitField
  attr_reader :size
  include Enumerable
  
  ELEMENT_WIDTH = 32
  
  def initialize(size)
    @size = size
    @field = Array.new(((size - 1) / ELEMENT_WIDTH) + 1, 0)
  end
  
  # Set a bit (1/0)
  def []=(position, value)
    if value == 1
      @field[position / ELEMENT_WIDTH] |= 1 << (position % ELEMENT_WIDTH)
    elsif (@field[position / ELEMENT_WIDTH]) & (1 << (position % ELEMENT_WIDTH)) != 0
      @field[position / ELEMENT_WIDTH] ^= 1 << (position % ELEMENT_WIDTH)
    end
  end
  
  # Read a bit (1/0)
  def [](position)
    @field[position / ELEMENT_WIDTH] & 1 << (position % ELEMENT_WIDTH) > 0 ? 1 : 0
  end
  
  # Iterate over each bit
  def each(&block)
    @size.times { |position| yield self[position] }
  end
  
  # Returns the field as a string like "0101010100111100," etc.
  def to_s
    inject("") { |a, b| a + b.to_s }
  end
  
  # Returns the total number of bits that are set
  # (The technique used here is about 6 times faster than using each or inject direct on the bitfield)
  def total_set
    @field.inject(0) { |a, byte| a += byte & 1 and byte >>= 1 until byte == 0; a }
  end
end


Here's the tests if you want to run:

require "test/unit"
require "bitfield"

class TestLibraryFileName < Test::Unit::TestCase
  def setup
    @public_bf = BitField.new(1000)
  end
  
  def test_basic
    assert_equal 0, BitField.new(100)[0]
    assert_equal 0, BitField.new(100)[1]
  end
  
  def test_setting_and_unsetting
    @public_bf[100] = 1
    assert_equal 1, @public_bf[100]
    @public_bf[100] = 0
    assert_equal 0, @public_bf[100]
  end

  def test_random_setting_and_unsetting
    100.times do
      index = rand(1000)
      @public_bf[index] = 1
      assert_equal 1, @public_bf[index]
      @public_bf[index] = 0
      assert_equal 0, @public_bf[index]
    end
  end
  
  def test_multiple_setting
    1.upto(999) do |pos|
      2.times { @public_bf[pos] = 1 }
      assert_equal 1, @public_bf[pos]
    end
  end

  def test_multiple_unsetting
    1.upto(999) do |pos|
      2.times { @public_bf[pos] = 0 }
      assert_equal 0, @public_bf[pos]
    end
  end
  
  def test_size
    assert_equal 1000, @public_bf.size
  end
  
  def test_to_s
    bf = BitField.new(10)
    bf[1] = 1
    bf[5] = 1
    assert_equal "0100010000", bf.to_s
  end
  
  def test_total_set
    bf = BitField.new(10)
    bf[1] = 1
    bf[5] = 1
    assert_equal 2, bf.total_set
  end
end
« Newer Snippets
Older Snippets »
Showing 1-2 of 2 total  RSS