Never been to DZone Snippets before?

Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

About this user

« Newer Snippets
Older Snippets »
Showing 1-4 of 4 total  RSS 

Regex-based parsing technique


#!/usr/local/bin/ruby -w

# Regular expressions and strings with embedded objects
# From: http://t-a-w.blogspot.com/2007/06/regular-expressions-and-strings-with.html
# Author: Tomasz Węgrzanowski
# License: 
# Creative Commons License, http://creativecommons.org/licenses/by-sa/3.0/
# GNU Free Documentation License, http://en.wikipedia.org/wiki/GNU_Free_Documentation_License


def hash_or_die(kw)
  Hash.new{|ht,k| raise "Unknown key: #{k}"}.merge(kw)
end

def parse(data)
  esc = hash_or_die "\\" => "A", "\"" => "B", "n" => "C", "'" => "D"
  rev_esc = hash_or_die "A" => "\\", 'B' => "\"", "C" => "n", "D" => "'"
  data = data.gsub(/\\(.)/) {"\x00" + esc[$1]}
  strs = []
  data = data.gsub(/('[^']*')/) { # '
    strs << $1
    "\x01<#{strs.size-1}>"
  }
  records = []
  data.scan(/\((.*?)\)/) {
    records << $1.split(/,/).map{|field|
      field.gsub(/\x01<(\d+)>/) {
        strs[$1.to_i]}.gsub(/\x00(.)/){ rev_esc[$1]
      }
    }
  }
  records
end

def sql_str_unquote(str)
  str =~ /\A'(.*)'\Z/ or raise "SQL string format is wrong: #{str}"
  $1.gsub(/\\(.)/) {$1}
end


=begin

page_fn = Dir["plwiki-*-page.sql"].sort[-1]
externallinks_fn = Dir["plwiki-*-externallinks.sql"].sort[-1]

pages = {}

File.open(page_fn).each{|line|
  next unless line =~ /\AINSERT INTO `page` VALUES (.*)\Z/
  parse($1).each{|id,ns,title,*stuff|
    next unless ns == "0"
    title = sql_str_unquote(title)
    pages[id] = title
  }
}

File.open(externallinks_fn).each{|line|
  next unless line =~ /\AINSERT INTO `externallinks` VALUES (.*)\Z/
  parse($1).each{|from,to,index|
    title = pages[from]
    next unless title
    to = sql_str_unquote(to)
    next unless to =~ /\Ahttp:\/\//
    puts "#{title}\t#{to}"
  }
}

=end


sql_dump = <<-EOS

INSERT INTO `page` VALUES (1,0,'Astronomia','',1800,0,0,0.600461925007833,'20070601091320',8076762,8584,0), (2,0,'AWK','',329,0,0,0.487812640599732,'20070530195555',8058046,4265,0), (4,0,'Alergologia','',108,0,0,0.580574716050713,'20070520093413',7912844,292,0), ...
INSERT INTO `page` VALUES (14880,0,'Dźwignica_linotorowa','',26,0,0,0.597327036408081,'20060814072401',4282357,727,0), (14881,0,'Urządzenia_transportowe','',91,0,0,0.176666489966834,'20070527090143',2976610,1041,0), ...

EOS


pages = {}

sql_dump.each{|line|
  next unless line =~ /\AINSERT INTO `page` VALUES (.*)\Z/
  parse($1).each{|id,ns,title,*stuff|
    next unless ns == "0"
    title = sql_str_unquote(title)
    pages[id] = title
  }
}


p pages

=begin

sql_dump.each{|line|
  next unless line =~ /\AINSERT INTO `externallinks` VALUES (.*)\Z/
  parse($1).each{|from,to,index|
    title = pages[from]
    next unless title
    to = sql_str_unquote(to)
    next unless to =~ /\Ahttp:\/\//
    puts "#{title}\t#{to}"
  }
}

=end


#-----------------


require 'pp'

lisp_code = '(a (b c) (d (e) f g) (((h))))'
nodes = []

lisp_code.gsub!(/([a-z]+)/) {
  nodes << [:atom, $1]
  "<#{nodes.size-1}>"
}

#p nodes

lisp_code.gsub!(/\s/,"")
#puts lisp_code

true while lisp_code.gsub!(/\(((?:<\d+>)*)\)/) {
  #p nodes
  nodes << [:app, *$1.scan(/<(\d+)>/).map{|x,| nodes[x.to_i]}]
  "<#{nodes.size-1}>"
}
lisp_code =~ /<(\d+)>/

#puts
#p nodes
#puts

pp nodes[$1.to_i]

# Output:
#  [:app,
#  [:atom, "a"],
#  [:app, [:atom, "b"], [:atom, "c"]],
#  [:app, [:atom, "d"], [:app, [:atom, "e"]], [:atom, "f"], [:atom, "g"]],
#  [:app, [:app, [:app, [:atom, "h"]]]]]


#------------------


math_code = '(2 + 2 * 2) / ((2 + 2) * 2)'
nodes = []

math_code.gsub!(/(\d+)/) {
  nodes << $1.to_i
  "<#{nodes.size-1}>"
}
math_code.gsub!(/\s/,"")

until math_code =~ /\A<(\d+)>\Z/
  next if math_code.gsub!(/\((<\d+>)\)/) { $1 }
  next if math_code.gsub!(/<(\d+)>([\*\/])<(\d+)>/) {
    nodes << [$2, nodes[$1.to_i], nodes[$3.to_i]]
    "<#{nodes.size-1}>"
  }
  next if math_code.gsub!(/<(\d+)>([\+\-])<(\d+)>/) {
    nodes << [$2, nodes[$1.to_i], nodes[$3.to_i]]
    "<#{nodes.size-1}>"
  }
end

pp nodes[$1.to_i]

# Output:
# ["/", ["+", 2, ["*", 2, 2]], ["*", ["+", 2, 2], 2]]



CSV parsing regex

The regular expression is taken from Raimond Brookman, Regex fun with CSV.
For a good general CSV overview see The Comma Separated Value (CSV) File Format.
A complete Ruby CSV parsing library is FasterCSV (sudo gem install fastercsv).



csv_data = <<-EOS

fname,lname,age,salary
nancy,davolio,33,$30000
erin,borakova,28,$25250
tony,raphael,35,$28700

"Date","Pupil","Grade"
"25 May","Bloggs, Fred","C"
"25 May","Doe, Jane","B"
"15 July","Bloggs, Fred","D"

123456789,"Carr, Lisa",100000.00
444556666,"Barr, Clark",87000.00
777227878,"Parr, Jack",123000.00
998877665,"Charr, Lee",123000.00

Conference room 1, "John,  
Please bring the M. Mathers file for review  
-J.L.
"
10/18/2002,...

John,Doe,120 jefferson st.,Riverside, NJ, 08075
Jack,McGinnis,220 hobo Av.,Phila, PA,09119
"John ""Da Man""",Repici,120 Jefferson St.,Riverside, NJ,08075
Stephen,Tyler,"7452 Terrace ""At the Plaza"" road",SomeTown,SD, 91234
,Blankman,,SomeTown, SD, 00298
"Joan ""the bone"", Anne",Jet,"9th, at Terrace plc",Desert City,CO,00123

XXXX,D,3-May-02,83.01,83.58,71.13,78.04,9645300
XXXX,D,2-May-02,82.47,85.76,82.05,83.84,7210000,
XXXX,D,1-May-02,86.80,90.83,81.74,85.50,14253300

"1997",car model,E350
1997,car model,E350,"  Super luxurious truck    "
1997,car model,E350,"Go get one now
they are going fast"
1997,car model,E350,"Super ""luxurious"" truck"
1997,car model,E350,"Super, luxurious truck"

1997,car model,E350,"ac, abs, moon",3000.00
1999, car model,"Venture ""Extended Edition""",,4900.00,
1996, car model,Old Car,"BEYOND REPAIR!
air, moon roof, loaded",4799.00

This,is,a test,CSV, file," from ""http://lorance.freeshell.org/csv/test.csv""."
It contains,"quoted text",and,numbers 1234,5678
It also has,"quoted text with an embedded quote""<- right there"
Then there are a few,,blank fields like these here ->,,,
A quoted blank field,"",<- there.
A quoted blank field with newline,"\n",<- there.
This next one causes an error if newline handling is turned off.
"There is a newline here ->
<- and it should be processed correctly."
ABCD
"And here,,, is an""Error - no"
"And here,,, is an"Error - yes
"And here,,, is an",Error - no

1,2,3
ab,"c,d","e""f", "g"",""","h
jk",kl

"aaa","bbb","ccc"
zzz,yyy,xxx
"aaa","b
bb","ccc"
zzz,yyy,xxx

"aaa","b""bb","ccc"

EOS



csv_data.split(/(,|\r\n|\n|\r)(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))/m).each do |csv|
#csv_data.split(/[,\n\r]+(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))/m).each do |csv|

   next if csv.empty?

   csv = csv.strip

   if csv =~ /\A(".*[^"]|[^"].*")\z/m then     # examples: csv => "ab\nc"def  or  abc"de\nf"
       puts
       puts "Error:" 
       p csv 
       puts csv[/\A./mu], csv[/.\z/mu] 
       #puts csv[0..0], csv[-1..-1] 
       puts
       next
   end

   if csv =~ /\A".*"\z/m then csv.gsub!(/\A"(.*)"\z/m, '\1') end  # remove double-quotes at string beginning & end
   if csv =~ /""/m then csv.gsub!(/""/m, '"') end                 # remove a double-quote from double double-quotes

   p csv

end



Parsing UTF-8 encoded strings in Ruby

Instead of using $KCODE = 'UTF8' together with require 'jcode' you can use the /u regex parameter
to parse UTF-8 strings containing multibyte characters.

A Latin1 <-> UTF-8 conversion hack btw can be found here:
http://rubyforge.org/pipermail/fxruby-users/2005-September/000480.html

For comparison just drop the u option!


string = "abc\303\244"  #  \303\244 stands for ä

puts string.scan(/./u).size

puts string.split(//u).reverse.join

puts string.gsub(/.$/u, '')

regex = Regexp.new(/..../u)
md = regex.match(string)
puts md[0].inspect


Parsing a query string with String#scan into a Hash

For more information on the desired hash output, etc. see:

http://redhanded.hobix.com/inspect/injectingAHashBackwardsAndTheMergeBlock.html


qs = "post[id]=4&post[nick]=_why&post[message]=GROSS & FOO!&a=1"  # query string

hash1 = {}
hash2 = {}
resulthash = {}
count = 0

qs.scan(/(post|a)(\[|=)(.*?)(?=(&post\[|&a=|$))/) {

var = $1 << $2 << $3  


case var

when /^(post)\[(.*?)\]=(.*?)$/ then 
               count += 1
               postnum = $1 << count.to_s
               hash1.update(postnum => {$2 => $3})

when /^(a)=(.*?)$/ then resulthash.update($1 => $2)

end


}

hash1.each_pair {|k, v| hash2.update(v)}
resulthash.update("post" => hash2)

puts resulthash.inspect 

« Newer Snippets
Older Snippets »
Showing 1-4 of 4 total  RSS