# An exercise in string processing and regexp matching, inspired by the
# articles "Parsing Quoted Strings in Ruby" and
# "Stupid Ruby Quoting Tricks".
# Sample inputs gathered while experimenting with the splitter.
# Only the LAST entry is actually processed by the rest of the script; to try
# a different sample, move it to the end of the list.
# Entries written in single quotes are not interpolated, so the "#{bar}" in
# some of them is literal text.
sample_inputs = [
  'foo "bar baz" qux',
  'foo "bar baz " "bar baz" " bar baz" "bar "klr mre" " " \' "abc" \' baz " qux',
  '" \' \' " \n " " \' \' "" foo \'ttt sss\' "bar "qqq zzz" baz" "added term" qux " \' \' " yyy xxx',
  '"""frickin \'#{bar}\'"""',
  '"" "frickin chicken " #{bar}""""',
  '"""frickin "#{bar}""""',
  '"a "b c" "d "e" f g" """h""""',
  '\"',
  "\\\"",
  '\\\'',
  "\\'",
  '"G","H I"',
  '"G","H I""G","H I"',
  '"abc""def"',
  '"""a""b"',
  '"abc""def""abc""def""abc""def"',
  '"a"\'\'"b"',
  "\"abc'vv'tt\"'klt'",
  "abc,def,\"efg,hij\",klm,nop,\"qrstuv\",wxyz",
  "abc,def,\"efg,hij\",klm, 'nop, \"qrstuv\",wxyz,mmm '",
  "abc,def,\"efg,hij\",klm, \"nop, 'qrstuv',wxyz,mmm \""
]
str = sample_inputs.last
# Echo the selected input, both raw and in its #inspect form, framed by one
# blank line above and one below.
puts '',
     "input string: #{str}",
     "str.inspect : #{str.inspect}",
     ''
# Pre-processing and sanity check of the input before it is tokenized.
#
# Tally of ASCII word characters (letters, digits, underscore), leaving out
# NUL, "d" and "s": String#count intersects its selector arguments, and those
# three characters are exactly what the placeholders introduced below are made
# of. Not read again in the visible part of the script -- presumably compared
# against a later count; TODO confirm.
num_of_chars1 = str.count('a-zA-Z_0-9', "^\000ds")
error_code = 0   # not read in the visible part of the script
str2 = str.clone # copy of the input taken before escaped quotes are masked

# Mask backslash-escaped quotes so they are not counted as real quotes:
# \" becomes NUL d NUL and \' becomes NUL s NUL.
str = str.gsub(/\\"|\\'/) do |escaped|
  if escaped =~ /^\\"$/
    "\000d\000"
  else
    "\000s\000"
  end
end

dq_count = str.count('"')
sq_count = str.count("'")

# Reject input in which the number of double and/or single quotes is odd.
# The "\e[1m" ... "\e[m" pairs are terminal escape sequences wrapped around
# the summary part of each message.
case [dq_count.odd?, sq_count.odd?]
when [true, true]
  raise ArgumentError, "\e[1modd number of single & double quotes\e[m in: #{str}\nsq_count: #{sq_count}\ndq_count: #{dq_count}\n"
when [true, false]
  raise ArgumentError, "\e[1modd number of double quotes\e[m in: #{str}\ndq_count: #{dq_count}\n"
when [false, true]
  raise ArgumentError, "\e[1modd number of single quotes\e[m in: #{str}\nsq_count: #{sq_count}\n"
end
# regex1 cuts the whole input into coarse pieces. Each match is either
#   * a run of one or more characters that contains no quote character, or
#   * a span that starts at a quote character and runs up to a quote character
#     that has no further quote anywhere after it (the negative lookahead).
# The /m flag lets "." match newlines as well, so such a span may cover
# several lines.
regex1 = %r{[^"']+|["'].*?["'](?!.*["'])}m
# regex2 is scanned over a piece that starts with a quote, after that piece
# has been padded with a space on each side and had every space doubled.
# It is one long alternation written in extended (/x) mode, so whitespace and
# "#" comments inside the literal are ignored by the regexp engine. The
# alternatives are tried top to bottom, so their order matters.
# NOTE(review): regex2 contains no ".", so its /m flag looks redundant --
# confirm before removing it.
regex2 = %r{
# experimental: special cases
\s*["'][^"']+["'][[:punct:]]["'][^"']+["']| # special case: xxx "ab c","def g" yyy
\s*["'][^"']+["']{2,}[^"']+["']| # special case: xxx "abc""def" yyy
\s"[^"]+"| # special case: xxx "abc 'def' ghi"
\s'[^']+'|
\s*["']\S+["']| # special case: "abc'vv'tt"'klt'
\s'\s|
\s"\s| # xxx " yyy
\s''\s|
\s""\s|
\s'\s+'\s|
\s"\s+"\s|
\s"\s[^"]+\s"\s| # xxx " abc " yyy
\s'\s[^']+\s'\s| # xxx ' abc ' yyy
\s["']["']+(?=[^"'\s])| # :qoblock: xxx "'""'abc yyy
[^"'\s]["']["']+(?=\s)|
\s""+|
\s''+|
[^"]""+| # :dqcblock: xxx abc"" yyy
[^']''+| # :sqcblock: xxx abc'' yyy
\s["'](?=[^"'\s])|
[^"'\s]["'](?=\s)| # :dqc or :sqc: xxx abc" yyy or xxx abc' yyy
[^"']+[^"'\s](?=\s) # no quotes at all
}mx
=begin
The quote tokens matched by regex2 (defined above) are classified in the scan loop below. The kinds are:
- :sqo (single quote open)
- :sqc (single quote close)
- :sqoblock (single quote open block)
- :sqcblock (single quote close block)
- :dqo (double quote open)
- :dqc (double quote close)
- :dqoblock (double quote open block)
- :dqcblock (double quote close block)
- :qoblock (quote open block)
- :qcblock (quote close block)
=end
ret = []
str.scan(regex1) do |s|
if s !~ /\A["']/
s.split(/\s+/m).each { |t| ret << t unless t.empty? }
else
open_quotes = 0
close_quotes = 0
ar = []
s = "\x20" << s << "\x20"
s.gsub!(/\x20/, "\x20\x20")
s.scan(regex2) do |m|
index = $`.length + 1
post_match = $'
if m =~ /\A\s''\s\z/
next unless open_quotes == 0 && close_quotes == 0
ret << ''
next
elsif m =~ /\A\s""\s\z/
next unless open_quotes == 0 && close_quotes == 0
ret << ""
next
elsif open_quotes.zero? && close_quotes.zero? && m =~ /\A\s*["'][^"']+["'][[:punct:]]["'][^"']+["']\z/ && m.count('"') % 2 == 0 && m.count("'") % 2 == 0
m = m.gsub(/\x20\x20/, "\x20")
m = m.split(/"(.+?)"|\s+/).reject {|sm| sm.empty? }
ret.concat(m)
next
elsif open_quotes.zero? && close_quotes.zero? && m =~ /\A\s*["'][^"']+["']{2,}[^"']+["']\z/ && m.count('"') % 2 == 0 && m.count("'") % 2 == 0
m = m.gsub(/\x20\x20/, "\x20")
m = m.split(/"(.+?)"|\s+/).reject {|sm| sm.empty? }
ret.concat(m)
next
elsif open_quotes.zero? && close_quotes.zero? && m =~ /\A\s"[^"]+"\z/ && m.count('"') % 2 == 0 && m.count("'") % 2 == 0
ret.concat(m.split(/"(.+?)"|\s+/).reject {|sm| sm.empty? })
next
elsif open_quotes.zero? && close_quotes.zero? && m =~ /\A\s'[^']+'\z/ && m.count('"') % 2 == 0 && m.count("'") % 2 == 0
ret.concat(m.split(/'(.+?)'|\s+/).reject {|sm| sm.empty? })
next
elsif open_quotes.zero? && close_quotes.zero? && m =~ /\A\s*["']\S+["']\z/ && m.count('"') % 2 == 0 && m.count("'") % 2 == 0
ret.concat(m.split(/"(.+?)"|\s+/).reject {|sm| sm.empty? })
next
elsif m =~ /\A\s"\s[^"]+\s"\s\z/
next unless open_quotes == 0 && close_quotes == 0
ret << m.gsub(/\x20\x20/, "\x20").strip[1..-2]
next
elsif m =~ /\A\s'\s[^']+\s"\s\z/
next unless open_quotes == 0 && close_quotes == 0
ret << m.gsub(/\x20\x20/, "\x20").strip[1..-2]
next
elsif m =~ /\A\s'\s+'\s\z/
next unless open_quotes == 0 && close_quotes == 0
ret << m.gsub(/\x20\x20/, "\x20").strip[1..-2]
next
elsif m =~ /\A\s"\s+"\s\z/
next unless open_quotes == 0 && close_quotes == 0
ret << m.gsub(/\x20\x20/, "\x20").strip[1..-2]
next
elsif m =~ /\A\s""+\z/
l = m.strip.length
ar << [:dqoblock, index, l]
old_open_quotes = open_quotes
open_quotes += l
if close_quotes == 0 && old_open_quotes == 0 && open_quotes % 2 == 0 && post_match !~ /"/
ret << m[2..-2]
open_quotes = 0
ar.pop
next
end
elsif m =~ /\A\s''+\z/
l = m.strip.length
ar << [:sqoblock, index, l]
old_open_quotes = open_quotes
open_quotes += l
if close_quotes == 0 && old_open_quotes == 0 && open_quotes % 2 == 0 && post_match !~ /'/
ret << m[2..-2]
open_quotes = 0
ar.pop
next
end
elsif m =~ /\A[^"]""+\z/
l = m[1..-1].strip.length
ar << [:dqcblock, index+l-1, l]
old_close_quotes = close_quotes
close_quotes += l
if open_quotes == 0 && old_close_quotes == 0 && close_quotes % 2 == 0 && post_match !~ /"/
ret << m[2..-2]
close_quotes = 0
ar.pop
next
end
elsif m =~ /\A[^']''+\z/
l = m[1..-1].strip.length
ar << [:sqcblock, index+l-1, l]
old_close_quotes = close_quotes
close_quotes += l
if open_quotes == 0 && old_close_quotes == 0 && close_quotes % 2 == 0 && post_match !~ /'/
ret << m[2..-2]
close_quotes = 0
ar.pop
next
end
elsif m =~ /\A\s'\z/
ar << [:sqo, index, 1]
open_quotes += 1
elsif m =~ /\A\S'\z/
ar << [:sqc, index, 1]
close_quotes += 1
elsif m =~ /\A\s"\z/
ar << [:dqo, index, 1]
open_quotes += 1
elsif m =~ /\A\S"\z/
ar << [:dqc, index,