Never been to DZone Snippets before?

Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world.

Matching quoted strings in Ruby (See related posts)

An exercise in string processing and regexp matching, inspired by Parsing Quoted Strings in Ruby and Stupid Ruby Quoting Tricks.

#!/usr/local/bin/ruby -w

# Candidate inputs for the quote matcher. Only the LAST entry of the list is
# actually parsed; move another sample to the end of the list to try it.
samples = [
   # some input examples
   'foo "bar baz" qux',
   'foo "bar baz " "bar baz" " bar baz" "bar "klr mre" " " \' "abc" \' baz " qux',
   '" \' \'    " \n "   " \' \' "" foo \'ttt sss\' "bar "qqq zzz" baz" "added term" qux  " \' \'    "  yyy xxx',
   '"""frickin \'#{bar}\'"""',
   '""    "frickin chicken "    #{bar}""""',
   '"""frickin "#{bar}""""',
   '"a "b c" "d "e" f g" """h""""',       # cf. http://snippets.dzone.com/posts/show/4852

   # escaped quotes
   '\"',
   "\\\"",
   '\\\'',
   "\\'",

   # special cases
   '"G","H I"',
   '"G","H I""G","H I"',

   '"abc""def"',
   '"""a""b"',
   '"abc""def""abc""def""abc""def"',
   '"a"\'\'"b"',

   "abc,def,\"efg,hij\",klm,nop,\"qrstuv\",wxyz"
]

str = samples.last

# Echo the chosen input, both raw and inspected, framed by blank lines.
puts '', "input string:  #{str}", "str.inspect :  #{str.inspect}", ''

# Baseline for the character-count check performed after parsing: the number
# of characters of the raw input that are in a-zA-Z_0-9 but are not NUL, "d"
# or "s" (String#count intersects its arguments). Presumably "d"/"s" are left
# out because the placeholders for escaped quotes below are built from them.
num_of_chars1 = str.count('a-zA-Z_0-9', "^\000ds")

# error_code becomes 1 on a parsing error; Shellwords is then used instead of
# regex1 & regex2. str2 keeps an untouched copy of the input.
error_code = 0
str2 = str.clone

# encode escaped quotes:  \"  ->  NUL d NUL      \'  ->  NUL s NUL
str = str.gsub(/\\"|\\'/) { |escaped| escaped == '\\"' ? "\000d\000" : "\000s\000" }

dq_count = str.count('"')
sq_count = str.count("'")

# Refuse inputs whose (unescaped) quotes cannot possibly pair up.
case [dq_count.odd?, sq_count.odd?]
when [true, true]
   raise ArgumentError, "\e[1modd number of single & double quotes\e[m in: #{str}\nsq_count: #{sq_count}\ndq_count: #{dq_count}\n"
when [true, false]
   raise ArgumentError, "\e[1modd number of double quotes\e[m in: #{str}\ndq_count: #{dq_count}\n"
when [false, true]
   raise ArgumentError, "\e[1modd number of single quotes\e[m in: #{str}\nsq_count: #{sq_count}\n"
end

# regex1 separates substrings that contain quotes from substrings that do not contain quotes
#
# First alternative:  a run of characters that are neither " nor '.
# Second alternative: starts at a quote and extends lazily to a quote that is
# not followed by any further quote (negative lookahead). Because of the /m
# flag "." also matches newlines, so the span may cross lines.
regex1 = %r{[^"']+|["'].*?["'](?!.*["'])}m  

# example
#"abc 'quote1' pjk 'quote2' xyz".scan(regex1) { |m| puts m } 


# regex2 tokenizes one quoted chunk produced by regex1. The scan loop applies
# it after wrapping the chunk in spaces and doubling every space, which is why
# most alternatives expect a whitespace character next to a quote.
# Alternatives are tried from top to bottom, so their order matters.
#
# NOTE(review): the bare alternatives \s'\s and \s"\s are listed before the
# longer ones that start with the same three characters (\s'\s+'\s, \s"\s+"\s,
# \s"\s[^"]+\s"\s, \s'\s[^']+\s'\s), so those longer alternatives look
# unreachable -- confirm whether that ordering is intended.
regex2 = %r{
\s*["'][^"']+["'][[:punct:]]["'][^"']+["']|  # special case:  xxx "ab c","def g" yyy
\s*["'][^"']+["']{2,}[^"']+["']|             # special case:  xxx "abc""def" yyy
\s*["']\S+["']|

\s'\s|                       # xxx ' yyy
\s"\s|                       # xxx " yyy
\s''\s|                      # xxx '' yyy
\s""\s|                      # xxx "" yyy
\s'\s+'\s|                   # xxx '   ' yyy
\s"\s+"\s|                   # xxx "   " yyy
\s"\s[^"]+\s"\s|             # xxx " abc " yyy
\s'\s[^']+\s'\s|             # xxx ' abc ' yyy
\s["']["']+(?=[^"'\s])|      # :qoblock:  xxx "'""'abc yyy
[^"'\s]["']["']+(?=\s)|      # :qcblock:  xxx abc"'""' yyy
\s""+|                       # :dqoblock:  xxx """abc yyy
\s''+|                       # :sqoblock:  xxx '''abc yyy
[^"]""+|                     # :dqcblock:  xxx abc"" yyy
[^']''+|                     # :sqcblock:  xxx abc'' yyy
\s["'](?=[^"'\s])|           # :dqo or :sqo:  xxx "abc yyy  or  xxx 'abc yyy
[^"'\s]["'](?=\s)|           # :dqc or :sqc:  xxx abc" yyy  or  xxx abc' yyy
[^"']+[^"'\s](?=\s)          # no quotes at all
}mx


=begin

There are different kinds of quotes matched by regex2 above. They include:

- :sqo (single quote open)
- :sqc (single quote close)
- :sqoblock (single quote open block)
- :sqcblock (single quote close block)

- :dqo (double quote open)
- :dqc (double quote close)
- :dqoblock (double quote open block)
- :dqcblock (double quote close block)

- :qoblock (quote open block)
- :qcblock (quote close block)

=end


# Main pass: split the (escape-encoded) input +str+ into tokens collected in +ret+.
#
# regex1 cuts the input into chunks. A chunk that does not start with a quote
# is split on whitespace. A chunk that starts with a quote is padded with a
# space on each side, has every space doubled, and is scanned with regex2:
#
# * each regex2 match is classified (open quote, close quote, quote block,
#   stand-alone special case, or plain text);
# * quote positions are recorded in +ar+ as [kind, index, number_of_quotes],
#   while +open_quotes+ / +close_quotes+ count the quotes seen so far;
# * as soon as both counters are equal, the slice of the chunk between the
#   first and the last recorded index is appended to +ret+ (doubled spaces
#   collapsed, outer quote characters dropped) and the state is reset.
#
# If the counters are not both zero when a chunk is exhausted, +error_code+ is
# set to 1 and a message is printed. The counters are also printed after every
# classified match (diagnostic output).
ret = []

str.scan(regex1) do |s|

   if s !~ /\A["']/

      #puts "s1: #{s}"
      #puts "s1.inspect: #{s.inspect}"

      # unquoted chunk: plain whitespace-separated words
      s.split(/\s+/m).each { |t| ret << t unless t.empty? }

   else

      #puts "s2: #{s}"
      #puts "s2.inspect: #{s.inspect}"

      open_quotes = 0
      close_quotes = 0
      ar = []      # entries: [kind, index into s, number of quote characters]

      # add spaces to simplify regex2 matching
      s = "\x20" << s << "\x20"
      s.gsub!(/\x20/, "\x20\x20")


      s.scan(regex2) do |m|

         # get the index of the quote
         # + 1 for leading space or non-space
         # $` is the prematch string

         index = $`.length + 1

         post_match = $'

         #puts
         #puts "index: #{index}"
         #puts "m: #{m.inspect}"
         #puts "m.length: #{m.length}"
         #puts "open_quotes:  #{open_quotes}\nclose_quotes: #{close_quotes}"
         #puts "ret: #{ret.inspect}"
         #puts "ar: #{ar.inspect}"
         #puts


         # --- stand-alone matches: only honoured while no quote is pending ---

         if m =~ /\A\s''\s\z/

            next unless open_quotes == 0 && close_quotes == 0
            ret << ''
            next

         elsif m =~ /\A\s""\s\z/

            next unless open_quotes == 0 && close_quotes == 0
            ret << ""
            next

         # example: xxx "ab c","def g" yyy
         elsif open_quotes.zero? && close_quotes.zero? && m =~ /\A\s*["'][^"']+["'][[:punct:]]["'][^"']+["']\z/ && m.count('"') % 2 == 0 && m.count("'") % 2 == 0

            m = m.gsub(/\x20\x20/, "\x20")
            # cf. http://henrik.nyh.se/2008/03/flickr-style-tag-splitting-in-ruby
            # (block parameter named "part" so that it does not shadow the chunk variable s)
            m = m.split(/"(.+?)"|\s+/).reject { |part| part.empty? }
            #m = m.split(/"(.+?)"|'(.+?)'|\s+/).reject {|s| s.empty? }
            #m = m.split(/"(.+?)"|'(.+?)'|([[:punct:]])|\s+/).reject {|s| s.empty? }
            ret.concat(m)
            next

         # example: xxx "abc""def" yyy
         elsif open_quotes.zero? && close_quotes.zero? && m =~ /\A\s*["'][^"']+["']{2,}[^"']+["']\z/ && m.count('"') % 2 == 0 && m.count("'") % 2 == 0

            m = m.gsub(/\x20\x20/, "\x20")
            m = m.split(/"(.+?)"|\s+/).reject { |part| part.empty? }
            #m = m.split(/"(.+?)"|'(.+?)'|\s+/).reject {|s| s.empty? }
            ret.concat(m)
            next

         elsif open_quotes.zero? && close_quotes.zero? && m =~ /\A\s*["']\S+["']\z/ && m.count('"') % 2 == 0 && m.count("'") % 2 == 0

            ret.concat(m.split(/"(.+?)"|\s+/).reject { |part| part.empty? })
            next

         elsif m =~ /\A\s"\s[^"]+\s"\s\z/

            next unless open_quotes == 0 && close_quotes == 0
            ret << m.gsub(/\x20\x20/, "\x20").strip[1..-2]
            next

         # Fixed: the closing quote tested here used to be " instead of ',
         # which disagreed with the regex2 alternative  \s'\s[^']+\s'\s  and
         # with the double-quote branch just above.
         # NOTE(review): regex2 lists the bare \s'\s / \s"\s alternatives
         # first, so this branch and its three siblings look unreachable
         # today -- confirm.
         elsif m =~ /\A\s'\s[^']+\s'\s\z/

            next unless open_quotes == 0 && close_quotes == 0
            ret << m.gsub(/\x20\x20/, "\x20").strip[1..-2]
            next

         elsif m =~ /\A\s'\s+'\s\z/

            next unless open_quotes == 0 && close_quotes == 0
            ret << m.gsub(/\x20\x20/, "\x20").strip[1..-2]
            next

         elsif m =~ /\A\s"\s+"\s\z/

            next unless open_quotes == 0 && close_quotes == 0
            ret << m.gsub(/\x20\x20/, "\x20").strip[1..-2]
            next


         # --- quote blocks: runs of two or more identical quotes ---
         # A block that is the only thing seen so far, has an even number of
         # quotes and is not followed by another quote of its kind is emitted
         # directly instead of being tracked in ar.

         elsif m =~ /\A\s""+\z/

            l = m.strip.length
            ar << [:dqoblock, index, l]
            old_open_quotes = open_quotes
            open_quotes += l

            if close_quotes == 0 && old_open_quotes == 0 && open_quotes % 2 == 0 && post_match !~ /"/
               ret << m[2..-2]
               open_quotes = 0
               ar.pop
               next
            end


         elsif m =~ /\A\s''+\z/

            l = m.strip.length
            ar << [:sqoblock, index, l]
            old_open_quotes = open_quotes
            open_quotes += l

            if close_quotes == 0 && old_open_quotes == 0 && open_quotes % 2 == 0 && post_match !~ /'/
               ret << m[2..-2]
               open_quotes = 0
               ar.pop
               next
            end


         elsif m =~ /\A[^"]""+\z/

            l = m[1..-1].strip.length
            ar << [:dqcblock, index+l-1, l]      #  index+l-1 is the index of the last closing quote: ''"'[']
            old_close_quotes = close_quotes
            close_quotes += l

            if open_quotes == 0 && old_close_quotes == 0 && close_quotes % 2 == 0 && post_match !~ /"/
               ret << m[2..-2]
               close_quotes = 0
               ar.pop
               next
            end

         elsif m =~ /\A[^']''+\z/

            l = m[1..-1].strip.length
            ar << [:sqcblock, index+l-1, l]
            old_close_quotes = close_quotes
            close_quotes += l

            if open_quotes == 0 && old_close_quotes == 0 && close_quotes % 2 == 0 && post_match !~ /'/
               ret << m[2..-2]
               close_quotes = 0
               ar.pop
               next
            end


         # --- single quotes attached to a word: whitespace before the quote
         #     means "open", a non-space before it means "close" ---

         elsif m =~ /\A\s'\z/

            ar << [:sqo, index, 1]
            open_quotes += 1

         elsif m =~ /\A\S'\z/

            ar << [:sqc, index, 1]
            close_quotes += 1

         elsif m =~ /\A\s"\z/

            ar << [:dqo, index, 1]
            open_quotes += 1

         elsif m =~ /\A\S"\z/

            ar << [:dqc, index, 1]
            close_quotes += 1


         else


            if m =~ /\A\s"\s\z/              # " surrounded by whitespace

               # ambiguous quote: closes if something is still open, opens otherwise
               if open_quotes > close_quotes

                  ar << [:dqc, index, 1]
                  close_quotes += 1

                  # avoid :sqo followed by :dqc or :sqc followed by :dqc
                  if post_match =~ /"/ && open_quotes == close_quotes && (ar.at(-2).first == :sqo || ar.at(-2).first == :sqc)
                     ar.pop
                     ar << [:dqo, index, 1]
                     close_quotes -= 1
                     open_quotes += 1
                  end

               else

                  ar << [:dqo, index, 1]
                  open_quotes += 1

               end


            elsif m =~ /\A\s'\s\z/          # ' surrounded by whitespace

               if open_quotes > close_quotes

                  ar << [:sqc, index, 1]
                  close_quotes += 1

                  # avoid :dqo followed by :sqc or :dqc followed by :sqc
                  if post_match =~ /'/ && open_quotes == close_quotes && (ar.at(-2).first == :dqo || ar.at(-2).first == :dqc)
                     ar.pop
                     ar << [:sqo, index, 1]
                     close_quotes -= 1
                     open_quotes += 1
                  end

               else

                  ar << [:sqo, index, 1]
                  open_quotes += 1

               end


            elsif m =~ /\A\s["']["']+\z/              # :qoblock: xxx "'""'abc yyy

               l = m[1..-1].strip.length
               ar << [:qoblock, index, l]
               old_open_quotes = open_quotes
               open_quotes += l

               if close_quotes == 0 && old_open_quotes == 0 && open_quotes % 2 == 0 && post_match !~ /["']/
                  ret << m[2..-2]
                  open_quotes = 0
                  ar.pop
                  next
               end


            elsif m =~ /\A[^"'\s]["']["']+\z/          # :qcblock: xxx abc"'""' yyy

               l = m[1..-1].strip.length
               ar << [:qcblock, index+l-1, l]
               old_close_quotes = close_quotes
               close_quotes += l

               if open_quotes == 0 && old_close_quotes == 0 && close_quotes % 2 == 0 && post_match !~ /["']/
                  ret << m[2..-2]
                  close_quotes = 0
                  ar.pop
                  next
               end


            elsif m =~ /\A[^"']+[^"'\s]\z/          # part of quoted substring contains neither " nor '
               # plain text is a token of its own only while no quote is pending;
               # otherwise it is picked up by the slice taken below
               next unless open_quotes == 0 && close_quotes == 0
               next if m.strip.empty?
               ret << m.gsub(/\x20\x20/, "\x20").strip; next
            end

         end

         puts
         puts "open_quotes:  #{open_quotes}\nclose_quotes: #{close_quotes}\n"
         #puts "ar: #{ar.inspect}"

         if open_quotes == close_quotes

            #puts "open_quotes & close_quotes: #{close_quotes}"
            puts "ar: #{ar.inspect}"

            # balanced: take everything from the first to the last recorded
            # quote, undo the space doubling and drop the two outer quote characters
            ret << s[ar.first[1]..ar.last[1]].gsub(/\x20\x20/, "\x20")[1..-2] unless ar.empty?

            ar.clear
            open_quotes = 0
            close_quotes = 0

         end

      end   # scan 2

      # leftover unbalanced quotes in this chunk: flag the error
      unless open_quotes.zero? && close_quotes.zero?
        error_code = 1
        puts "\e[1mparsing error\e[m for the quoted string: #{str.strip.squeeze[0..20]}"
        #raise "\e[1mparsing error\e[m for the quoted string: #{str.strip.squeeze[0..20]}"
      end

   end   # if

end   # scan 1



require 'shellwords'

# Sanity check: the parsed tokens must contain as many counted characters
# (a-zA-Z_0-9 minus NUL, "d", "s") as the raw input did.
num_of_chars2 = ret.join.count('a-zA-Z_0-9', "^\000ds")

if num_of_chars1 != num_of_chars2
   error_code = 1
   puts "\n\e[1mparsing error due to wrong number of characters a-zA-Z_0-9\e[m: \n#{num_of_chars2} instead of #{num_of_chars1}\n"
   #raise "\e[1mparsing error due to wrong number of characters a-zA-Z_0-9\e[m: \n#{num_of_chars2} instead of #{num_of_chars1}\n in #{str.strip.squeeze[0..20]}"
end


# use Shellwords in case the quote matching above failed
#if error_code == 1 || ret.join =~ /\A["']+\z/
ret.replace(Shellwords.shellwords(str)) if error_code == 1
#str =~ /\A\S+\z/ ? ret.concat(str.split(/"(.+?)"|\s+/).reject {|s| s.empty? }) : ret.concat(Shellwords::shellwords(str))


# Placeholder -> escaped quote, used to decode the encoded escaped quotes.
decoded_quotes = { "\000d\000" => '\"', "\000s\000" => "\\'" }

puts "\n\e[1mResult\e[m:\n\n"
ret.each.with_index(1) do |token, position|
   readable = token.gsub(/\000d\000|\000s\000/) { |placeholder| decoded_quotes[placeholder] }
   puts "#{position}:  #{readable.inspect}"
end

# For comparison: what Shellwords makes of the untouched input copy.
puts "\n\e[1mShellwords\e[m:\n\n"
Shellwords.shellwords(str2).each.with_index(1) do |token, position|
   puts "#{position}:  #{token.inspect}"
end


#----------------------


# matching quoted strings using backreferences
# See: Regexes in Depth: Advanced Quoted String Matching,
# http://blog.stevenlevithan.com/archives/match-quoted-string

str = '"abc"'

# Variant 1: the body may contain no quote character of either kind.
regex = %r{(["'])([^"']*)(\1)}

# Variant 2 (the one in effect): the body may contain the *other* kind of
# quote, but not the one that opened the string.
# \1 is not treated as a backreference inside a character class, so the former
# [^\1]* ran greedily across the closing quote ('"abc" "def"' gave one match).
# A negative lookahead on the backreference expresses "any character except
# the opening quote"; /m lets "." match newlines as the character class did.
regex = %r{(["'])((?:(?!\1).)*)(\1)}m
p regex

# Each match yields [opening quote, body, closing quote]; print the groups and
# the reassembled quoted string.
str.scan(regex) { |m| p m; p m.join }

You need to create an account or log in to post comments to this site.


Click here to browse all 4834 code snippets

Related Posts