Never been to DZone Snippets before?

Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

About this user

David R. MacIver http://unenterprise.blogspot.com

« Newer Snippets
Older Snippets »
Showing 1-1 of 1 total  RSS 

Haskell Regular Expression Matcher

Basic implementation of Regular Expressions based on "Derivatives of Regular Expressions" by Janusz A. Brzozowski (Journal of Association for Computing Machinery, October 1964)

Not really intended for serious use. Just a proof of concept.

   1  
   2  module Regexp
   3  where
   4  
   5  import Data.Set (Set)
   6  import Data.Map (Map)
   7  import Monad
   8  import List
   9  import Maybe
  10  import qualified Data.Set as Set
  11  import qualified Data.Map as Map
  12  
  13  data Regexp = 
  14      Zero
  15    | Match Char       -- matches a single char
  16    | Not Regexp       -- matches the negation of its argument
  17    | Prod [Regexp]    -- matches a concatentation of its arguments
  18    | Sum (Set Regexp) -- matches either of its arguments
  19    | Star Regexp      -- matches repetitions of its argument (including 0 repetitions)
  20    deriving (Eq, Ord)
  21  
  22  instance Show Regexp where
  23    show Zero = "0"
  24    show (Match c) = [c]
  25    show (Not x)   = '~' : show x
  26    show (Prod x) = join . (map show)  $ x
  27    show (Sum x) = "(" ++ ( join . intersperse "|" . (map show) . Set.toList $ x ) ++ ")"
  28    show (Star x) = "(" ++ show x ++ ")*" 
  29  
  30  -- Flagrant abuse of type classes to allow implicit conversion of datatypes into regular
  31  -- expressions.
  32  class Match a where
  33    match :: a -> Regexp
  34  
  35  instance Match Char where
  36    match c = Match c
  37  
  38  instance (Match a) => Match [a] where
  39    match = con
  40  
  41  instance Match Regexp where
  42    match = id
  43  
  44  -- "smart" versions of the constructors, which perform normalisation of the datatype.
  45  -- As long as all regular expressions are built up using these and the match instance
  46  -- for char we can guarantee that structural equality of terms == similarity.
  47  -- This is important to make sure we only generate a finite number of states.
  48  zero :: Regexp 
  49  zero = Zero 
  50  
  51  one :: Regexp
  52  one = Prod []
  53  
  54  (<+>) :: (Match a, Match b) => a -> b -> Regexp
  55  x <+> y = 
  56    case (match x, match y) of
  57      (Zero, b)      -> b
  58      (a, Zero)      -> a
  59      (Sum a, Sum b) -> Sum (Set.union a b)
  60      (Sum a, b)     -> Sum (Set.insert b a)
  61      (a, Sum b)     -> Sum (Set.insert a b)    
  62      (a, b)         -> Sum $ Set.fromList [a, b]
  63    
  64  oneOf :: (Match a) => [a] -> Regexp
  65  oneOf = foldr (<+>) zero
  66  
  67  (<*>) :: (Match a, Match b) => a -> b -> Regexp
  68  u <*> v = 
  69    case (match u, match v) of 
  70      (Zero, _)         -> zero
  71      (_, Zero)         -> zero
  72      (Prod x, Prod y)  -> Prod (x ++ y)
  73      (Prod x, y)       -> Prod (x ++ [y])
  74      (x, Prod y)       -> Prod (x : y)
  75      (x, y)            -> Prod [x, y]
  76  
  77  con :: (Match a) => [a] -> Regexp
  78  con = foldr (<*>) one
  79  
  80  neg :: (Match a) => a -> Regexp
  81  neg x = 
  82    case (match x) of
  83    (Not y) -> y
  84    y       -> Not y
  85  
  86  star :: (Match a) => a -> Regexp
  87  star x =
  88    case (match x) of
  89      (Zero)   -> Zero
  90      (Star y) -> Star y
  91      y        -> Star y
  92  
  93  -- Returns if the regex matches the empty string.
  94  del :: Regexp -> Bool
  95  del (Zero)    = False
  96  del (Sum x)   = or . map del $ Set.toList x
  97  del (Prod x)  = and . map del $ x
  98  del (Match _) = False
  99  del (Not x)   = not $ del x;
 100  del (Star _)  = True
 101  
 102  -- The derivative of a regular language A with respect to a character
 103  -- c is dA/dc = { s : cs \in A } 
 104  diff :: Char -> Regexp -> Regexp
 105  diff _ (Zero)  = zero
 106  diff c (Match d) | (c == d) = one
 107  diff c (Match d) = zero
 108  diff c (Sum x) = oneOf $ (map $ diff c) (Set.toList x)
 109  diff c (Prod []) = zero
 110  diff c (Prod (x:xs)) | del x = (diff c x <*> xs) <+> diff c (Prod xs)
 111  diff c (Prod (x:xs)) = diff c x <*> xs
 112  diff c (Not x) = Not (diff c x)
 113  diff c (Star x) = diff c x <*> Star x
 114  
 115  flattenSet :: (Ord a) => Set (Set a) -> Set a
 116  flattenSet = Set.fold Set.union Set.empty
 117  
 118  (/>>=) :: (Ord a, Ord b) => Set a -> (a -> Set b) -> Set b
 119  x />>= f = flattenSet (Set.map f x)
 120  
 121  -- The alphabet of all characters that appear in this regexp
 122  alphabet :: Regexp -> Set Char
 123  alphabet (Zero) = Set.empty
 124  alphabet (Sum x) = flattenSet (Set.map alphabet x) 
 125  alphabet (Prod x) = Set.unions $ map alphabet x
 126  alphabet (Not x) = alphabet x
 127  alphabet (Star x) = alphabet x
 128  alphabet (Match c) = Set.singleton c
 129  
 130  -- Set of all derivatives of a regular expression (including itself, and higher order derivatives).
 131  derivatives :: Regexp -> [Regexp]
 132  derivatives exp = Set.toList $ enlarge (Set.singleton exp) (Set.singleton exp) 
 133    where
 134      alpha = alphabet exp
 135      firstDerivatives x = Set.map (`diff` x) alpha 
 136      enlarge :: Set Regexp -> Set Regexp -> Set Regexp
 137      enlarge new found = 
 138        if Set.null new
 139          then found
 140          else
 141            let nextNew   = (new />>= firstDerivatives) Set.\\ found
 142                nextFound = found `Set.union` nextNew
 143            in enlarge nextNew nextFound
 144  
 145  -- A simple finite state machine type 
 146  data FSM = State { transitions :: (Map Char FSM), isFinal :: Bool } 
 147  
 148  -- Converts a Regexp into a finite state machine by using the derivatives
 149  -- with respect to specific characters as the transitions. Essentially at 
 150  -- each stage we build up a regular expression that the remaining characters
 151  -- have to match. Due to Cunning Mathematics, only finitely many such regular
 152  -- expressions (up to similarity) result.
 153  compile :: Regexp -> FSM
 154  compile x = fromJust $ Map.lookup x states
 155    where
 156      states :: Map Regexp FSM
 157      states = Map.fromList $
 158        do re <- derivatives x -- Totally gratuitious use of list monad. :)
 159           let trans = do c <- Set.toList $ alphabet re
 160                          let d = diff c re
 161                          return (c, fromJust $ Map.lookup d states)
 162           let state = State (Map.fromList trans) (del re) 
 163           return (re, state) 
 164  
 165  runFSM :: FSM -> String -> Bool
 166  runFSM x []     = isFinal x
 167  runFSM x (c:cs) = case (Map.lookup c $ transitions x) of
 168                      Nothing -> False
 169                      Just y  -> runFSM y cs
 170                 
 171  matches :: (Match a) => String -> a -> Bool
 172  matches cs exp = runFSM (compile $  match exp) cs
 173  
« Newer Snippets
Older Snippets »
Showing 1-1 of 1 total  RSS