Basic implementation of Regular Expressions based on "Derivatives of Regular Expressions" by Janusz A. Brzozowski (Journal of Association for Computing Machinery, October 1964)
Not really intended for serious use. Just a proof of concept.
module Regexp
where
import Data.Set (Set)
import Data.Map (Map)
import Monad
import List
import Maybe
import qualified Data.Set as Set
import qualified Data.Map as Map
data Regexp =
Zero
| Match Char -- matches a single char
| Not Regexp -- matches the negation of its argument
| Prod [Regexp] -- matches a concatentation of its arguments
| Sum (Set Regexp) -- matches either of its arguments
| Star Regexp -- matches repetitions of its argument (including 0 repetitions)
deriving (Eq, Ord)
instance Show Regexp where
show Zero = "0"
show (Match c) = [c]
show (Not x) = '~' : show x
show (Prod x) = join . (map show) $ x
show (Sum x) = "(" ++ ( join . intersperse "|" . (map show) . Set.toList $ x ) ++ ")"
show (Star x) = "(" ++ show x ++ ")*"
-- Flagrant abuse of type classes to allow implicit conversion of datatypes into regular
-- expressions.
class Match a where
match :: a -> Regexp
instance Match Char where
match c = Match c
instance (Match a) => Match [a] where
match = con
instance Match Regexp where
match = id
-- "smart" versions of the constructors, which perform normalisation of the datatype.
-- As long as all regular expressions are built up using these and the match instance
-- for char we can guarantee that structural equality of terms == similarity.
-- This is important to make sure we only generate a finite number of states.
zero :: Regexp
zero = Zero
one :: Regexp
one = Prod []
(<+>) :: (Match a, Match b) => a -> b -> Regexp
x <+> y =
case (match x, match y) of
(Zero, b) -> b
(a, Zero) -> a
(Sum a, Sum b) -> Sum (Set.union a b)
(Sum a, b) -> Sum (Set.insert b a)
(a, Sum b) -> Sum (Set.insert a b)
(a, b) -> Sum $ Set.fromList [a, b]
oneOf :: (Match a) => [a] -> Regexp
oneOf = foldr (<+>) zero
(<*>) :: (Match a, Match b) => a -> b -> Regexp
u <*> v =
case (match u, match v) of
(Zero, _) -> zero
(_, Zero) -> zero
(Prod x, Prod y) -> Prod (x ++ y)
(Prod x, y) -> Prod (x ++ [y])
(x, Prod y) -> Prod (x : y)
(x, y) -> Prod [x, y]
con :: (Match a) => [a] -> Regexp
con = foldr (<*>) one
neg :: (Match a) => a -> Regexp
neg x =
case (match x) of
(Not y) -> y
y -> Not y
star :: (Match a) => a -> Regexp
star x =
case (match x) of
(Zero) -> Zero
(Star y) -> Star y
y -> Star y
-- Returns if the regex matches the empty string.
del :: Regexp -> Bool
del (Zero) = False
del (Sum x) = or . map del $ Set.toList x
del (Prod x) = and . map del $ x
del (Match _) = False
del (Not x) = not $ del x;
del (Star _) = True
-- The derivative of a regular language A with respect to a character
-- c is dA/dc = { s : cs \in A }
diff :: Char -> Regexp -> Regexp
diff _ (Zero) = zero
diff c (Match d) | (c == d) = one
diff c (Match d) = zero
diff c (Sum x) = oneOf $ (map $ diff c) (Set.toList x)
diff c (Prod []) = zero
diff c (Prod (x:xs)) | del x = (diff c x <*> xs) <+> diff c (Prod xs)
diff c (Prod (x:xs)) = diff c x <*> xs
diff c (Not x) = Not (diff c x)
diff c (Star x) = diff c x <*> Star x
flattenSet :: (Ord a) => Set (Set a) -> Set a
flattenSet = Set.fold Set.union Set.empty
(/>>=) :: (Ord a, Ord b) => Set a -> (a -> Set b) -> Set b
x />>= f = flattenSet (Set.map f x)
-- The alphabet of all characters that appear in this regexp
alphabet :: Regexp -> Set Char
alphabet (Zero) = Set.empty
alphabet (Sum x) = flattenSet (Set.map alphabet x)
alphabet (Prod x) = Set.unions $ map alphabet x
alphabet (Not x) = alphabet x
alphabet (Star x) = alphabet x
alphabet (Match c) = Set.singleton c
-- Set of all derivatives of a regular expression (including itself, and higher order derivatives).
derivatives :: Regexp -> [Regexp]
derivatives exp = Set.toList $ enlarge (Set.singleton exp) (Set.singleton exp)
where
alpha = alphabet exp
firstDerivatives x = Set.map (`diff` x) alpha
enlarge :: Set Regexp -> Set Regexp -> Set Regexp
enlarge new found =
if Set.null new
then found
else
let nextNew = (new />>= firstDerivatives) Set.\\ found
nextFound = found `Set.union` nextNew
in enlarge nextNew nextFound
-- A simple finite state machine type
data FSM = State { transitions :: (Map Char FSM), isFinal :: Bool }
-- Converts a Regexp into a finite state machine by using the derivatives
-- with respect to specific characters as the transitions. Essentially at
-- each stage we build up a regular expression that the remaining characters
-- have to match. Due to Cunning Mathematics, only finitely many such regular
-- expressions (up to similarity) result.
compile :: Regexp -> FSM
compile x = fromJust $ Map.lookup x states
where
states :: Map Regexp FSM
states = Map.fromList $
do re <- derivatives x -- Totally gratuitious use of list monad. :)
let trans = do c <- Set.toList $ alphabet re
let d = diff c re
return (c, fromJust $ Map.lookup d states)
let state = State (Map.fromList trans) (del re)
return (re, state)
runFSM :: FSM -> String -> Bool
runFSM x [] = isFinal x
runFSM x (c:cs) = case (Map.lookup c $ transitions x) of
Nothing -> False
Just y -> runFSM y cs
matches :: (Match a) => String -> a -> Bool
matches cs exp = runFSM (compile $ match exp) cs