Haskell Regular Expression Matcher
Not really intended for serious use. Just a proof of concept.
module Regexp where import Data.Set (Set) import Data.Map (Map) import Monad import List import Maybe import qualified Data.Set as Set import qualified Data.Map as Map data Regexp = Zero | Match Char -- matches a single char | Not Regexp -- matches the negation of its argument | Prod [Regexp] -- matches a concatentation of its arguments | Sum (Set Regexp) -- matches either of its arguments | Star Regexp -- matches repetitions of its argument (including 0 repetitions) deriving (Eq, Ord) instance Show Regexp where show Zero = "0" show (Match c) = [c] show (Not x) = '~' : show x show (Prod x) = join . (map show) $ x show (Sum x) = "(" ++ ( join . intersperse "|" . (map show) . Set.toList $ x ) ++ ")" show (Star x) = "(" ++ show x ++ ")*" -- Flagrant abuse of type classes to allow implicit conversion of datatypes into regular -- expressions. class Match a where match :: a -> Regexp instance Match Char where match c = Match c instance (Match a) => Match [a] where match = con instance Match Regexp where match = id -- "smart" versions of the constructors, which perform normalisation of the datatype. -- As long as all regular expressions are built up using these and the match instance -- for char we can guarantee that structural equality of terms == similarity. -- This is important to make sure we only generate a finite number of states. zero :: Regexp zero = Zero one :: Regexp one = Prod [] (<+>) :: (Match a, Match b) => a -> b -> Regexp x <+> y = case (match x, match y) of (Zero, b) -> b (a, Zero) -> a (Sum a, Sum b) -> Sum (Set.union a b) (Sum a, b) -> Sum (Set.insert b a) (a, Sum b) -> Sum (Set.insert a b) (a, b) -> Sum $ Set.fromList [a, b] oneOf :: (Match a) => [a] -> Regexp oneOf = foldr (<+>) zero (<*>) :: (Match a, Match b) => a -> b -> Regexp u <*> v = case (match u, match v) of (Zero, _) -> zero (_, Zero) -> zero (Prod x, Prod y) -> Prod (x ++ y) (Prod x, y) -> Prod (x ++ [y]) (x, Prod y) -> Prod (x : y) (x, y) -> Prod [x, y] con :: (Match a) => [a] -> Regexp con = foldr (<*>) one neg :: (Match a) => a -> Regexp neg x = case (match x) of (Not y) -> y y -> Not y star :: (Match a) => a -> Regexp star x = case (match x) of (Zero) -> Zero (Star y) -> Star y y -> Star y -- Returns if the regex matches the empty string. del :: Regexp -> Bool del (Zero) = False del (Sum x) = or . map del $ Set.toList x del (Prod x) = and . map del $ x del (Match _) = False del (Not x) = not $ del x; del (Star _) = True -- The derivative of a regular language A with respect to a character -- c is dA/dc = { s : cs \in A } diff :: Char -> Regexp -> Regexp diff _ (Zero) = zero diff c (Match d) | (c == d) = one diff c (Match d) = zero diff c (Sum x) = oneOf $ (map $ diff c) (Set.toList x) diff c (Prod []) = zero diff c (Prod (x:xs)) | del x = (diff c x <*> xs) <+> diff c (Prod xs) diff c (Prod (x:xs)) = diff c x <*> xs diff c (Not x) = Not (diff c x) diff c (Star x) = diff c x <*> Star x flattenSet :: (Ord a) => Set (Set a) -> Set a flattenSet = Set.fold Set.union Set.empty (/>>=) :: (Ord a, Ord b) => Set a -> (a -> Set b) -> Set b x />>= f = flattenSet (Set.map f x) -- The alphabet of all characters that appear in this regexp alphabet :: Regexp -> Set Char alphabet (Zero) = Set.empty alphabet (Sum x) = flattenSet (Set.map alphabet x) alphabet (Prod x) = Set.unions $ map alphabet x alphabet (Not x) = alphabet x alphabet (Star x) = alphabet x alphabet (Match c) = Set.singleton c -- Set of all derivatives of a regular expression (including itself, and higher order derivatives). derivatives :: Regexp -> [Regexp] derivatives exp = Set.toList $ enlarge (Set.singleton exp) (Set.singleton exp) where alpha = alphabet exp firstDerivatives x = Set.map (`diff` x) alpha enlarge :: Set Regexp -> Set Regexp -> Set Regexp enlarge new found = if Set.null new then found else let nextNew = (new />>= firstDerivatives) Set.\\ found nextFound = found `Set.union` nextNew in enlarge nextNew nextFound -- A simple finite state machine type data FSM = State { transitions :: (Map Char FSM), isFinal :: Bool } -- Converts a Regexp into a finite state machine by using the derivatives -- with respect to specific characters as the transitions. Essentially at -- each stage we build up a regular expression that the remaining characters -- have to match. Due to Cunning Mathematics, only finitely many such regular -- expressions (up to similarity) result. compile :: Regexp -> FSM compile x = fromJust $ Map.lookup x states where states :: Map Regexp FSM states = Map.fromList $ do re <- derivatives x -- Totally gratuitious use of list monad. :) let trans = do c <- Set.toList $ alphabet re let d = diff c re return (c, fromJust $ Map.lookup d states) let state = State (Map.fromList trans) (del re) return (re, state) runFSM :: FSM -> String -> Bool runFSM x [] = isFinal x runFSM x (c:cs) = case (Map.lookup c $ transitions x) of Nothing -> False Just y -> runFSM y cs matches :: (Match a) => String -> a -> Bool matches cs exp = runFSM (compile $ match exp) cs