Haskell Regular Expression Matcher
Not really intended for serious use. Just a proof of concept.
1 2 module Regexp 3 where 4 5 import Data.Set (Set) 6 import Data.Map (Map) 7 import Monad 8 import List 9 import Maybe 10 import qualified Data.Set as Set 11 import qualified Data.Map as Map 12 13 data Regexp = 14 Zero 15 | Match Char -- matches a single char 16 | Not Regexp -- matches the negation of its argument 17 | Prod [Regexp] -- matches a concatentation of its arguments 18 | Sum (Set Regexp) -- matches either of its arguments 19 | Star Regexp -- matches repetitions of its argument (including 0 repetitions) 20 deriving (Eq, Ord) 21 22 instance Show Regexp where 23 show Zero = "0" 24 show (Match c) = [c] 25 show (Not x) = '~' : show x 26 show (Prod x) = join . (map show) $ x 27 show (Sum x) = "(" ++ ( join . intersperse "|" . (map show) . Set.toList $ x ) ++ ")" 28 show (Star x) = "(" ++ show x ++ ")*" 29 30 -- Flagrant abuse of type classes to allow implicit conversion of datatypes into regular 31 -- expressions. 32 class Match a where 33 match :: a -> Regexp 34 35 instance Match Char where 36 match c = Match c 37 38 instance (Match a) => Match [a] where 39 match = con 40 41 instance Match Regexp where 42 match = id 43 44 -- "smart" versions of the constructors, which perform normalisation of the datatype. 45 -- As long as all regular expressions are built up using these and the match instance 46 -- for char we can guarantee that structural equality of terms == similarity. 47 -- This is important to make sure we only generate a finite number of states. 48 zero :: Regexp 49 zero = Zero 50 51 one :: Regexp 52 one = Prod [] 53 54 (<+>) :: (Match a, Match b) => a -> b -> Regexp 55 x <+> y = 56 case (match x, match y) of 57 (Zero, b) -> b 58 (a, Zero) -> a 59 (Sum a, Sum b) -> Sum (Set.union a b) 60 (Sum a, b) -> Sum (Set.insert b a) 61 (a, Sum b) -> Sum (Set.insert a b) 62 (a, b) -> Sum $ Set.fromList [a, b] 63 64 oneOf :: (Match a) => [a] -> Regexp 65 oneOf = foldr (<+>) zero 66 67 (<*>) :: (Match a, Match b) => a -> b -> Regexp 68 u <*> v = 69 case (match u, match v) of 70 (Zero, _) -> zero 71 (_, Zero) -> zero 72 (Prod x, Prod y) -> Prod (x ++ y) 73 (Prod x, y) -> Prod (x ++ [y]) 74 (x, Prod y) -> Prod (x : y) 75 (x, y) -> Prod [x, y] 76 77 con :: (Match a) => [a] -> Regexp 78 con = foldr (<*>) one 79 80 neg :: (Match a) => a -> Regexp 81 neg x = 82 case (match x) of 83 (Not y) -> y 84 y -> Not y 85 86 star :: (Match a) => a -> Regexp 87 star x = 88 case (match x) of 89 (Zero) -> Zero 90 (Star y) -> Star y 91 y -> Star y 92 93 -- Returns if the regex matches the empty string. 94 del :: Regexp -> Bool 95 del (Zero) = False 96 del (Sum x) = or . map del $ Set.toList x 97 del (Prod x) = and . map del $ x 98 del (Match _) = False 99 del (Not x) = not $ del x; 100 del (Star _) = True 101 102 -- The derivative of a regular language A with respect to a character 103 -- c is dA/dc = { s : cs \in A } 104 diff :: Char -> Regexp -> Regexp 105 diff _ (Zero) = zero 106 diff c (Match d) | (c == d) = one 107 diff c (Match d) = zero 108 diff c (Sum x) = oneOf $ (map $ diff c) (Set.toList x) 109 diff c (Prod []) = zero 110 diff c (Prod (x:xs)) | del x = (diff c x <*> xs) <+> diff c (Prod xs) 111 diff c (Prod (x:xs)) = diff c x <*> xs 112 diff c (Not x) = Not (diff c x) 113 diff c (Star x) = diff c x <*> Star x 114 115 flattenSet :: (Ord a) => Set (Set a) -> Set a 116 flattenSet = Set.fold Set.union Set.empty 117 118 (/>>=) :: (Ord a, Ord b) => Set a -> (a -> Set b) -> Set b 119 x />>= f = flattenSet (Set.map f x) 120 121 -- The alphabet of all characters that appear in this regexp 122 alphabet :: Regexp -> Set Char 123 alphabet (Zero) = Set.empty 124 alphabet (Sum x) = flattenSet (Set.map alphabet x) 125 alphabet (Prod x) = Set.unions $ map alphabet x 126 alphabet (Not x) = alphabet x 127 alphabet (Star x) = alphabet x 128 alphabet (Match c) = Set.singleton c 129 130 -- Set of all derivatives of a regular expression (including itself, and higher order derivatives). 131 derivatives :: Regexp -> [Regexp] 132 derivatives exp = Set.toList $ enlarge (Set.singleton exp) (Set.singleton exp) 133 where 134 alpha = alphabet exp 135 firstDerivatives x = Set.map (`diff` x) alpha 136 enlarge :: Set Regexp -> Set Regexp -> Set Regexp 137 enlarge new found = 138 if Set.null new 139 then found 140 else 141 let nextNew = (new />>= firstDerivatives) Set.\\ found 142 nextFound = found `Set.union` nextNew 143 in enlarge nextNew nextFound 144 145 -- A simple finite state machine type 146 data FSM = State { transitions :: (Map Char FSM), isFinal :: Bool } 147 148 -- Converts a Regexp into a finite state machine by using the derivatives 149 -- with respect to specific characters as the transitions. Essentially at 150 -- each stage we build up a regular expression that the remaining characters 151 -- have to match. Due to Cunning Mathematics, only finitely many such regular 152 -- expressions (up to similarity) result. 153 compile :: Regexp -> FSM 154 compile x = fromJust $ Map.lookup x states 155 where 156 states :: Map Regexp FSM 157 states = Map.fromList $ 158 do re <- derivatives x -- Totally gratuitious use of list monad. :) 159 let trans = do c <- Set.toList $ alphabet re 160 let d = diff c re 161 return (c, fromJust $ Map.lookup d states) 162 let state = State (Map.fromList trans) (del re) 163 return (re, state) 164 165 runFSM :: FSM -> String -> Bool 166 runFSM x [] = isFinal x 167 runFSM x (c:cs) = case (Map.lookup c $ transitions x) of 168 Nothing -> False 169 Just y -> runFSM y cs 170 171 matches :: (Match a) => String -> a -> Bool 172 matches cs exp = runFSM (compile $ match exp) cs 173