From Regular Expressions To NFA by way of Abstract Syntax Trees: Thompsons Construction
Regular expressions and NFA have been an active area of study in theoretical computer science since the late 1950's. As such, there is a large body of work exploring a multitude of ways to evaluate regular expressions for the purpose of pattern matching. In previous posts I have discussed one such algorithm presented by Robert Sedgewick in his book "Algorithms" (4th ed). That algorithm, which while being conceptually simpler to understand and implement than some other approaches is also more difficult to modify or extend. Despite this, it is as good algorithm, and one that is not prone to the catastrophic backtracking issue present in some other popular implementations. As I've said before though, when it comes to data structures and algorithms, variety is the spice of life and so I've decided to explore some other methods of evaluating regular expressions matching.
Non-Deterministic Finite Automata
typedef int State;
class Edge {
public:
virtual Token getLabel() = 0;
virtual bool matches(char c) = 0;
virtual bool isEpsilon() = 0;
virtual ~Edge() { }
};
struct Transition {
State from;
State to;
Edge* edge;
Transition(State s, State t, Edge* m) {
from = s; to = t; edge = m;
}
};
class NFA {
private:
State start;
State accept;
unordered_map<State, vector<Transition>> states;
public:
NFA();
NFA(const NFA& nfa);
void makeState(State name) {
if (states.find(name) == states.end()) {
states.insert(make_pair(name, vector<Transition>()));
}
}
void setStart(State ss) {
start = ss;
}
void setAccept(State as) {
accept = as;
}
State getStart() {
return start;
}
State getAccept() {
return accept;
}
void addTransition(Transition t) {
states[t.from].push_back(t);
}
int size() {
return states.size();
}
unordered_map<State, vector<Transition>>& getStates() {
return states;
}
vector<Transition>& getTransitions(State state) {
return states[state];
}
bool match(string text);
NFA& operator=(const NFA& nfa);
};
class CharEdge : public Edge {
private:
char label;
public:
CharEdge(char c) {
label = c;
}
~CharEdge() {
}
bool isEpsilon() {
return false;
}
bool matches(char c) {
return label == c;
}
char getLabel() {
return label;
}
};
class EpsilonEdge : public Edge {
public:
EpsilonEdge() { }
~EpsilonEdge() { }
bool matches(char c) {
return true;
}
bool isEpsilon() {
return true;
}
char getLabel() {
return '&';
}
};
void testNFA() {
NFA nfa;
nfa.makeState(0); nfa.makeState(1); nfa.makeState(2); nfa.makeState(3);
nfa.setStart(0);
nfa.setAccept(3);
nfa.addTransition(Transition(0, 1, new CharEdge('a'))); //start -> a
nfa.addTransition(Transition(1, 2, new CharEdge('b'))); // a -> b
nfa.addTransition(Transition(2, 2, new CharEdge('b'))); // b -> b
nfa.addTransition(Transition(2, 3, new EpsilonEdge())); // b -> accept
cout<<nfa.match("abb")<<endl;
cout<<nfa.match("aab")<<endl;
cout<<nfa.match("ab")<<endl;
cout<<nfa.match("a")<<endl;
}
The solution to automating the constuction is to convert a Regular expression string to an abstract syntax tree, and then build our NFA automatically from the tree. We'll return to implementing match() once we have our NFA constructed from an AST, so lets get to it.
Building an AST from a Regular Expression
class Parser {
private:
bool isOp(char c);
int precedence(char c);
int leftAssociative(char c);
string addConcatOp(string& str);
string makePostfix(string& str);
RegularExpression* makeTree(string& postfix);
public:
Parser();
RegularExpression* parse();
};
int precedence(char c) {
switch (c) {
case '*': return 50;
case '+': return 50;
case '?': return 50;
case '@': return 30;
case '|': return 20;
default:
break;
}
return 10;
}
bool leftAssociative(char c) {
switch (c) {
case '*':
case '+':
case '?':
case '|':
case '@':
return true;
default:
break;
}
return false;
}
string addConcatOp(string str) {
string fixed;
for (int i = 0; i < str.length(); i++) {
fixed.push_back(str[i]);
if (str[i] == '(' || str[i] == '|')
continue;
if (i+1 < str.length()) {
char p = str[i+1];
if (p == '|' || p == '*' || p == '+' || p == ')')
continue;
fixed.push_back('@');
}
}
return fixed;
}
string makePostfix(string& str) {
str = addConcatOp(str);
cout<<"Inserting explicit concat operators: "<<str<<endl;
string postfix;
for (int i = 0; i < str.length(); i++) {
if (str[i] == '(') {
ops.push(str[i]);
} else if (str[i] == '|' || str[i] == '*' || str[i] == '+' || str[i] == '@' || str[i] == '?') {
if (precedence(str[i]) < precedence(ops.top()) || (precedence(str[i]) == precedence(ops.top()) && leftAssociative(str[i]))) {
char c = ops.pop();
postfix.push_back(c);
ops.push(str[i]);
} else {
ops.push(str[i]);
}
} else if (str[i] == ')') {
while (!ops.empty()) {
char c = ops.pop();
if (c == '(')
break;
else postfix.push_back(c);
}
} else {
postfix.push_back(str[i]);
}
}
while (!ops.empty()) {
char c = ops.pop();
if (c != '(')
postfix.push_back(c);
}
return postfix;
}
class RegularExpression {
public:
virtual char getSymbol() = 0;
virtual RegularExpression* getLeft() = 0;
virtual RegularExpression* getRight() = 0;
};
class ExpressionLiteral : public RegularExpression {
private:
char symbol;
public:
ExpressionLiteral(char sym) { symbol = sym; }
RegularExpression* getLeft() {
return nullptr;
}
RegularExpression* getRight() {
return nullptr;
}
char getSymbol() {
return symbol;
}
};
class ExpressionOperator : public RegularExpression {
private:
RegularExpression* left;
RegularExpression* right;
char symbol;
public:
ExpressionOperator(char c, RegularExpression* ll, RegularExpression* rr) {
symbol = c;
left = ll;
right = rr;
}
char getSymbol() {
return symbol;
}
RegularExpression* getLeft() {
return left;
}
RegularExpression* getRight() {
return right;
}
};
RegularExpression* makeTree(string postfix) {
Stack<RegularExpression*> sf;
for (char c : postfix) {
cout<<"Processing: "<<c<<" ";
if (!isOp(c)) {
cout<<"Alphabet."<<endl;
sf.push(new ExpressionLiteral(c));
} else {
cout<<"Operator."<<endl;
auto right = sf.empty() ? nullptr:sf.pop();
auto left = sf.empty() ? nullptr:sf.pop();
sf.push(new ExpressionOperator(c, left, right));
}
}
return sf.pop();
}
NFA Construction Of Regular Expression
int label = 0;
int makeStateLabel() {
return label++;
}
void initNextNFA(NFA& nnfa) {
int nstart = makeStateLabel();
nnfa.makeState(nstart);
nnfa.setStart(nstart);
int nend = makeStateLabel();
nnfa.makeState(nend);
nnfa.setAccept(nend);
}
It should also be mentioned that when I say we are "wiring NFA's together" I'm not implying that we are some how linking NFA objects together - we're not. Instead, we merge NFA's by copying their transitions into a new NFA and throwing the old one away:
void copyTransitions(NFA& nnfa, NFA onfa) {
for (auto state : onfa.getStates()) {
nnfa.makeState(state.first);
for (auto trans : onfa.getTransitions(state.first)) {
nnfa.addTransition(trans);
}
}
}
NFA singleTransitionNFA(Edge* edge) {
NFA nfa;
initNextNFA(nfa);
nfa.addTransition(Transition(nfa.getStart(), nfa.getAccept(), edge));
return nfa;
}
NFA emptyExpr() {
return singleTransitionNFA(new EpsilonEdge());
}
NFA atomicNFA(char c) {
return singleTransitionNFA(new CharEdge(c));
}
NFA concatNFA(NFA first, NFA second) {
NFA nnfa;
copyTransitions(nnfa, first);
copyTransitions(nnfa, second);
nnfa.setStart(first.getStart());
nnfa.setAccept(second.getAccept());
nnfa.addTransition(Transition(first.getAccept(), second.getStart(), new EpsilonEdge()));
return nnfa;
}
We begin by creating a new starting state. From the new starting state we create two epsilon transitions which connect the new start state, to the start states of our two choices. We then create a new accept state and connect the accept state of our respective automata with the new one with epsilon changes.
NFA alternateNFA(NFA first, NFA second) {
//create new NFA with start and end state.
NFA nnfa;
initNextNFA(nnfa);
//copy states and transitions from both NFA's to new NFA
copyTransitions(nnfa, first);
copyTransitions(nnfa, second);
//Add new E-transitions from new start state to first and second NFAs
nnfa.addTransition(Transition(nnfa.getStart(), first.getStart(), new EpsilonEdge()));
nnfa.addTransition(Transition(nnfa.getStart(), second.getStart(), new EpsilonEdge()));
//Add new E-transitions from first and second accept state to new accept state.
nnfa.addTransition(Transition(first.getAccept(), nnfa.getAccept(), new EpsilonEdge()));
nnfa.addTransition(Transition(second.getAccept(), nnfa.getAccept(), new EpsilonEdge()));
return nnfa;
}
NFA kleeneStar(NFA torepeat) {
NFA nnfa;
initNextNFA(nnfa);
//create inner loop
torepeat.addTransition(Transition(torepeat.getAccept(), nnfa.getStart(), new EpsilonEdge()));
copyTransitions(nnfa, torepeat);
//wire new start & accept state with E-transitions
nnfa.addTransition(Transition(nnfa.getStart(), torepeat.getStart(), new EpsilonEdge()));
nnfa.addTransition(Transition(torepeat.getAccept(), nnfa.getAccept(), new EpsilonEdge()));
//create outter loop
nnfa.addTransition(Transition(nnfa.getStart(), nnfa.getAccept(), new EpsilonEdge()));
return nnfa;
}
NFA kleenePlus(NFA torepeat) {
NFA nnfa;
initNextNFA(nnfa);
//create inner loop
torepeat.addTransition(Transition(torepeat.getAccept(), nnfa.getStart(), new EpsilonEdge()));
copyTransitions(nnfa, torepeat);
//wire new start & accept state with E-transitions
nnfa.addTransition(Transition(nnfa.getStart(), torepeat.getStart(), new EpsilonEdge()));
nnfa.addTransition(Transition(torepeat.getAccept(), nnfa.getAccept(), new EpsilonEdge()));
return nnfa;
}
NFA zeroOrOnce(NFA onfa) {
NFA nnfa;
initNextNFA(nnfa);
copyTransitions(nnfa, onfa);
//wire in match choice
nnfa.addTransition(Transition(nnfa.getStart(), onfa.getStart(), new EpsilonEdge()));
nnfa.addTransition(Transition(onfa.getAccept(), nnfa.getAccept(), new EpsilonEdge()));
//wire in epsilon choice.
nnfa.addTransition(Transition(nnfa.getStart(), nnfa.getAccept(), new EpsilonEdge()));
return nnfa;
}
Now all that remains is to take our AST and output the fully constructed NFA! And with the above procedures, all it takes is a simple post-order traversal of our tree with a stack to hold our intermediate subexpressions.
class NFACompiler {
private:
Stack<NFA> nfaStack;
NFA buildOperatorNFA(RegularExpression* ast) {
switch (ast->getSymbol()) {
case '@': {
NFA b = nfaStack.pop();
NFA a = nfaStack.pop();
return concatNFA(a, b);
}
break;
case '|': {
NFA b = nfaStack.pop();
NFA a = nfaStack.pop();
return alternateNFA(a, b);
}
case '*': {
NFA a = nfaStack.pop();
return kleeneNFA(a, false);
}
case '+': {
NFA a = nfaStack.pop();
return kleeneNFA(a, true);
}
case '?': {
NFA a = nfaStack.pop();
return zeroOrOnce(a);
}
default:
break;
}
return NFA();
}
void gen_nfa(RegularExpression* ast) {
if (ast != nullptr) {
gen_nfa(ast->getLeft());
gen_nfa(ast->getRight());
if (!isOp(ast->getSymbol())) {
nfaStack.push(atomicNFA(ast->getSymbol()));
} else {
nfaStack.push(buildOperatorNFA(ast));
}
}
}
public:
NFACompiler() { }
NFA compile(string pattern) {
Parser parser;
string re = "(" + pattern + ")";
RegularExpression* ast = parser.parse(re);
gen_nfa(ast);
return nfaStack.pop();
}
};
Now, finally, we're ready to simulate the NFA. So lets get back to talking about impelementing match().
Simulating the NFA for pattern matching
struct Node {
int strPos;
State state;
unordered_set<Transition> epsHistory;
Node(int sp, State s, unordered_set<Transition> t) : strPos(sp), state(s), epsHistory(t) { }
};
bool matchBT(string text) {
unordered_set<Transition> epsHistory;
Queue<Node> sf;
sf.push(Node(0, start, epsHistory));
while (!sf.empty()) {
int strPos = sf.top().strPos;
State currState = sf.top().state;
epsHistory = sf.top().epsHistory;
sf.pop();
char input = text[strPos];
if (accept == currState) {
return true;
}
for (Transition t : states[currState]) {
if ((t.edge->matches(input) || t.edge->matches('.')) || t.edge->isEpsilon()) {
if (t.edge->isEpsilon()) {
if (epsHistory.find(t) != epsHistory.end()) {
continue;
}
epsHistory.insert(t);
sf.push(Node(strPos, t.to, epsHistory));
} else {
epsHistory.clear();
sf.push(Node(strPos + 1, t.to, epsHistory));
}
}
cout<<endl;
}
}
return false;
}
To avoid getting lost in loops of Epsilon transitions, we simply cache the epsilon transitions we've already crossed, and if we encounter them again we skip them. We must be careful to clear this cache however whenever we cross a non-Epsilon transition.
Allright, let's see it in action:
#include <iostream>
#include "compiler.hpp"
using std::cout;
using std::endl;
int main(int argc, char* argv[]) {
NFACompiler compiler;
NFA nfa = compiler.compile("(a*b|ac)d");
cout<<"---------------------"<<endl;
cout<<nfa.matchBT("aaaaaabd")<<endl;
return 0;
}
Parsing: ((a*b|ac)d)
Inserting explicit concat operators: ((a*@b|a@c)@d)
Postfix: a*b@ac@|d@
---------------------
Attempting to match: aaaaabd, Start state: 10, Accept state: 13
State: 10, Input: a
10-(&)->6
10-(&)->2
State: 2, Input: a
2-(&)->3
2-(&)->0
State: 0, Input: a
0-(a)->1
State: 1, Input: a
1-(&)->3
1-(&)->2
State: 2, Input: a
2-(&)->3
2-(&)->0
State: 0, Input: a
0-(a)->1
State: 1, Input: a
1-(&)->3
1-(&)->2
State: 2, Input: a
2-(&)->3
2-(&)->0
State: 0, Input: a
0-(a)->1
State: 1, Input: a
1-(&)->3
1-(&)->2
State: 2, Input: a
2-(&)->3
2-(&)->0
State: 0, Input: a
0-(a)->1
State: 1, Input: a
1-(&)->3
1-(&)->2
State: 2, Input: a
2-(&)->3
2-(&)->0
State: 0, Input: a
0-(a)->1
State: 1, Input: b
1-(&)->3
1-(&)->2
State: 2, Input: b
2-(&)->3
2-(&)->0
State: 0, Input: b
Dead end.
State: 3, Input: b
3-(&)->4
State: 4, Input: b
4-(b)->5
State: 5, Input: d
5-(&)->11
State: 11, Input: d
11-(&)->12
State: 12, Input: d
12-(d)->13
State: 13, Input:
Found Accept State.
1
As you can see, it works as advertised, and if we were so inclined, we could call it quits here. But if you're anything like me then that whole "catastrophic backtracking" thing is probably sitting in the corner of your mind waiting to rain on your parade. In that case, allow me to tell you about a slightly more complicated way of simulating the NFA - one that doesnt require backtracking.
A Better Way
As I previously mentioned, a straight depth first search isnt the only way to simulate the NFA - it's just the conceptually simplest. A faster method is the one originally described by Thompson and used in awk and grep. That algorithm, called "Thompsons Algorithm" (noticing a pattern here?) uses power set construction to simulate the NFA, And if you squint just right, you might just notice what looks like a depth first search hidden in there....
unordered_set<State> move(unordered_set<State> clist, char ch) {
unordered_set<State> nlist;
for (State s : clist) {
for (Transition t : states[s]) {
if (t.edge->matches(ch) || t.edge->matches('.')) {
if (t.edge->isEpsilon() == false)
nlist.insert(t.to);
}
}
}
return nlist;
}
unordered_set<State> e_closure(unordered_set<State> clist) {
unordered_set<State> nlist = clist;
Stack<State> sf;
for (State s : clist)
sf.push(s);
while (!sf.empty()) {
State s = sf.pop();
for (Transition t : states[s]) {
if (t.edge->isEpsilon()) {
if (nlist.find(t.to) == nlist.end()) {
nlist.insert(t.to);
sf.push(t.to);
}
}
}
}
return nlist;
}
bool match(string text) {
set<State> curr, next;
next.insert(start);
curr = e_closure(next);
for (int i = 0; i < text.length(); i++) {
next = move(curr, text[i]);
curr = e_closure(next);
}
return curr.find(accept) != curr.end();
}
So how does thompsons algorithm work? It begins by initializing the state list with all of the states reachable from the starting state by traversing only epsilon transitions - thats what the e_closure() method does. Next, for each character in the string we are searching, we find the transitions on that character which are present in our current state list - this is the move() procedure. Next, we gather all of states reachable via epsilon transitions from those states. This repeats for every character in the string until we have consumed all of the input. If, when there is no more input to process, the accept state is in our state list, then the pattern matches.
Instead of pursuing one path to its terminus and then backtracking on failure, this algorithm has the effect of running all of the paths in parallel, doing the search step wise from each state so it never needs to back track, as at each step it can determine the correct transition to take (or fail early if there is none). As far as algorithms go, I find this one to be particularly beautiful.
#include <iostream>
#include "compiler.hpp"
using std::cout;
using std::endl;
int main(int argc, char* argv[]) {
NFACompiler compiler;
NFA nfa = compiler.compile("(a*b|ac)d");
cout<<"---------------------"<<endl;
if (nfa.match("aaaaaabd")) cout<<"Matched."<<endl;
return 0;
}
Parsing: ((a*b|ac)d)
Inserting explicit concat operators: ((a*@b|a@c)@d)
Postfix: a*b@ac@|d@
Processing: a Alphabet.
Processing: * Operator.
Processing: b Alphabet.
Processing: @ Operator.
Processing: a Alphabet.
Processing: c Alphabet.
Processing: @ Operator.
Processing: | Operator.
Processing: d Alphabet.
Processing: @ Operator.
*
a
@
b
|
a
@
c
@
d
---------------------
Init:
10 - (&) ->2
10 - (&) ->6
2 - (&) ->0
2 - (&) ->3
3 - (&) ->4
a:
0 - (a) ->1
6 - (a) ->7
1 - (&) ->2
1 - (&) ->3
3 - (&) ->4
2 - (&) ->0
7 - (&) ->8
a:
0 - (a) ->1
1 - (&) ->2
1 - (&) ->3
3 - (&) ->4
2 - (&) ->0
a:
0 - (a) ->1
1 - (&) ->2
1 - (&) ->3
3 - (&) ->4
2 - (&) ->0
a:
0 - (a) ->1
1 - (&) ->2
1 - (&) ->3
3 - (&) ->4
2 - (&) ->0
a:
0 - (a) ->1
1 - (&) ->2
1 - (&) ->3
3 - (&) ->4
2 - (&) ->0
b:
4 - (b) ->5
5 - (&) ->11
11 - (&) ->12
d:
12 - (d) ->13
Matched.
Oh, and the set of edges thompsons algorithm collects while simulating the NFA? It just so happens to be the same set of edges you'd need to convert your NDFA to a DFA, but that is a subject for another day.
Well, thats what I got. Until next time, Happy Hacking!
Further Reading:
1) Aho, Sethi, Ullman "Compilers: Principles, Techniques, and Tools" Addison-Wesley, 1986
2) Drozdek, Adam "Data Structures And Algorithms In C++", (3rd ed.) Thomson, 2005
3) Sedgewick, R. Wayne, K "Algorithms" (4th ed.), Addison-Wesley, 2011
4) Russ Cox, "Regular Expression Matching Can be Simple and Fast"
Leave A Comment