Minimal regex engine

A few months back, I posted a state machine for review. Thanks to the feedback, I was able to greatly simplify the implementation. I am posting the revised version, together with the Regex class which calls it.

I think I'm mainly looking for feedback on structure. The relationship between my Node class and StateMachine class feels a little tangled; I'm not always sure which method ought to belong to which class. I think the way I communicate the next token of my lexer is also cumbersome.

state_machine.py

class Node:

    def __init__(self, value):

        self.value = value

        self.children = set()



    def empty(self):

        return self.value == ''



    def add_child(self, other):

        self.children.add(other)



    def find_parent_of_terminal(self, terminal):

        """

        We assume that there shall only be one node leading to the terminal

        and that there is only one terminal

        """

        visited = set()

        to_explore = {self}

        while to_explore:

            current = to_explore.pop()

            visited.add(current)

            if terminal in current.children:

                # If this fails, then there is a bug in union, concat, or kleene

                assert len(current.children) == 1

                return current

            to_explore.update({node for node in current.children 

                               if node not in visited})

        return None



    def leads_to(self, value):

        """

        Return True iff argument can be reached by traversing empty nodes

        """

        return bool(self._get_node_if_reachable(value))



    def _get_node_if_reachable(self, value):

        for node in self.children:

            while node and node.empty():

                if node == value:

                    return node

                node = node._get_node_if_reachable(value)

        return None



    def __repr__(self):

        result = '{} : ['.format(self.value)

        for node in self.children:

            result += str(node.value) + ', '

        result += ']n'



        return result





def EmptyNode():

    return Node('')





class StateMachine:



    def __init__(self):

        self.initial = EmptyNode()

        self.terminal = EmptyNode()



    def __repr__(self):

        return str(self.initial)



    @staticmethod

    def from_string(source):

        dfa = StateMachine()

        nodes = [Node(char) for char in source]



        dfa.initial.add_child(nodes[0])

        for i in range(len(source) - 1):

            nodes[i].add_child(nodes[i + 1])

        nodes[-1].add_child(dfa.terminal)

        return dfa



    @staticmethod

    def from_set(chars):

        dfa = StateMachine()

        first = EmptyNode()

        penultimate = EmptyNode()

        dfa.initial.children = {first}



        for char in chars:

            char_node = Node(char)

            first.add_child(char_node)

            char_node.add_child(penultimate)



        penultimate.add_child(dfa.terminal)

        return dfa



    def concat(self, other):

        other.initial.find_parent_of_terminal(other.terminal).children = {self.terminal}

        self.initial.find_parent_of_terminal(self.terminal).children = other.initial.children



        return self



    def union(self, other):

        self.initial.children.update(other.initial.children)



        this_last = self.initial.find_parent_of_terminal(self.terminal)

        other_last = other.initial.find_parent_of_terminal(other.terminal)

        empty_node = EmptyNode()

        empty_node.add_child(self.terminal)

        this_last.children = {empty_node}

        other_last.children = {empty_node}

        return self



    def kleene(self):

        penultimate_node = self.initial.find_parent_of_terminal(self.terminal)

        dummy = EmptyNode()

        penultimate_node.children = {dummy}

        dummy.add_child(self.terminal)

        penultimate_node.add_child(self.initial)

        self.initial.add_child(dummy)

        return self



    def _get_next_state(self, nodes, value, visited=None):

        if visited is None:

            visited = set()

        result = set()

        for node in nodes:

            visited.add(node)

            for child in node.children:

                if child.empty() and child not in visited:

                    result.update(self._get_next_state([child], value, visited))

                elif child.value == value:

                    result.add(child)

        return result



    def is_match(self, nodes):

        for node in nodes:

            if node .leads_to(self.terminal):

                return True

        return False



    def match(self, source):

        """

        Match a target string by simulating a NFA

        :param source: string to match

        :return: Matched part of string, or None if no match is found

        """

        result = ''

        last_match = None

        current = {self.initial}

        for char in source:

            next_nodes = self._get_next_state(current, char)

            if next_nodes:

                current = next_nodes

                result += char



                if self.is_match(current):

                    last_match = result

            else:

                break



        if self.is_match(current):

            last_match = result

        return last_match

regex.py

import collections

import enum

import string





from state_machine import StateMachine





class Token(enum.Enum):

    METACHAR = 0

    CHAR = 1

    ERROR = 2





class LexResult(collections.namedtuple('LexResult', ['token', 'lexeme'])):

    def __bool__(self):

        return self.token != Token.ERROR





class RegexLexer(object):

    metachars = '-|^().*'



    def __init__(self, pattern: str):

        self._pattern = pattern

        self._pos = 0

        self._stack = 



    def peek(self) -> LexResult:

        if self._pos >= len(self._pattern):

            return LexResult(Token.ERROR, '')

        next_char = self._pattern[self._pos]

        if next_char in self.metachars:

            token = Token.METACHAR

        else:

            token = Token.CHAR

        return LexResult(token, next_char)



    def _eat_token_type(self, token: Token) -> LexResult:

        next_match = self.peek()

        if next_match.token != token:

            return LexResult(Token.ERROR, next_match.lexeme)

        self._pos += 1

        return next_match



    def _eat_token(self, match: LexResult) -> LexResult:

        next_match = self.peek()

        if next_match == match:

            self._pos += 1

            return next_match

        return LexResult(Token.ERROR, next_match.lexeme)



    def mark(self):

        self._stack.append(self._pos)



    def clear(self):

        self._stack.pop()



    def backtrack(self):

        self._pos = self._stack.pop()



    def eat_char(self, char=''):

        if char:

            return self._eat_token(LexResult(Token.CHAR, char))

        return self._eat_token_type(Token.CHAR)



    def eat_metachar(self, metachar):

        return self._eat_token(LexResult(Token.METACHAR, metachar))





class Regex(object):

    CHARACTERS = string.printable



    def __init__(self, pattern: str):

        """

        Initialize regex by compiling provided pattern

        """

        self._lexer = RegexLexer(pattern)

        self._state_machine = self.parse()



    def match(self, text: str) -> str:

        """

        Match text according to provided pattern.

        Returns matched substring if a match was found,

        or None otherwise

        """

        assert self._state_machine

        return self._state_machine.match(text)



    def parse(self):

        nfa = self.parse_simple_re()

        if not nfa:

            return None

        while True:

            self._lexer.mark()

            if not self._lexer.eat_metachar('|'):

                self._lexer.backtrack()

                return nfa

            next_nfa = self.parse_simple_re()

            if not next_nfa:

                self._lexer.backtrack()

                return nfa

            nfa = nfa.union(next_nfa)

            self._lexer.clear()



    def parse_simple_re(self):

        """

        <simple-re> = <basic-re>+

        """

        nfa = self.parse_basic_re()

        if not nfa:

            return None



        while True:

            next_nfa = self.parse_basic_re()

            if not next_nfa:

                break

            nfa = nfa.concat(next_nfa)

        return nfa



    def parse_basic_re(self):

        """

        <elementary-re> "*" | <elementary-re> "+" | <elementary-re>

        """

        nfa = self.parse_elementary_re()

        if not nfa:

            return None

        next_match = self._lexer.peek()

        if not next_match or next_match.token != Token.METACHAR:

            return nfa

        if next_match.lexeme == '*':

            self._lexer.eat_metachar('*')

            return nfa.kleene()

        if next_match.lexeme == '+':

            self._lexer.eat_metachar('+')

            return nfa.union(nfa.kleene())

        return nfa



    def parse_elementary_re(self):

        """

        <elementary-RE> = <group> | <any> | <char> | <set>

        :return: DFA

        """

        self._lexer.mark()

        nfa = self.parse_group()

        if nfa:

            self._lexer.clear()

            return nfa



        self._lexer.backtrack()

        if self._lexer.eat_metachar('.'):

            return StateMachine.from_set({x for x in self.CHARACTERS})



        char = self._lexer.eat_char()

        if char:

            return StateMachine.from_string(char.lexeme)



        set_chars = self.parse_set()

        if not set_chars:

            return None

        return StateMachine.from_set(set_chars)



    def parse_group(self):

        """

        <group> = "(" <RE> ")"

        :return: DFA

        """

        if not self._lexer.eat_metachar('('):

            return None

        state_machine = self.parse()

        if not state_machine:

            return None

        if not self._lexer.eat_metachar(')'):

            return None

        return state_machine



    def parse_range(self) -> {str}:

        """

        <range> = <CHAR> "-" <CHAR>

        """

        first = self._lexer.eat_char()

        if not first:

            return set()



        if not self._lexer.eat_metachar('-'):

            return set()



        last = self._lexer.eat_char()

        if not last:

            return set()



        return {chr(x) for x in range(ord(first.lexeme), ord(last.lexeme) + 1)}



    def parse_set_item(self) -> {str}:

        """

        <set item> = <range> | <char>

        """

        self._lexer.mark()

        set_item = self.parse_range()

        if set_item:

            self._lexer.clear()

            return set_item

        self._lexer.backtrack()

        next_item = self._lexer.eat_char()

        return {next_item.lexeme} if next_item else set()



    def parse_set_items(self) -> {str}:

        """

        <set items> = <set item>+

        """

        items = self.parse_set_item()

        if not items:

            return set()

        next_items = self.parse_set_item()

        while next_items:

            items.update(next_items)

            next_items = self.parse_set_item()

        return items



    def parse_positive_set(self) -> {str}:

        if not self._lexer.eat_metachar('['):

            return set()

        set_items = self.parse_set_items()

        if not set_items:

            return set()

        if not self._lexer.eat_metachar(']'):

            return set()

        return set_items



    def parse_negative_set(self) -> {str}:

        if not self._lexer.eat_metachar('['):

            return set()

        if not self._lexer.eat_metachar('^'):

            return set()

        set_items = self.parse_set_items()

        if not set_items:

            return set()

        if not self._lexer.eat_metachar(']'):

            return set()

        return set(string.printable).difference(set_items)



    def parse_set(self) -> {str}:

        """

        Parse something like [a-z9] and return the set of allowed characters

        """

        self._lexer.mark()

        set_items = self.parse_positive_set()

        if set_items:

            self._lexer.clear()

            return set_items

        self._lexer.backtrack()

        return self.parse_negative_set()

Finally, a small set of unit tests to show usage:

import unittest

from state_machine import StateMachine

from regex import Regex





class TestStateMachine(unittest.TestCase):



    def test_union(self):

        state_machine = StateMachine.from_string('abc')

        state_machine = state_machine.union(StateMachine.from_string('def'))



        self.assertEqual(state_machine.match('abc'), 'abc')

        self.assertEqual(state_machine.match('def'), 'def')

        self.assertIsNone(state_machine.match('de'))



    def test_kleene(self):

        state_machine = StateMachine.from_string('abc')

        state_machine = state_machine.kleene()



        self.assertEqual(state_machine.match(''), '')

        self.assertEqual(state_machine.match('abc'), 'abc')

        self.assertEqual(state_machine.match('abcabc'), 'abcabc')

        self.assertEqual(state_machine.match('abcDabc'), 'abc')



    def test_concat(self):

        state_machine = StateMachine.from_string('ab')

        state_machine = state_machine.concat(StateMachine.from_string('cd'))



        self.assertEqual(state_machine.match('abcd'), 'abcd')

        self.assertEqual(state_machine.match('abcde'), 'abcd')

        self.assertIsNone(state_machine.match('abc'))





class TestRegex(unittest.TestCase):



    def test_identifier_regex(self):

        regex = Regex('[a-zA-Z_][a-zA-Z0-9_]*')

        self.assertEqual(regex.match('a'), 'a')

        self.assertFalse(regex.match('0'))

        self.assertTrue(regex.match('a0'))

        self.assertEqual(regex.match('a0_3bd'), 'a0_3bd')

        self.assertEqual(regex.match('abd-43'), 'abd')



    def test_parentheses(self):

        regex = Regex('d(ab)*')

        self.assertEqual(regex.match('d'), 'd')

        self.assertEqual(regex.match('dab'), 'dab')

        self.assertEqual(regex.match('daa'), 'd')

        self.assertEqual(regex.match('dabab'), 'dabab')



    def test_union(self):

        regex = Regex('(ab*d)|(AG)')

        self.assertEqual(regex.match('adG'), 'ad')

        self.assertEqual(regex.match('AGfe'), 'AG')





if __name__ == '__main__':

    unittest.main()

edited Oct 26 '18 at 20:15

asked Oct 26 '18 at 15:16

User319

741516

$begingroup$
Hello, this looks like a very interesting question and a well written piece of code! By any chance, would you have any small example that could be used to test your code ?
$endgroup$
– Josay
Oct 26 '18 at 16:14

$begingroup$
@Josay See edit
$endgroup$
– User319
Oct 26 '18 at 20:16

add a comment |

state_machine.py

class Node:

    def __init__(self, value):

        self.value = value

        self.children = set()



    def empty(self):

        return self.value == ''



    def add_child(self, other):

        self.children.add(other)



    def find_parent_of_terminal(self, terminal):

        """

        We assume that there shall only be one node leading to the terminal

        and that there is only one terminal

        """

        visited = set()

        to_explore = {self}

        while to_explore:

            current = to_explore.pop()

            visited.add(current)

            if terminal in current.children:

                # If this fails, then there is a bug in union, concat, or kleene

                assert len(current.children) == 1

                return current

            to_explore.update({node for node in current.children 

                               if node not in visited})

        return None



    def leads_to(self, value):

        """

        Return True iff argument can be reached by traversing empty nodes

        """

        return bool(self._get_node_if_reachable(value))



    def _get_node_if_reachable(self, value):

        for node in self.children:

            while node and node.empty():

                if node == value:

                    return node

                node = node._get_node_if_reachable(value)

        return None



    def __repr__(self):

        result = '{} : ['.format(self.value)

        for node in self.children:

            result += str(node.value) + ', '

        result += ']n'



        return result





def EmptyNode():

    return Node('')





class StateMachine:



    def __init__(self):

        self.initial = EmptyNode()

        self.terminal = EmptyNode()



    def __repr__(self):

        return str(self.initial)



    @staticmethod

    def from_string(source):

        dfa = StateMachine()

        nodes = [Node(char) for char in source]



        dfa.initial.add_child(nodes[0])

        for i in range(len(source) - 1):

            nodes[i].add_child(nodes[i + 1])

        nodes[-1].add_child(dfa.terminal)

        return dfa



    @staticmethod

    def from_set(chars):

        dfa = StateMachine()

        first = EmptyNode()

        penultimate = EmptyNode()

        dfa.initial.children = {first}



        for char in chars:

            char_node = Node(char)

            first.add_child(char_node)

            char_node.add_child(penultimate)



        penultimate.add_child(dfa.terminal)

        return dfa



    def concat(self, other):

        other.initial.find_parent_of_terminal(other.terminal).children = {self.terminal}

        self.initial.find_parent_of_terminal(self.terminal).children = other.initial.children



        return self



    def union(self, other):

        self.initial.children.update(other.initial.children)



        this_last = self.initial.find_parent_of_terminal(self.terminal)

        other_last = other.initial.find_parent_of_terminal(other.terminal)

        empty_node = EmptyNode()

        empty_node.add_child(self.terminal)

        this_last.children = {empty_node}

        other_last.children = {empty_node}

        return self



    def kleene(self):

        penultimate_node = self.initial.find_parent_of_terminal(self.terminal)

        dummy = EmptyNode()

        penultimate_node.children = {dummy}

        dummy.add_child(self.terminal)

        penultimate_node.add_child(self.initial)

        self.initial.add_child(dummy)

        return self



    def _get_next_state(self, nodes, value, visited=None):

        if visited is None:

            visited = set()

        result = set()

        for node in nodes:

            visited.add(node)

            for child in node.children:

                if child.empty() and child not in visited:

                    result.update(self._get_next_state([child], value, visited))

                elif child.value == value:

                    result.add(child)

        return result



    def is_match(self, nodes):

        for node in nodes:

            if node .leads_to(self.terminal):

                return True

        return False



    def match(self, source):

        """

        Match a target string by simulating a NFA

        :param source: string to match

        :return: Matched part of string, or None if no match is found

        """

        result = ''

        last_match = None

        current = {self.initial}

        for char in source:

            next_nodes = self._get_next_state(current, char)

            if next_nodes:

                current = next_nodes

                result += char



                if self.is_match(current):

                    last_match = result

            else:

                break



        if self.is_match(current):

            last_match = result

        return last_match

regex.py

import collections

import enum

import string





from state_machine import StateMachine





class Token(enum.Enum):

    METACHAR = 0

    CHAR = 1

    ERROR = 2





class LexResult(collections.namedtuple('LexResult', ['token', 'lexeme'])):

    def __bool__(self):

        return self.token != Token.ERROR





class RegexLexer(object):

    metachars = '-|^().*'



    def __init__(self, pattern: str):

        self._pattern = pattern

        self._pos = 0

        self._stack = 



    def peek(self) -> LexResult:

        if self._pos >= len(self._pattern):

            return LexResult(Token.ERROR, '')

        next_char = self._pattern[self._pos]

        if next_char in self.metachars:

            token = Token.METACHAR

        else:

            token = Token.CHAR

        return LexResult(token, next_char)



    def _eat_token_type(self, token: Token) -> LexResult:

        next_match = self.peek()

        if next_match.token != token:

            return LexResult(Token.ERROR, next_match.lexeme)

        self._pos += 1

        return next_match



    def _eat_token(self, match: LexResult) -> LexResult:

        next_match = self.peek()

        if next_match == match:

            self._pos += 1

            return next_match

        return LexResult(Token.ERROR, next_match.lexeme)



    def mark(self):

        self._stack.append(self._pos)



    def clear(self):

        self._stack.pop()



    def backtrack(self):

        self._pos = self._stack.pop()



    def eat_char(self, char=''):

        if char:

            return self._eat_token(LexResult(Token.CHAR, char))

        return self._eat_token_type(Token.CHAR)



    def eat_metachar(self, metachar):

        return self._eat_token(LexResult(Token.METACHAR, metachar))





class Regex(object):

    CHARACTERS = string.printable



    def __init__(self, pattern: str):

        """

        Initialize regex by compiling provided pattern

        """

        self._lexer = RegexLexer(pattern)

        self._state_machine = self.parse()



    def match(self, text: str) -> str:

        """

        Match text according to provided pattern.

        Returns matched substring if a match was found,

        or None otherwise

        """

        assert self._state_machine

        return self._state_machine.match(text)



    def parse(self):

        nfa = self.parse_simple_re()

        if not nfa:

            return None

        while True:

            self._lexer.mark()

            if not self._lexer.eat_metachar('|'):

                self._lexer.backtrack()

                return nfa

            next_nfa = self.parse_simple_re()

            if not next_nfa:

                self._lexer.backtrack()

                return nfa

            nfa = nfa.union(next_nfa)

            self._lexer.clear()



    def parse_simple_re(self):

        """

        <simple-re> = <basic-re>+

        """

        nfa = self.parse_basic_re()

        if not nfa:

            return None



        while True:

            next_nfa = self.parse_basic_re()

            if not next_nfa:

                break

            nfa = nfa.concat(next_nfa)

        return nfa



    def parse_basic_re(self):

        """

        <elementary-re> "*" | <elementary-re> "+" | <elementary-re>

        """

        nfa = self.parse_elementary_re()

        if not nfa:

            return None

        next_match = self._lexer.peek()

        if not next_match or next_match.token != Token.METACHAR:

            return nfa

        if next_match.lexeme == '*':

            self._lexer.eat_metachar('*')

            return nfa.kleene()

        if next_match.lexeme == '+':

            self._lexer.eat_metachar('+')

            return nfa.union(nfa.kleene())

        return nfa



    def parse_elementary_re(self):

        """

        <elementary-RE> = <group> | <any> | <char> | <set>

        :return: DFA

        """

        self._lexer.mark()

        nfa = self.parse_group()

        if nfa:

            self._lexer.clear()

            return nfa



        self._lexer.backtrack()

        if self._lexer.eat_metachar('.'):

            return StateMachine.from_set({x for x in self.CHARACTERS})



        char = self._lexer.eat_char()

        if char:

            return StateMachine.from_string(char.lexeme)



        set_chars = self.parse_set()

        if not set_chars:

            return None

        return StateMachine.from_set(set_chars)



    def parse_group(self):

        """

        <group> = "(" <RE> ")"

        :return: DFA

        """

        if not self._lexer.eat_metachar('('):

            return None

        state_machine = self.parse()

        if not state_machine:

            return None

        if not self._lexer.eat_metachar(')'):

            return None

        return state_machine



    def parse_range(self) -> {str}:

        """

        <range> = <CHAR> "-" <CHAR>

        """

        first = self._lexer.eat_char()

        if not first:

            return set()



        if not self._lexer.eat_metachar('-'):

            return set()



        last = self._lexer.eat_char()

        if not last:

            return set()



        return {chr(x) for x in range(ord(first.lexeme), ord(last.lexeme) + 1)}



    def parse_set_item(self) -> {str}:

        """

        <set item> = <range> | <char>

        """

        self._lexer.mark()

        set_item = self.parse_range()

        if set_item:

            self._lexer.clear()

            return set_item

        self._lexer.backtrack()

        next_item = self._lexer.eat_char()

        return {next_item.lexeme} if next_item else set()



    def parse_set_items(self) -> {str}:

        """

        <set items> = <set item>+

        """

        items = self.parse_set_item()

        if not items:

            return set()

        next_items = self.parse_set_item()

        while next_items:

            items.update(next_items)

            next_items = self.parse_set_item()

        return items



    def parse_positive_set(self) -> {str}:

        if not self._lexer.eat_metachar('['):

            return set()

        set_items = self.parse_set_items()

        if not set_items:

            return set()

        if not self._lexer.eat_metachar(']'):

            return set()

        return set_items



    def parse_negative_set(self) -> {str}:

        if not self._lexer.eat_metachar('['):

            return set()

        if not self._lexer.eat_metachar('^'):

            return set()

        set_items = self.parse_set_items()

        if not set_items:

            return set()

        if not self._lexer.eat_metachar(']'):

            return set()

        return set(string.printable).difference(set_items)



    def parse_set(self) -> {str}:

        """

        Parse something like [a-z9] and return the set of allowed characters

        """

        self._lexer.mark()

        set_items = self.parse_positive_set()

        if set_items:

            self._lexer.clear()

            return set_items

        self._lexer.backtrack()

        return self.parse_negative_set()

Finally, a small set of unit tests to show usage:

import unittest

from state_machine import StateMachine

from regex import Regex





class TestStateMachine(unittest.TestCase):



    def test_union(self):

        state_machine = StateMachine.from_string('abc')

        state_machine = state_machine.union(StateMachine.from_string('def'))



        self.assertEqual(state_machine.match('abc'), 'abc')

        self.assertEqual(state_machine.match('def'), 'def')

        self.assertIsNone(state_machine.match('de'))



    def test_kleene(self):

        state_machine = StateMachine.from_string('abc')

        state_machine = state_machine.kleene()



        self.assertEqual(state_machine.match(''), '')

        self.assertEqual(state_machine.match('abc'), 'abc')

        self.assertEqual(state_machine.match('abcabc'), 'abcabc')

        self.assertEqual(state_machine.match('abcDabc'), 'abc')



    def test_concat(self):

        state_machine = StateMachine.from_string('ab')

        state_machine = state_machine.concat(StateMachine.from_string('cd'))



        self.assertEqual(state_machine.match('abcd'), 'abcd')

        self.assertEqual(state_machine.match('abcde'), 'abcd')

        self.assertIsNone(state_machine.match('abc'))





class TestRegex(unittest.TestCase):



    def test_identifier_regex(self):

        regex = Regex('[a-zA-Z_][a-zA-Z0-9_]*')

        self.assertEqual(regex.match('a'), 'a')

        self.assertFalse(regex.match('0'))

        self.assertTrue(regex.match('a0'))

        self.assertEqual(regex.match('a0_3bd'), 'a0_3bd')

        self.assertEqual(regex.match('abd-43'), 'abd')



    def test_parentheses(self):

        regex = Regex('d(ab)*')

        self.assertEqual(regex.match('d'), 'd')

        self.assertEqual(regex.match('dab'), 'dab')

        self.assertEqual(regex.match('daa'), 'd')

        self.assertEqual(regex.match('dabab'), 'dabab')



    def test_union(self):

        regex = Regex('(ab*d)|(AG)')

        self.assertEqual(regex.match('adG'), 'ad')

        self.assertEqual(regex.match('AGfe'), 'AG')





if __name__ == '__main__':

    unittest.main()

edited Oct 26 '18 at 20:15

asked Oct 26 '18 at 15:16

User319

741516

$begingroup$
Hello, this looks like a very interesting question and a well written piece of code! By any chance, would you have any small example that could be used to test your code ?
$endgroup$
– Josay
Oct 26 '18 at 16:14

$begingroup$
@Josay See edit
$endgroup$
– User319
Oct 26 '18 at 20:16

add a comment |

state_machine.py

class Node:

    def __init__(self, value):

        self.value = value

        self.children = set()



    def empty(self):

        return self.value == ''



    def add_child(self, other):

        self.children.add(other)



    def find_parent_of_terminal(self, terminal):

        """

        We assume that there shall only be one node leading to the terminal

        and that there is only one terminal

        """

        visited = set()

        to_explore = {self}

        while to_explore:

            current = to_explore.pop()

            visited.add(current)

            if terminal in current.children:

                # If this fails, then there is a bug in union, concat, or kleene

                assert len(current.children) == 1

                return current

            to_explore.update({node for node in current.children 

                               if node not in visited})

        return None



    def leads_to(self, value):

        """

        Return True iff argument can be reached by traversing empty nodes

        """

        return bool(self._get_node_if_reachable(value))



    def _get_node_if_reachable(self, value):

        for node in self.children:

            while node and node.empty():

                if node == value:

                    return node

                node = node._get_node_if_reachable(value)

        return None



    def __repr__(self):

        result = '{} : ['.format(self.value)

        for node in self.children:

            result += str(node.value) + ', '

        result += ']n'



        return result





def EmptyNode():

    return Node('')





class StateMachine:



    def __init__(self):

        self.initial = EmptyNode()

        self.terminal = EmptyNode()



    def __repr__(self):

        return str(self.initial)



    @staticmethod

    def from_string(source):

        dfa = StateMachine()

        nodes = [Node(char) for char in source]



        dfa.initial.add_child(nodes[0])

        for i in range(len(source) - 1):

            nodes[i].add_child(nodes[i + 1])

        nodes[-1].add_child(dfa.terminal)

        return dfa



    @staticmethod

    def from_set(chars):

        dfa = StateMachine()

        first = EmptyNode()

        penultimate = EmptyNode()

        dfa.initial.children = {first}



        for char in chars:

            char_node = Node(char)

            first.add_child(char_node)

            char_node.add_child(penultimate)



        penultimate.add_child(dfa.terminal)

        return dfa



    def concat(self, other):

        other.initial.find_parent_of_terminal(other.terminal).children = {self.terminal}

        self.initial.find_parent_of_terminal(self.terminal).children = other.initial.children



        return self



    def union(self, other):

        self.initial.children.update(other.initial.children)



        this_last = self.initial.find_parent_of_terminal(self.terminal)

        other_last = other.initial.find_parent_of_terminal(other.terminal)

        empty_node = EmptyNode()

        empty_node.add_child(self.terminal)

        this_last.children = {empty_node}

        other_last.children = {empty_node}

        return self



    def kleene(self):

        penultimate_node = self.initial.find_parent_of_terminal(self.terminal)

        dummy = EmptyNode()

        penultimate_node.children = {dummy}

        dummy.add_child(self.terminal)

        penultimate_node.add_child(self.initial)

        self.initial.add_child(dummy)

        return self



    def _get_next_state(self, nodes, value, visited=None):

        if visited is None:

            visited = set()

        result = set()

        for node in nodes:

            visited.add(node)

            for child in node.children:

                if child.empty() and child not in visited:

                    result.update(self._get_next_state([child], value, visited))

                elif child.value == value:

                    result.add(child)

        return result



    def is_match(self, nodes):

        for node in nodes:

            if node .leads_to(self.terminal):

                return True

        return False



    def match(self, source):

        """

        Match a target string by simulating a NFA

        :param source: string to match

        :return: Matched part of string, or None if no match is found

        """

        result = ''

        last_match = None

        current = {self.initial}

        for char in source:

            next_nodes = self._get_next_state(current, char)

            if next_nodes:

                current = next_nodes

                result += char



                if self.is_match(current):

                    last_match = result

            else:

                break



        if self.is_match(current):

            last_match = result

        return last_match

regex.py

import collections

import enum

import string





from state_machine import StateMachine





class Token(enum.Enum):

    METACHAR = 0

    CHAR = 1

    ERROR = 2





class LexResult(collections.namedtuple('LexResult', ['token', 'lexeme'])):

    def __bool__(self):

        return self.token != Token.ERROR





class RegexLexer(object):

    metachars = '-|^().*'



    def __init__(self, pattern: str):

        self._pattern = pattern

        self._pos = 0

        self._stack = 



    def peek(self) -> LexResult:

        if self._pos >= len(self._pattern):

            return LexResult(Token.ERROR, '')

        next_char = self._pattern[self._pos]

        if next_char in self.metachars:

            token = Token.METACHAR

        else:

            token = Token.CHAR

        return LexResult(token, next_char)



    def _eat_token_type(self, token: Token) -> LexResult:

        next_match = self.peek()

        if next_match.token != token:

            return LexResult(Token.ERROR, next_match.lexeme)

        self._pos += 1

        return next_match



    def _eat_token(self, match: LexResult) -> LexResult:

        next_match = self.peek()

        if next_match == match:

            self._pos += 1

            return next_match

        return LexResult(Token.ERROR, next_match.lexeme)



    def mark(self):

        self._stack.append(self._pos)



    def clear(self):

        self._stack.pop()



    def backtrack(self):

        self._pos = self._stack.pop()



    def eat_char(self, char=''):

        if char:

            return self._eat_token(LexResult(Token.CHAR, char))

        return self._eat_token_type(Token.CHAR)



    def eat_metachar(self, metachar):

        return self._eat_token(LexResult(Token.METACHAR, metachar))





class Regex(object):

    CHARACTERS = string.printable



    def __init__(self, pattern: str):

        """

        Initialize regex by compiling provided pattern

        """

        self._lexer = RegexLexer(pattern)

        self._state_machine = self.parse()



    def match(self, text: str) -> str:

        """

        Match text according to provided pattern.

        Returns matched substring if a match was found,

        or None otherwise

        """

        assert self._state_machine

        return self._state_machine.match(text)



    def parse(self):

        nfa = self.parse_simple_re()

        if not nfa:

            return None

        while True:

            self._lexer.mark()

            if not self._lexer.eat_metachar('|'):

                self._lexer.backtrack()

                return nfa

            next_nfa = self.parse_simple_re()

            if not next_nfa:

                self._lexer.backtrack()

                return nfa

            nfa = nfa.union(next_nfa)

            self._lexer.clear()



    def parse_simple_re(self):

        """

        <simple-re> = <basic-re>+

        """

        nfa = self.parse_basic_re()

        if not nfa:

            return None



        while True:

            next_nfa = self.parse_basic_re()

            if not next_nfa:

                break

            nfa = nfa.concat(next_nfa)

        return nfa



    def parse_basic_re(self):

        """

        <elementary-re> "*" | <elementary-re> "+" | <elementary-re>

        """

        nfa = self.parse_elementary_re()

        if not nfa:

            return None

        next_match = self._lexer.peek()

        if not next_match or next_match.token != Token.METACHAR:

            return nfa

        if next_match.lexeme == '*':

            self._lexer.eat_metachar('*')

            return nfa.kleene()

        if next_match.lexeme == '+':

            self._lexer.eat_metachar('+')

            return nfa.union(nfa.kleene())

        return nfa



    def parse_elementary_re(self):

        """

        <elementary-RE> = <group> | <any> | <char> | <set>

        :return: DFA

        """

        self._lexer.mark()

        nfa = self.parse_group()

        if nfa:

            self._lexer.clear()

            return nfa



        self._lexer.backtrack()

        if self._lexer.eat_metachar('.'):

            return StateMachine.from_set({x for x in self.CHARACTERS})



        char = self._lexer.eat_char()

        if char:

            return StateMachine.from_string(char.lexeme)



        set_chars = self.parse_set()

        if not set_chars:

            return None

        return StateMachine.from_set(set_chars)



    def parse_group(self):

        """

        <group> = "(" <RE> ")"

        :return: DFA

        """

        if not self._lexer.eat_metachar('('):

            return None

        state_machine = self.parse()

        if not state_machine:

            return None

        if not self._lexer.eat_metachar(')'):

            return None

        return state_machine



    def parse_range(self) -> {str}:

        """

        <range> = <CHAR> "-" <CHAR>

        """

        first = self._lexer.eat_char()

        if not first:

            return set()



        if not self._lexer.eat_metachar('-'):

            return set()



        last = self._lexer.eat_char()

        if not last:

            return set()



        return {chr(x) for x in range(ord(first.lexeme), ord(last.lexeme) + 1)}



    def parse_set_item(self) -> {str}:

        """

        <set item> = <range> | <char>

        """

        self._lexer.mark()

        set_item = self.parse_range()

        if set_item:

            self._lexer.clear()

            return set_item

        self._lexer.backtrack()

        next_item = self._lexer.eat_char()

        return {next_item.lexeme} if next_item else set()



    def parse_set_items(self) -> {str}:

        """

        <set items> = <set item>+

        """

        items = self.parse_set_item()

        if not items:

            return set()

        next_items = self.parse_set_item()

        while next_items:

            items.update(next_items)

            next_items = self.parse_set_item()

        return items



    def parse_positive_set(self) -> {str}:

        if not self._lexer.eat_metachar('['):

            return set()

        set_items = self.parse_set_items()

        if not set_items:

            return set()

        if not self._lexer.eat_metachar(']'):

            return set()

        return set_items



    def parse_negative_set(self) -> {str}:

        if not self._lexer.eat_metachar('['):

            return set()

        if not self._lexer.eat_metachar('^'):

            return set()

        set_items = self.parse_set_items()

        if not set_items:

            return set()

        if not self._lexer.eat_metachar(']'):

            return set()

        return set(string.printable).difference(set_items)



    def parse_set(self) -> {str}:

        """

        Parse something like [a-z9] and return the set of allowed characters

        """

        self._lexer.mark()

        set_items = self.parse_positive_set()

        if set_items:

            self._lexer.clear()

            return set_items

        self._lexer.backtrack()

        return self.parse_negative_set()

Finally, a small set of unit tests to show usage:

import unittest

from state_machine import StateMachine

from regex import Regex





class TestStateMachine(unittest.TestCase):



    def test_union(self):

        state_machine = StateMachine.from_string('abc')

        state_machine = state_machine.union(StateMachine.from_string('def'))



        self.assertEqual(state_machine.match('abc'), 'abc')

        self.assertEqual(state_machine.match('def'), 'def')

        self.assertIsNone(state_machine.match('de'))



    def test_kleene(self):

        state_machine = StateMachine.from_string('abc')

        state_machine = state_machine.kleene()



        self.assertEqual(state_machine.match(''), '')

        self.assertEqual(state_machine.match('abc'), 'abc')

        self.assertEqual(state_machine.match('abcabc'), 'abcabc')

        self.assertEqual(state_machine.match('abcDabc'), 'abc')



    def test_concat(self):

        state_machine = StateMachine.from_string('ab')

        state_machine = state_machine.concat(StateMachine.from_string('cd'))



        self.assertEqual(state_machine.match('abcd'), 'abcd')

        self.assertEqual(state_machine.match('abcde'), 'abcd')

        self.assertIsNone(state_machine.match('abc'))





class TestRegex(unittest.TestCase):



    def test_identifier_regex(self):

        regex = Regex('[a-zA-Z_][a-zA-Z0-9_]*')

        self.assertEqual(regex.match('a'), 'a')

        self.assertFalse(regex.match('0'))

        self.assertTrue(regex.match('a0'))

        self.assertEqual(regex.match('a0_3bd'), 'a0_3bd')

        self.assertEqual(regex.match('abd-43'), 'abd')



    def test_parentheses(self):

        regex = Regex('d(ab)*')

        self.assertEqual(regex.match('d'), 'd')

        self.assertEqual(regex.match('dab'), 'dab')

        self.assertEqual(regex.match('daa'), 'd')

        self.assertEqual(regex.match('dabab'), 'dabab')



    def test_union(self):

        regex = Regex('(ab*d)|(AG)')

        self.assertEqual(regex.match('adG'), 'ad')

        self.assertEqual(regex.match('AGfe'), 'AG')





if __name__ == '__main__':

    unittest.main()

edited Oct 26 '18 at 20:15

asked Oct 26 '18 at 15:16

User319

741516

state_machine.py

class Node:

    def __init__(self, value):

        self.value = value

        self.children = set()



    def empty(self):

        return self.value == ''



    def add_child(self, other):

        self.children.add(other)



    def find_parent_of_terminal(self, terminal):

        """

        We assume that there shall only be one node leading to the terminal

        and that there is only one terminal

        """

        visited = set()

        to_explore = {self}

        while to_explore:

            current = to_explore.pop()

            visited.add(current)

            if terminal in current.children:

                # If this fails, then there is a bug in union, concat, or kleene

                assert len(current.children) == 1

                return current

            to_explore.update({node for node in current.children 

                               if node not in visited})

        return None



    def leads_to(self, value):

        """

        Return True iff argument can be reached by traversing empty nodes

        """

        return bool(self._get_node_if_reachable(value))



    def _get_node_if_reachable(self, value):

        for node in self.children:

            while node and node.empty():

                if node == value:

                    return node

                node = node._get_node_if_reachable(value)

        return None



    def __repr__(self):

        result = '{} : ['.format(self.value)

        for node in self.children:

            result += str(node.value) + ', '

        result += ']n'



        return result





def EmptyNode():

    return Node('')





class StateMachine:



    def __init__(self):

        self.initial = EmptyNode()

        self.terminal = EmptyNode()



    def __repr__(self):

        return str(self.initial)



    @staticmethod

    def from_string(source):

        dfa = StateMachine()

        nodes = [Node(char) for char in source]



        dfa.initial.add_child(nodes[0])

        for i in range(len(source) - 1):

            nodes[i].add_child(nodes[i + 1])

        nodes[-1].add_child(dfa.terminal)

        return dfa



    @staticmethod

    def from_set(chars):

        dfa = StateMachine()

        first = EmptyNode()

        penultimate = EmptyNode()

        dfa.initial.children = {first}



        for char in chars:

            char_node = Node(char)

            first.add_child(char_node)

            char_node.add_child(penultimate)



        penultimate.add_child(dfa.terminal)

        return dfa



    def concat(self, other):

        other.initial.find_parent_of_terminal(other.terminal).children = {self.terminal}

        self.initial.find_parent_of_terminal(self.terminal).children = other.initial.children



        return self



    def union(self, other):

        self.initial.children.update(other.initial.children)



        this_last = self.initial.find_parent_of_terminal(self.terminal)

        other_last = other.initial.find_parent_of_terminal(other.terminal)

        empty_node = EmptyNode()

        empty_node.add_child(self.terminal)

        this_last.children = {empty_node}

        other_last.children = {empty_node}

        return self



    def kleene(self):

        penultimate_node = self.initial.find_parent_of_terminal(self.terminal)

        dummy = EmptyNode()

        penultimate_node.children = {dummy}

        dummy.add_child(self.terminal)

        penultimate_node.add_child(self.initial)

        self.initial.add_child(dummy)

        return self



    def _get_next_state(self, nodes, value, visited=None):

        if visited is None:

            visited = set()

        result = set()

        for node in nodes:

            visited.add(node)

            for child in node.children:

                if child.empty() and child not in visited:

                    result.update(self._get_next_state([child], value, visited))

                elif child.value == value:

                    result.add(child)

        return result



    def is_match(self, nodes):

        for node in nodes:

            if node .leads_to(self.terminal):

                return True

        return False



    def match(self, source):

        """

        Match a target string by simulating a NFA

        :param source: string to match

        :return: Matched part of string, or None if no match is found

        """

        result = ''

        last_match = None

        current = {self.initial}

        for char in source:

            next_nodes = self._get_next_state(current, char)

            if next_nodes:

                current = next_nodes

                result += char



                if self.is_match(current):

                    last_match = result

            else:

                break



        if self.is_match(current):

            last_match = result

        return last_match

regex.py

import collections

import enum

import string





from state_machine import StateMachine





class Token(enum.Enum):

    METACHAR = 0

    CHAR = 1

    ERROR = 2





class LexResult(collections.namedtuple('LexResult', ['token', 'lexeme'])):

    def __bool__(self):

        return self.token != Token.ERROR





class RegexLexer(object):

    metachars = '-|^().*'



    def __init__(self, pattern: str):

        self._pattern = pattern

        self._pos = 0

        self._stack = 



    def peek(self) -> LexResult:

        if self._pos >= len(self._pattern):

            return LexResult(Token.ERROR, '')

        next_char = self._pattern[self._pos]

        if next_char in self.metachars:

            token = Token.METACHAR

        else:

            token = Token.CHAR

        return LexResult(token, next_char)



    def _eat_token_type(self, token: Token) -> LexResult:

        next_match = self.peek()

        if next_match.token != token:

            return LexResult(Token.ERROR, next_match.lexeme)

        self._pos += 1

        return next_match



    def _eat_token(self, match: LexResult) -> LexResult:

        next_match = self.peek()

        if next_match == match:

            self._pos += 1

            return next_match

        return LexResult(Token.ERROR, next_match.lexeme)



    def mark(self):

        self._stack.append(self._pos)



    def clear(self):

        self._stack.pop()



    def backtrack(self):

        self._pos = self._stack.pop()



    def eat_char(self, char=''):

        if char:

            return self._eat_token(LexResult(Token.CHAR, char))

        return self._eat_token_type(Token.CHAR)



    def eat_metachar(self, metachar):

        return self._eat_token(LexResult(Token.METACHAR, metachar))





class Regex(object):

    CHARACTERS = string.printable



    def __init__(self, pattern: str):

        """

        Initialize regex by compiling provided pattern

        """

        self._lexer = RegexLexer(pattern)

        self._state_machine = self.parse()



    def match(self, text: str) -> str:

        """

        Match text according to provided pattern.

        Returns matched substring if a match was found,

        or None otherwise

        """

        assert self._state_machine

        return self._state_machine.match(text)



    def parse(self):

        nfa = self.parse_simple_re()

        if not nfa:

            return None

        while True:

            self._lexer.mark()

            if not self._lexer.eat_metachar('|'):

                self._lexer.backtrack()

                return nfa

            next_nfa = self.parse_simple_re()

            if not next_nfa:

                self._lexer.backtrack()

                return nfa

            nfa = nfa.union(next_nfa)

            self._lexer.clear()



    def parse_simple_re(self):

        """

        <simple-re> = <basic-re>+

        """

        nfa = self.parse_basic_re()

        if not nfa:

            return None



        while True:

            next_nfa = self.parse_basic_re()

            if not next_nfa:

                break

            nfa = nfa.concat(next_nfa)

        return nfa



    def parse_basic_re(self):

        """

        <elementary-re> "*" | <elementary-re> "+" | <elementary-re>

        """

        nfa = self.parse_elementary_re()

        if not nfa:

            return None

        next_match = self._lexer.peek()

        if not next_match or next_match.token != Token.METACHAR:

            return nfa

        if next_match.lexeme == '*':

            self._lexer.eat_metachar('*')

            return nfa.kleene()

        if next_match.lexeme == '+':

            self._lexer.eat_metachar('+')

            return nfa.union(nfa.kleene())

        return nfa



    def parse_elementary_re(self):

        """

        <elementary-RE> = <group> | <any> | <char> | <set>

        :return: DFA

        """

        self._lexer.mark()

        nfa = self.parse_group()

        if nfa:

            self._lexer.clear()

            return nfa



        self._lexer.backtrack()

        if self._lexer.eat_metachar('.'):

            return StateMachine.from_set({x for x in self.CHARACTERS})



        char = self._lexer.eat_char()

        if char:

            return StateMachine.from_string(char.lexeme)



        set_chars = self.parse_set()

        if not set_chars:

            return None

        return StateMachine.from_set(set_chars)



    def parse_group(self):

        """

        <group> = "(" <RE> ")"

        :return: DFA

        """

        if not self._lexer.eat_metachar('('):

            return None

        state_machine = self.parse()

        if not state_machine:

            return None

        if not self._lexer.eat_metachar(')'):

            return None

        return state_machine



    def parse_range(self) -> {str}:

        """

        <range> = <CHAR> "-" <CHAR>

        """

        first = self._lexer.eat_char()

        if not first:

            return set()



        if not self._lexer.eat_metachar('-'):

            return set()



        last = self._lexer.eat_char()

        if not last:

            return set()



        return {chr(x) for x in range(ord(first.lexeme), ord(last.lexeme) + 1)}



    def parse_set_item(self) -> {str}:

        """

        <set item> = <range> | <char>

        """

        self._lexer.mark()

        set_item = self.parse_range()

        if set_item:

            self._lexer.clear()

            return set_item

        self._lexer.backtrack()

        next_item = self._lexer.eat_char()

        return {next_item.lexeme} if next_item else set()



    def parse_set_items(self) -> {str}:

        """

        <set items> = <set item>+

        """

        items = self.parse_set_item()

        if not items:

            return set()

        next_items = self.parse_set_item()

        while next_items:

            items.update(next_items)

            next_items = self.parse_set_item()

        return items



    def parse_positive_set(self) -> {str}:

        if not self._lexer.eat_metachar('['):

            return set()

        set_items = self.parse_set_items()

        if not set_items:

            return set()

        if not self._lexer.eat_metachar(']'):

            return set()

        return set_items



    def parse_negative_set(self) -> {str}:

        if not self._lexer.eat_metachar('['):

            return set()

        if not self._lexer.eat_metachar('^'):

            return set()

        set_items = self.parse_set_items()

        if not set_items:

            return set()

        if not self._lexer.eat_metachar(']'):

            return set()

        return set(string.printable).difference(set_items)



    def parse_set(self) -> {str}:

        """

        Parse something like [a-z9] and return the set of allowed characters

        """

        self._lexer.mark()

        set_items = self.parse_positive_set()

        if set_items:

            self._lexer.clear()

            return set_items

        self._lexer.backtrack()

        return self.parse_negative_set()

Finally, a small set of unit tests to show usage:

import unittest

from state_machine import StateMachine

from regex import Regex





class TestStateMachine(unittest.TestCase):



    def test_union(self):

        state_machine = StateMachine.from_string('abc')

        state_machine = state_machine.union(StateMachine.from_string('def'))



        self.assertEqual(state_machine.match('abc'), 'abc')

        self.assertEqual(state_machine.match('def'), 'def')

        self.assertIsNone(state_machine.match('de'))



    def test_kleene(self):

        state_machine = StateMachine.from_string('abc')

        state_machine = state_machine.kleene()



        self.assertEqual(state_machine.match(''), '')

        self.assertEqual(state_machine.match('abc'), 'abc')

        self.assertEqual(state_machine.match('abcabc'), 'abcabc')

        self.assertEqual(state_machine.match('abcDabc'), 'abc')



    def test_concat(self):

        state_machine = StateMachine.from_string('ab')

        state_machine = state_machine.concat(StateMachine.from_string('cd'))



        self.assertEqual(state_machine.match('abcd'), 'abcd')

        self.assertEqual(state_machine.match('abcde'), 'abcd')

        self.assertIsNone(state_machine.match('abc'))





class TestRegex(unittest.TestCase):



    def test_identifier_regex(self):

        regex = Regex('[a-zA-Z_][a-zA-Z0-9_]*')

        self.assertEqual(regex.match('a'), 'a')

        self.assertFalse(regex.match('0'))

        self.assertTrue(regex.match('a0'))

        self.assertEqual(regex.match('a0_3bd'), 'a0_3bd')

        self.assertEqual(regex.match('abd-43'), 'abd')



    def test_parentheses(self):

        regex = Regex('d(ab)*')

        self.assertEqual(regex.match('d'), 'd')

        self.assertEqual(regex.match('dab'), 'dab')

        self.assertEqual(regex.match('daa'), 'd')

        self.assertEqual(regex.match('dabab'), 'dabab')



    def test_union(self):

        regex = Regex('(ab*d)|(AG)')

        self.assertEqual(regex.match('adG'), 'ad')

        self.assertEqual(regex.match('AGfe'), 'AG')





if __name__ == '__main__':

    unittest.main()

python python-3.x parsing regex reinventing-the-wheel

edited Oct 26 '18 at 20:15

asked Oct 26 '18 at 15:16

User319

741516

edited Oct 26 '18 at 20:15

asked Oct 26 '18 at 15:16

User319

741516

edited Oct 26 '18 at 20:15

asked Oct 26 '18 at 15:16

User319

741516

asked Oct 26 '18 at 15:16

User319

741516

asked Oct 26 '18 at 15:16

User319

741516

$begingroup$
Hello, this looks like a very interesting question and a well written piece of code! By any chance, would you have any small example that could be used to test your code ?
$endgroup$
– Josay
Oct 26 '18 at 16:14

$begingroup$
@Josay See edit
$endgroup$
– User319
Oct 26 '18 at 20:16

add a comment |

$begingroup$
Hello, this looks like a very interesting question and a well written piece of code! By any chance, would you have any small example that could be used to test your code ?
$endgroup$
– Josay
Oct 26 '18 at 16:14

$begingroup$
@Josay See edit
$endgroup$
– User319
Oct 26 '18 at 20:16

Hello, this looks like a very interesting question and a well written piece of code! By any chance, would you have any small example that could be used to test your code ?

– Josay
Oct 26 '18 at 16:14

@Josay See edit

– User319
Oct 26 '18 at 20:16

add a comment |

1 Answer
1

active

oldest

votes

Yay! You ran flake8 and followed PEP-8. Nice clean code.

    self.assertEqual(state_machine.match('abc'), 'abc')

Ummm, this is arguably backwards.
Convention for xUnit in many languages is to assertEqual(expected, computed).
It can affect how the diagnostic output is displayed for a failure.

    state_machine = state_machine.union(StateMachine.from_string('def'))

Choosing the name union for your public API is perhaps slightly confusing.
"Union" is drawn from set theory,
while "alternation" is the term the regex literature tends to use for |.

    state_machine = StateMachine.from_string('abc')

The class name is perfectly clear, it's great.
For a local variable that we'll be using a bunch, sm would have sufficed.
You already have a line that verifies that .from_string() doesn't blow up, so
consider combining two assignments on a single line:

    sm = StateMachine.from_string('abc').kleene()

The Regex class is wonderfully straightforward.
Pat yourself on the back.

The peek method in the lexer is perhaps a little on the tricky side,
and would benefit from comments
about when we consume something or not.
I'm looking for invariants on pos and the stack.
I like the assert in find_parent_of_terminal, and its comment.

        to_explore.update({node for node in current.children 

                           if node not in visited})

That's just set difference, yes? children - visited

Overall, looks good. Ship it!

answered 1 hour ago

J_H

4,552131

add a comment |

StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f206333%2fminimal-regex-engine%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

Yay! You ran flake8 and followed PEP-8. Nice clean code.

    self.assertEqual(state_machine.match('abc'), 'abc')

Ummm, this is arguably backwards.
Convention for xUnit in many languages is to assertEqual(expected, computed).
It can affect how the diagnostic output is displayed for a failure.

    state_machine = state_machine.union(StateMachine.from_string('def'))

Choosing the name union for your public API is perhaps slightly confusing.
"Union" is drawn from set theory,
while "alternation" is the term the regex literature tends to use for |.

    state_machine = StateMachine.from_string('abc')

    sm = StateMachine.from_string('abc').kleene()

The Regex class is wonderfully straightforward.
Pat yourself on the back.

        to_explore.update({node for node in current.children 

                           if node not in visited})

That's just set difference, yes? children - visited

Overall, looks good. Ship it!

answered 1 hour ago

J_H

4,552131

add a comment |

Yay! You ran flake8 and followed PEP-8. Nice clean code.

    self.assertEqual(state_machine.match('abc'), 'abc')

Ummm, this is arguably backwards.
Convention for xUnit in many languages is to assertEqual(expected, computed).
It can affect how the diagnostic output is displayed for a failure.

    state_machine = state_machine.union(StateMachine.from_string('def'))

Choosing the name union for your public API is perhaps slightly confusing.
"Union" is drawn from set theory,
while "alternation" is the term the regex literature tends to use for |.

    state_machine = StateMachine.from_string('abc')

    sm = StateMachine.from_string('abc').kleene()

The Regex class is wonderfully straightforward.
Pat yourself on the back.

        to_explore.update({node for node in current.children 

                           if node not in visited})

That's just set difference, yes? children - visited

Overall, looks good. Ship it!

answered 1 hour ago

J_H

4,552131

add a comment |

Yay! You ran flake8 and followed PEP-8. Nice clean code.

    self.assertEqual(state_machine.match('abc'), 'abc')

Ummm, this is arguably backwards.
Convention for xUnit in many languages is to assertEqual(expected, computed).
It can affect how the diagnostic output is displayed for a failure.

    state_machine = state_machine.union(StateMachine.from_string('def'))

Choosing the name union for your public API is perhaps slightly confusing.
"Union" is drawn from set theory,
while "alternation" is the term the regex literature tends to use for |.

    state_machine = StateMachine.from_string('abc')

    sm = StateMachine.from_string('abc').kleene()

The Regex class is wonderfully straightforward.
Pat yourself on the back.

        to_explore.update({node for node in current.children 

                           if node not in visited})

That's just set difference, yes? children - visited

Overall, looks good. Ship it!

answered 1 hour ago

J_H

4,552131

Yay! You ran flake8 and followed PEP-8. Nice clean code.

    self.assertEqual(state_machine.match('abc'), 'abc')

Ummm, this is arguably backwards.
Convention for xUnit in many languages is to assertEqual(expected, computed).
It can affect how the diagnostic output is displayed for a failure.

    state_machine = state_machine.union(StateMachine.from_string('def'))

Choosing the name union for your public API is perhaps slightly confusing.
"Union" is drawn from set theory,
while "alternation" is the term the regex literature tends to use for |.

    state_machine = StateMachine.from_string('abc')

    sm = StateMachine.from_string('abc').kleene()

The Regex class is wonderfully straightforward.
Pat yourself on the back.

        to_explore.update({node for node in current.children 

                           if node not in visited})

That's just set difference, yes? children - visited

Overall, looks good. Ship it!

answered 1 hour ago

J_H

4,552131

answered 1 hour ago

J_H

4,552131

answered 1 hour ago

J_H

4,552131

answered 1 hour ago

J_H

4,552131

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Code Review Stack Exchange!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

Use MathJax to format equations. MathJax reference.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Gfrktyl