# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# This file defines a parser for a simple language based on S/R "formulas"
# (which are described in sections 2.3 and 2.4 in Chambers & Hastie, 1992). It
# uses the machinery in patsy.infix_parser to do the heavy lifting -- its
# biggest job is to handle tokenization.
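#
# A quick usage sketch (illustrative only; see parse_formula below):
#
#   from patsy.parse_formula import parse_formula
#   tree = parse_formula("y ~ a + b")
#   assert tree.type == "~"   # the root node is always a "~"
#   # tree.args holds the left- and right-hand sides of the formula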

from __future__ import print_function

__all__ = ["parse_formula"]

# The Python tokenizer
import tokenize

import six
from six.moves import cStringIO as StringIO

from patsy import PatsyError
from patsy.origin import Origin
from patsy.infix_parser import Token, Operator, infix_parse, ParseNode
from patsy.tokens import python_tokenize, pretty_untokenize
from patsy.util import PushbackAdapter

_atomic_token_types = ["PYTHON_EXPR", "ZERO", "ONE", "NUMBER"]

def _is_a(f, v):
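    # Return True iff f(v) succeeds without raising ValueError; used below
    # to classify untokenized expression text, e.g. _is_a(int, "10") is True
    # while _is_a(int, "a") is False.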
    try:
        f(v)
    except ValueError:
        return False
    else:
        return True

# Helper function for _tokenize_formula:
def _read_python_expr(it, end_tokens):
    # Read out a full python expression, stopping when we hit an
    # unnested end token.
    pytypes = []
    token_strings = []
    origins = []
    bracket_level = 0
    for pytype, token_string, origin in it:
        assert bracket_level >= 0
        if bracket_level == 0 and token_string in end_tokens:
            it.push_back((pytype, token_string, origin))
            break
        if token_string in ("(", "[", "{"):
            bracket_level += 1
        if token_string in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise PatsyError("unmatched close bracket", origin)
        pytypes.append(pytype)
        token_strings.append(token_string)
        origins.append(origin)
    # Either we found an end_token, or we hit the end of the string
    if bracket_level == 0:
        expr_text = pretty_untokenize(zip(pytypes, token_strings))
        if expr_text == "0":
            token_type = "ZERO"
        elif expr_text == "1":
            token_type = "ONE"
        elif _is_a(int, expr_text) or _is_a(float, expr_text):
            token_type = "NUMBER"
        else:
            token_type = "PYTHON_EXPR"
        return Token(token_type, Origin.combine(origins), extra=expr_text)
    else:
        raise PatsyError("unclosed bracket in embedded Python "
                            "expression",
                            Origin.combine(origins))
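
# For example (cf. test__tokenize_formula below): reading "foo(b,c + 2)) + 1"
# with "+" and ")" among end_tokens consumes "foo(b, c + 2)" as a single
# PYTHON_EXPR token -- the inner "+" is bracket-protected -- and stops at the
# unnested ")".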

def _tokenize_formula(code, operator_strings):
    assert "(" not in operator_strings
    assert ")" not in operator_strings
    magic_token_types = {"(": Token.LPAREN,
                         ")": Token.RPAREN,
                         }
    for operator_string in operator_strings:
        magic_token_types[operator_string] = operator_string
    # Once we enter a Python expression, a ( does not end it, but any other
    # "magic" token does:
    end_tokens = set(magic_token_types)
    end_tokens.remove("(")
    
    it = PushbackAdapter(python_tokenize(code))
    for pytype, token_string, origin in it:
        if token_string in magic_token_types:
            yield Token(magic_token_types[token_string], origin)
        else:
            it.push_back((pytype, token_string, origin))
            yield _read_python_expr(it, end_tokens)
                    
def test__tokenize_formula():
    code = "y ~ a + (foo(b,c + 2)) + -1 + 0 + 10"
    tokens = list(_tokenize_formula(code, ["+", "-", "~"]))
    expecteds = [("PYTHON_EXPR", Origin(code, 0, 1), "y"),
                 ("~", Origin(code, 2, 3), None),
                 ("PYTHON_EXPR", Origin(code, 4, 5), "a"),
                 ("+", Origin(code, 6, 7), None),
                 (Token.LPAREN, Origin(code, 8, 9), None),
                 ("PYTHON_EXPR", Origin(code, 9, 23), "foo(b, c + 2)"),
                 (Token.RPAREN, Origin(code, 23, 24), None),
                 ("+", Origin(code, 25, 26), None),
                 ("-", Origin(code, 27, 28), None),
                 ("ONE", Origin(code, 28, 29), "1"),
                 ("+", Origin(code, 30, 31), None),
                 ("ZERO", Origin(code, 32, 33), "0"),
                 ("+", Origin(code, 34, 35), None),
                 ("NUMBER", Origin(code, 36, 38), "10"),
                 ]
    for got, expected in zip(tokens, expecteds):
        assert isinstance(got, Token)
        assert got.type == expected[0]
        assert got.origin == expected[1]
        assert got.extra == expected[2]
_unary_tilde = Operator("~", 1, -100)
_default_ops = [
    _unary_tilde,
    Operator("~", 2, -100),

    Operator("+", 2, 100),
    Operator("-", 2, 100),
    Operator("*", 2, 200),
    Operator("/", 2, 200),
    Operator(":", 2, 300),
    Operator("**", 2, 500),

    Operator("+", 1, 100),
    Operator("-", 1, 100),
]
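
# Higher precedence binds tighter: "~" (precedence -100) is the loosest
# binder and "**" (500) the tightest, so e.g. "a + b ~ c * d" parses as
# (a + b) ~ (c * d) -- see _parser_tests below.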

def parse_formula(code, extra_operators=[]):
    if not code.strip():
        code = "~ 1"

    for op in extra_operators:
        if op.precedence < 0:
            raise ValueError("all operators must have precedence >= 0")

    operators = _default_ops + extra_operators
    operator_strings = [op.token_type for op in operators]
    tree = infix_parse(_tokenize_formula(code, operator_strings),
                       operators,
                       _atomic_token_types)
    if not isinstance(tree, ParseNode) or tree.type != "~":
        tree = ParseNode("~", None, [tree], tree.origin)
    return tree
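
# A minimal usage sketch (behavior pinned down by the tests below):
#
#   parse_formula("")       # empty input is treated as "~ 1"
#   parse_formula("a + b")  # one-sided formula: wrapped as ~(a + b)
#   parse_formula("a | b", extra_operators=[Operator("|", 2, 250)])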

#############

_parser_tests = {
    "": ["~", "1"],
    " ": ["~", "1"],
    " \n ": ["~", "1"],
    "1": ["~", "1"],
    "a": ["~", "a"],
    "a ~ b": ["~", "a", "b"],
    "(a ~ b)": ["~", "a", "b"],
    "a ~ ((((b))))": ["~", "a", "b"],
    "a ~ ((((+b))))": ["~", "a", ["+", "b"]],
    "a + b + c": ["~", ["+", ["+", "a", "b"], "c"]],
    "a + (b ~ c) + d": ["~", ["+", ["+", "a", ["~", "b", "c"]], "d"]],
    "a + np.log(a, base=10)": ["~", ["+", "a", "np.log(a, base=10)"]],
    # Note different spacing:
    "a + np . log(a , base = 10)": ["~", ["+", "a", "np.log(a, base=10)"]],
    # Check precedence
    "a + b ~ c * d": ["~", ["+", "a", "b"], ["*", "c", "d"]],
    "a + b * c": ["~", ["+", "a", ["*", "b", "c"]]],
    "-a**2": ["~", ["-", ["**", "a", "2"]]],
    "-a:b": ["~", ["-", [":", "a", "b"]]],
    "a + b:c": ["~", ["+", "a", [":", "b", "c"]]],
    "(a + b):c": ["~", [":", ["+", "a", "b"], "c"]],
    "a*b:c": ["~", ["*", "a", [":", "b", "c"]]],
    "a+b / c": ["~", ["+", "a", ["/", "b", "c"]]],
    "~ a": ["~", "a"],
    "-1": ["~", ["-", "1"]],
}

def _compare_trees(got, expected):
    assert isinstance(got, ParseNode)
    if got.args:
        assert got.type == expected[0]
        for arg, expected_arg in zip(got.args, expected[1:]):
            _compare_trees(arg, expected_arg)
    else:
        assert got.type in _atomic_token_types
        assert got.token.extra == expected

def _do_parse_test(test_cases, extra_operators):
    for code, expected in six.iteritems(test_cases):
        actual = parse_formula(code, extra_operators=extra_operators)
        print(repr(code), repr(expected))
        print(actual)
        _compare_trees(actual, expected)

def test_parse_formula():
    _do_parse_test(_parser_tests, [])

def test_parse_origin():
    tree = parse_formula("a ~ b + c")
    assert tree.origin == Origin("a ~ b + c", 0, 9)
    assert tree.token.origin == Origin("a ~ b + c", 2, 3)
    assert tree.args[0].origin == Origin("a ~ b + c", 0, 1)
    assert tree.args[1].origin == Origin("a ~ b + c", 4, 9)
    assert tree.args[1].token.origin == Origin("a ~ b + c", 6, 7)
    assert tree.args[1].args[0].origin == Origin("a ~ b + c", 4, 5)
    assert tree.args[1].args[1].origin == Origin("a ~ b + c", 8, 9)

# <> mark off where the error should be reported:
_parser_error_tests = [
    "a <+>",
    "a + <(>",
    "a + b <# asdf>",

    "<)>",
    "a + <)>",

    "<*> a",
    "a + <*>",

    "a + <foo[bar>",
    "a + <foo{bar>",
    "a + <foo(bar>",

    "a + <[bar>",
    "a + <{bar>",

    "a + <{bar[]>",

    "a + foo<]>bar",
    "a + foo[]<]>bar",
    "a + foo{}<}>bar",
    "a + foo<)>bar",

    "a + b<)>",
    "(a) <.>",

    "<(>a + b",

    "a +< >'foo",  # Not the best placement for the error
]

# Split out so it can also be used by tests of the evaluator (which also
# raises PatsyError's)
def _parsing_error_test(parse_fn, error_descs):  # pragma: no cover
    for error_desc in error_descs:
        letters = []
        start = None
        end = None
        for letter in error_desc:
            if letter == "<":
                start = len(letters)
            elif letter == ">":
                end = len(letters)
            else:
                letters.append(letter)
        bad_code = "".join(letters)
        assert start is not None and end is not None
        print(error_desc)
        print(repr(bad_code), start, end)
        try:
            parse_fn(bad_code)
        except PatsyError as e:
            print(e)
            assert e.origin.code == bad_code
            assert e.origin.start == start
            assert e.origin.end == end
        else:
            assert False, "parser failed to report an error!"

def test_parse_errors(extra_operators=[]):
    def parse_fn(code):
        return parse_formula(code, extra_operators=extra_operators)
    _parsing_error_test(parse_fn, _parser_error_tests)

_extra_op_parser_tests = {
    "a | b": ["~", ["|", "a", "b"]],
    "a * b|c": ["~", ["*", "a", ["|", "b", "c"]]],
}

def test_parse_extra_op():
    extra_operators = [Operator("|", 2, 250)]
    _do_parse_test(_parser_tests,
                   extra_operators=extra_operators)
    _do_parse_test(_extra_op_parser_tests,
                   extra_operators=extra_operators)
    test_parse_errors(extra_operators=extra_operators)