sphinx/sphinx/pycode/parser.py

"""
    sphinx.pycode.parser
    ~~~~~~~~~~~~~~~~~~~~

    Utilities parsing and analyzing Python code.

    :copyright: Copyright 2007-2019 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
import ast
import inspect
import itertools
import re
import sys
import tokenize
from token import NAME, NEWLINE, INDENT, DEDENT, NUMBER, OP, STRING
from tokenize import COMMENT, NL
from typing import Any, Dict, List, Tuple


comment_re = re.compile('^\\s*#: ?(.*)\r?\n?$')
indent_re = re.compile('^\\s*$')
emptyline_re = re.compile('^\\s*(#.*)?$')


if sys.version_info >= (3, 6):
    ASSIGN_NODES = (ast.Assign, ast.AnnAssign)
else:
    ASSIGN_NODES = (ast.Assign)


def filter_whitespace(code: str) -> str:
    return code.replace('\f', ' ')  # replace FF (form feed) with whitespace


def get_assign_targets(node: ast.AST) -> List[ast.expr]:
    """Get list of targets from Assign and AnnAssign node."""
    if isinstance(node, ast.Assign):
        return node.targets
    else:
        return [node.target]  # type: ignore


def get_lvar_names(node: ast.AST, self: ast.arg = None) -> List[str]:
    """Convert assignment-AST to variable names.

    This raises `TypeError` if the assignment does not create new variable::

        ary[0] = 'foo'
        dic["bar"] = 'baz'
        # => TypeError
    """
    if self:
        self_id = self.arg

    node_name = node.__class__.__name__
    if node_name in ('Index', 'Num', 'Slice', 'Str', 'Subscript'):
        raise TypeError('%r does not create new variable' % node)
    elif node_name == 'Name':
        if self is None or node.id == self_id:  # type: ignore
            return [node.id]  # type: ignore
        else:
            raise TypeError('The assignment %r is not instance variable' % node)
    elif node_name in ('Tuple', 'List'):
        members = []
        for elt in node.elts:  # type: ignore
            try:
                members.extend(get_lvar_names(elt, self))
            except TypeError:
                pass
        return members
    elif node_name == 'Attribute':
        if node.value.__class__.__name__ == 'Name' and self and node.value.id == self_id:  # type: ignore  # NOQA
            # instance variable
            return ["%s" % get_lvar_names(node.attr, self)[0]]  # type: ignore
        else:
            raise TypeError('The assignment %r is not instance variable' % node)
    elif node_name == 'str':
        return [node]  # type: ignore
    elif node_name == 'Starred':
        return get_lvar_names(node.value, self)  # type: ignore
    else:
        raise NotImplementedError('Unexpected node name %r' % node_name)


def dedent_docstring(s: str) -> str:
    """Remove common leading indentation from docstring."""
    def dummy() -> None:
        # dummy function to mock `inspect.getdoc`.
        pass

    dummy.__doc__ = s
    docstring = inspect.getdoc(dummy)
    return docstring.lstrip("\r\n").rstrip("\r\n")


class Token:
    """Better token wrapper for tokenize module."""

    def __init__(self, kind: int, value: Any, start: Tuple[int, int], end: Tuple[int, int],
                 source: str) -> None:
        self.kind = kind
        self.value = value
        self.start = start
        self.end = end
        self.source = source

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, int):
            return self.kind == other
        elif isinstance(other, str):
            return self.value == other
        elif isinstance(other, (list, tuple)):
            return [self.kind, self.value] == list(other)
        elif other is None:
            return False
        else:
            raise ValueError('Unknown value: %r' % other)

    def match(self, *conditions) -> bool:
        return any(self == candidate for candidate in conditions)

    def __repr__(self) -> str:
        return '<Token kind=%r value=%r>' % (tokenize.tok_name[self.kind],
                                             self.value.strip())


class TokenProcessor:
    def __init__(self, buffers: List[str]) -> None:
        lines = iter(buffers)
        self.buffers = buffers
        self.tokens = tokenize.generate_tokens(lambda: next(lines))
        self.current = None     # type: Token
        self.previous = None    # type: Token

    def get_line(self, lineno: int) -> str:
        """Returns specified line."""
        return self.buffers[lineno - 1]

    def fetch_token(self) -> Token:
        """Fetch a next token from source code.

        Returns ``False`` if sequence finished.
        """
        try:
            self.previous = self.current
            self.current = Token(*next(self.tokens))
        except StopIteration:
            self.current = None

        return self.current

    def fetch_until(self, condition: Any) -> List[Token]:
        """Fetch tokens until specified token appeared.

        .. note:: This also handles parenthesis well.
        """
        tokens = []
        while self.fetch_token():
            tokens.append(self.current)
            if self.current == condition:
                break
            elif self.current == [OP, '(']:
                tokens += self.fetch_until([OP, ')'])
            elif self.current == [OP, '{']:
                tokens += self.fetch_until([OP, '}'])
            elif self.current == [OP, '[']:
                tokens += self.fetch_until([OP, ']'])

        return tokens


class AfterCommentParser(TokenProcessor):
    """Python source code parser to pick up comment after assignment.

    This parser takes a python code starts with assignment statement,
    and returns the comments for variable if exists.
    """

    def __init__(self, lines: List[str]) -> None:
        super().__init__(lines)
        self.comment = None  # type: str

    def fetch_rvalue(self) -> List[Token]:
        """Fetch right-hand value of assignment."""
        tokens = []
        while self.fetch_token():
            tokens.append(self.current)
            if self.current == [OP, '(']:
                tokens += self.fetch_until([OP, ')'])
            elif self.current == [OP, '{']:
                tokens += self.fetch_until([OP, '}'])
            elif self.current == [OP, '[']:
                tokens += self.fetch_until([OP, ']'])
            elif self.current == INDENT:
                tokens += self.fetch_until(DEDENT)
            elif self.current == [OP, ';']:
                break
            elif self.current.kind not in (OP, NAME, NUMBER, STRING):
                break

        return tokens

    def parse(self) -> None:
        """Parse the code and obtain comment after assignment."""
        # skip lvalue (or whole of AnnAssign)
        while not self.fetch_token().match([OP, '='], NEWLINE, COMMENT):
            assert self.current

        # skip rvalue (if exists)
        if self.current == [OP, '=']:
            self.fetch_rvalue()

        if self.current == COMMENT:
            self.comment = self.current.value


class VariableCommentPicker(ast.NodeVisitor):
    """Python source code parser to pick up variable comments."""

    def __init__(self, buffers: List[str], encoding: str) -> None:
        self.counter = itertools.count()
        self.buffers = buffers
        self.encoding = encoding
        self.context = []               # type: List[str]
        self.current_classes = []       # type: List[str]
        self.current_function = None    # type: ast.FunctionDef
        self.comments = {}              # type: Dict[Tuple[str, str], str]
        self.previous = None            # type: ast.AST
        self.deforders = {}             # type: Dict[str, int]
        super().__init__()

    def add_entry(self, name: str) -> None:
        if self.current_function:
            if self.current_classes and self.context[-1] == "__init__":
                # store variable comments inside __init__ method of classes
                definition = self.context[:-1] + [name]
            else:
                return
        else:
            definition = self.context + [name]

        self.deforders[".".join(definition)] = next(self.counter)

    def add_variable_comment(self, name: str, comment: str) -> None:
        if self.current_function:
            if self.current_classes and self.context[-1] == "__init__":
                # store variable comments inside __init__ method of classes
                context = ".".join(self.context[:-1])
            else:
                return
        else:
            context = ".".join(self.context)

        self.comments[(context, name)] = comment

    def get_self(self) -> ast.arg:
        """Returns the name of first argument if in function."""
        if self.current_function and self.current_function.args.args:
            return self.current_function.args.args[0]
        else:
            return None

    def get_line(self, lineno: int) -> str:
        """Returns specified line."""
        return self.buffers[lineno - 1]

    def visit(self, node: ast.AST) -> None:
        """Updates self.previous to ."""
        super().visit(node)
        self.previous = node

    def visit_Import(self, node: ast.Import) -> None:
        """Handles Import node and record it to definition orders."""
        for name in node.names:
            if name.asname:
                self.add_entry(name.asname)
            else:
                self.add_entry(name.name)

    def visit_ImportFrom(self, node: ast.Import) -> None:
        """Handles Import node and record it to definition orders."""
        for name in node.names:
            if name.asname:
                self.add_entry(name.asname)
            else:
                self.add_entry(name.name)

    def visit_Assign(self, node: ast.Assign) -> None:
        """Handles Assign node and pick up a variable comment."""
        try:
            targets = get_assign_targets(node)
            varnames = sum([get_lvar_names(t, self=self.get_self()) for t in targets], [])  # type: List[str]  # NOQA
            current_line = self.get_line(node.lineno)
        except TypeError:
            return  # this assignment is not new definition!

        # check comments after assignment
        parser = AfterCommentParser([current_line[node.col_offset:]] +
                                    self.buffers[node.lineno:])
        parser.parse()
        if parser.comment and comment_re.match(parser.comment):
            for varname in varnames:
                self.add_variable_comment(varname, comment_re.sub('\\1', parser.comment))
                self.add_entry(varname)
            return

        # check comments before assignment
        if indent_re.match(current_line[:node.col_offset]):
            comment_lines = []
            for i in range(node.lineno - 1):
                before_line = self.get_line(node.lineno - 1 - i)
                if comment_re.match(before_line):
                    comment_lines.append(comment_re.sub('\\1', before_line))
                else:
                    break

            if comment_lines:
                comment = dedent_docstring('\n'.join(reversed(comment_lines)))
                for varname in varnames:
                    self.add_variable_comment(varname, comment)
                    self.add_entry(varname)
                return

        # not commented (record deforders only)
        for varname in varnames:
            self.add_entry(varname)

    def visit_AnnAssign(self, node: ast.AST) -> None:  # Note: ast.AnnAssign not found in py35
        """Handles AnnAssign node and pick up a variable comment."""
        self.visit_Assign(node)  # type: ignore

    def visit_Expr(self, node: ast.Expr) -> None:
        """Handles Expr node and pick up a comment if string."""
        if (isinstance(self.previous, ASSIGN_NODES) and isinstance(node.value, ast.Str)):
            try:
                targets = get_assign_targets(self.previous)
                varnames = get_lvar_names(targets[0], self.get_self())
                for varname in varnames:
                    if isinstance(node.value.s, str):
                        docstring = node.value.s
                    else:
                        docstring = node.value.s.decode(self.encoding or 'utf-8')

                    self.add_variable_comment(varname, dedent_docstring(docstring))
                    self.add_entry(varname)
            except TypeError:
                pass  # this assignment is not new definition!

    def visit_Try(self, node: ast.Try) -> None:
        """Handles Try node and processes body and else-clause.

        .. note:: pycode parser ignores objects definition in except-clause.
        """
        for subnode in node.body:
            self.visit(subnode)
        for subnode in node.orelse:
            self.visit(subnode)

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        """Handles ClassDef node and set context."""
        self.current_classes.append(node.name)
        self.add_entry(node.name)
        self.context.append(node.name)
        self.previous = node
        for child in node.body:
            self.visit(child)
        self.context.pop()
        self.current_classes.pop()

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        """Handles FunctionDef node and set context."""
        if self.current_function is None:
            self.add_entry(node.name)  # should be called before setting self.current_function
            self.context.append(node.name)
            self.current_function = node
            for child in node.body:
                self.visit(child)
            self.context.pop()
            self.current_function = None

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        """Handles AsyncFunctionDef node and set context."""
        self.visit_FunctionDef(node)  # type: ignore


class DefinitionFinder(TokenProcessor):
    """Python source code parser to detect location of functions,
    classes and methods.
    """

    def __init__(self, lines: List[str]) -> None:
        super().__init__(lines)
        self.decorator = None   # type: Token
        self.context = []       # type: List[str]
        self.indents = []       # type: List
        self.definitions = {}   # type: Dict[str, Tuple[str, int, int]]

    def add_definition(self, name: str, entry: Tuple[str, int, int]) -> None:
        """Add a location of definition."""
        if self.indents and self.indents[-1][0] == 'def' and entry[0] == 'def':
            # ignore definition of inner function
            pass
        else:
            self.definitions[name] = entry

    def parse(self) -> None:
        """Parse the code to obtain location of definitions."""
        while True:
            token = self.fetch_token()
            if token is None:
                break
            elif token == COMMENT:
                pass
            elif token == [OP, '@'] and (self.previous is None or
                                         self.previous.match(NEWLINE, NL, INDENT, DEDENT)):
                if self.decorator is None:
                    self.decorator = token
            elif token.match([NAME, 'class']):
                self.parse_definition('class')
            elif token.match([NAME, 'def']):
                self.parse_definition('def')
            elif token == INDENT:
                self.indents.append(('other', None, None))
            elif token == DEDENT:
                self.finalize_block()

    def parse_definition(self, typ: str) -> None:
        """Parse AST of definition."""
        name = self.fetch_token()
        self.context.append(name.value)
        funcname = '.'.join(self.context)

        if self.decorator:
            start_pos = self.decorator.start[0]
            self.decorator = None
        else:
            start_pos = name.start[0]

        self.fetch_until([OP, ':'])
        if self.fetch_token().match(COMMENT, NEWLINE):
            self.fetch_until(INDENT)
            self.indents.append((typ, funcname, start_pos))
        else:
            # one-liner
            self.add_definition(funcname, (typ, start_pos, name.end[0]))
            self.context.pop()

    def finalize_block(self) -> None:
        """Finalize definition block."""
        definition = self.indents.pop()
        if definition[0] != 'other':
            typ, funcname, start_pos = definition
            end_pos = self.current.end[0] - 1
            while emptyline_re.match(self.get_line(end_pos)):
                end_pos -= 1

            self.add_definition(funcname, (typ, start_pos, end_pos))
            self.context.pop()


class Parser:
    """Python source code parser to pick up variable comments.

    This is a better wrapper for ``VariableCommentPicker``.
    """

    def __init__(self, code: str, encoding: str = 'utf-8') -> None:
        self.code = filter_whitespace(code)
        self.encoding = encoding
        self.comments = {}          # type: Dict[Tuple[str, str], str]
        self.deforders = {}         # type: Dict[str, int]
        self.definitions = {}       # type: Dict[str, Tuple[str, int, int]]

    def parse(self) -> None:
        """Parse the source code."""
        self.parse_comments()
        self.parse_definition()

    def parse_comments(self) -> None:
        """Parse the code and pick up comments."""
        tree = ast.parse(self.code.encode())
        picker = VariableCommentPicker(self.code.splitlines(True), self.encoding)
        picker.visit(tree)
        self.comments = picker.comments
        self.deforders = picker.deforders

    def parse_definition(self) -> None:
        """Parse the location of definitions from the code."""
        parser = DefinitionFinder(self.code.splitlines(True))
        parser.parse()
        self.definitions = parser.definitions