Source code for starstar.docstr

"""

.. note::
    
    It currently works for Google and Numpy docstrings, but it's still a bit of a WIP.

It tries its best to preserve the surrounding whitespace: for every text block it separates
out the blank lines above and below it and the common indentation to its left, so
you can edit the content of the block and the rest of it will be preserved.

.. code-block:: python

    import starstar.docstr as ssdoc

    doc = ssdoc.parse('''This is my docstring description.

    Args:
        a (str): this is a
        b (str): this is b
            yes its b
            yes b

    I'm some more docs

    Returns:
        dict: the special thing
        list: other

    alksdfj
    ''')

    # renders the docstring (looks the same as the input)
    print(doc)

    # see the breakdown of the different sections/parameters.
    print(repr(doc))

    doc['args'].append(doc.Param.new(
        'c', 'list', 
        'this is c, that holds some values.\\nsome more text.'))
    
    assert str(doc) == '''This is my docstring description.

    Args:
        a (str): this is a
        b (str): this is b
            yes its b
            yes b
        c (list): this is c, that holds some values.
            some more text.

    I'm some more docs

    Returns:
        dict: the special thing
        list: other

    alksdfj
    '''
"""
from __future__ import annotations
from typing import List, Union, cast, overload, TypeVar, Generic
import re
# import copy
import inspect

# https://realpython.com/documenting-python-code/#docstring-types

# Type of the items stored in a block's body; bounded to plain text lines (``str``)
# or nested ``Block`` objects (forward reference, ``Block`` is defined further down).
BodyType = TypeVar('BodyType', bound='Block|str')


class BaseBlock(Generic[BodyType]):
    '''This represents a block of text. This will separate out any leading or trailing blank lines
    as well as factor out the common indentation. This lets you safely modify the content of the
    block while preserving the surrounding whitespace. It also can use an optional section title
    with its own separate indentation (like with google docstring sections).

    .. code-block:: python

        block = Block('\\n\\n\\n    blah\\n\\n\\n\\n')
        block.body = ['blorg\\n', 'blagh\\n']
        assert str(block) == '\\n\\n\\n    blorg\\n    blagh\\n\\n\\n\\n'

    Arguments:
        text (str, list): The block content, either as one string or as a list of
            lines / nested blocks.
        title (str): Optional title line. Its own leading whitespace is measured
            and stored separately from the body indentation.
        name (str): The block name. Defaults to the (left-stripped) title.
        kind (str): An optional normalized block kind (e.g. ``'ARGS'``).
        cleandoc (bool): Run ``inspect.cleandoc`` on ``text`` first. Only applies
            when ``text`` is a string.
        end_newline (bool): Make sure the last line ends with a newline.
        raw (bool): Skip the whitespace separation and use all lines as the body.
        indent (int): Extra indentation added on top of the detected one.
    '''
    # blank lines found above / below the body (kept verbatim for re-rendering)
    leading: list[str]
    trailing: list[str]
    # the dedented body lines (or nested blocks); exposed through ``body``
    _body: list[BodyType]

    INDENT_WIDTH = 4

    def __init__(self, text, title=None, name=None, kind=None, cleandoc=False, end_newline=True, raw=False, indent=0):
        self._body = []
        # cleandoc only understands strings - a list of lines/blocks is used as-is
        # instead of crashing inside inspect.cleandoc
        if cleandoc and isinstance(text, str):
            text = inspect.cleandoc(text)
        lines = aslines(text, end_newline=end_newline)
        # NOTE: assigning ``self.body`` goes through the property setter below
        self.leading, self.body, self.trailing, min_indent = (
            ([], lines, [], None) if raw else separate_whitespace(lines))

        # get the title/body indent offset
        min_indent = min_indent or 0
        title_indent = len(title) - len(title.lstrip()) if title else min_indent
        self.min_indent = min(title_indent, min_indent) + (indent or 0)
        self.child_indent = min_indent - title_indent
        # store everything
        self.kind = kind
        self.title = title = title.lstrip() if title else None
        self.name = name or title

    def __repr__(self):
        '''A string representation of the block that shows the
        division of sections.
        '''
        return border(self._format_body(), f'{self.__class__.__name__}(name={self.name})')

    def __str__(self):
        '''The fully formatted string.'''
        return self._format_body()

    def __bool__(self):
        '''Checks if the block has any non-whitespace content.
        Analogous to ``bool('    '.strip())``
        '''
        return bool(
            self.title or
            any(l.strip() if isinstance(l, str) else l for l in self.body) or
            any(l.strip() for l in self.leading + self.trailing))

    def __eq__(self, other):
        '''Checks if the string representations are the same. Whitespace does count.'''
        return str(self) == str(other)

    def __iter__(self):
        '''Iterate over lines in the body. Does not include whitespace or title.'''
        return iter(self.body)

    # def format(self, mode='s'):
    #     '''Format the block. Equivalent to ``str(block)``.'''
    #     return self._format_body(mode=mode)

    def _format_body(self, body=None, mode='s'):  # type: ignore
        '''Render the title, leading whitespace, indented body and trailing whitespace.

        Arguments:
            body (str, list): Alternative body to render instead of ``self.body``.
            mode (str): ``'r'`` renders nested blocks with ``repr`` (boxed),
                anything else renders them with ``str``.
        '''
        body = self.body if body is None else aslines(body, end_newline=True)
        if mode == 'r':  # allow drawing boxes around children
            body = [repr(l) if isinstance(l, BaseBlock) else l for l in body]
        # indent the body and make sure it ends with a new line
        body = [indent(str(l), self.min_indent + self.child_indent) for l in body]
        title = [indent(self.title, self.min_indent)] if self.title else []
        # join everything together
        return ''.join(map(str, title + self.leading + body + self.trailing))

    # lets you set body with a string.
    @property
    def body(self):
        '''The body lines. Can be assigned a string or a list of lines/blocks.'''
        return self._body

    @body.setter
    def body(self, value):
        self._body = self._prepare_body_lines(aslines(value, end_newline=True))

    def _prepare_body_lines(self, lines: list[str]) -> list[BodyType]:
        '''Convert freshly assigned lines into body items. Implemented by subclasses.'''
        raise NotImplementedError

    def children(self, kind=..., name=..., include_self=False):
        '''Return children recursively matching some query.

        A block can contain other blocks, so this could be used to iterate over
        all "Example" or "Arguments" blocks, for example.

        Arguments:
            kind (str): The block kind to match - e.g. ``'args'``, ``'returns'``
            name (str): The block name to match. This is the exact name of the section,
                So if you had a custom section ``My Examples:``, you'd enter ``'my examples'``.
            include_self (bool): Also yield this block itself when it matches.

        .. note::

            The difference between ``name`` and ``kind`` is that ``kind`` has recognized
            sections with multiple variations and will normalize the name to a single string
            e.g. ``Arguments, Args => ARGS``. This makes it easier to search for all ARGS
            sections without having to check for each possible variation. ``name`` is there
            when you want to search with the actual name of the section.
        '''
        # ``...`` is the "match anything" sentinel, so it is compared by identity
        if include_self and (kind is ... or self.kind == kind.upper()) and (name is ... or (self.name or '').lower() == name.lower()):
            yield self
        for b in self.body:
            if isinstance(b, BaseBlock):
                yield from b.children(kind, name, include_self=True)

    def first(self, kind=..., name=...):
        '''Return the first child matching a query. See ``children`` for arguments.'''
        return next(iter(self.children(kind, name)), None)

    @overload
    def __getitem__(self, k: str|int) -> BodyType: ...
    @overload
    def __getitem__(self, k: slice) -> list[BodyType]: ...

    def __getitem__(self, k: int|slice|str) -> BodyType|list[BodyType]:
        '''Return a direct child matching a name or index. Basically, you can
        index like a list or a dict.

        Raises:
            KeyError: if no direct child has a (case-insensitively) matching name.
            IndexError: if an integer index is out of range.
        '''
        if isinstance(k, (int, slice)):
            return self.body[k]
        for b in self.body:
            if nocaseeq(getattr(b, 'name', None), k):
                return b
        raise KeyError(k)

    def __delitem__(self, k: int|slice|str):
        '''Delete a line from the Block, by index/slice or by child name.

        Raises:
            KeyError: if no direct child has a (case-insensitively) matching name.
        '''
        if isinstance(k, (int, slice)):
            del self.body[k]
        else:
            for i, b in enumerate(self.body):
                if nocaseeq(getattr(b, 'name', None), k):
                    del self.body[i]
                    return
            raise KeyError(k)

    def get(self, k: str|int, default=None):
        '''Like ``block[k]`` but returns ``default`` when the child does not exist.'''
        try:
            return self[k]
        except (KeyError, IndexError):
            return default

    # basic manipulation

    def prepend(self, *x: BodyType):
        '''Prepend lines to the body.'''
        self.body[0:0] = x
        return self

    def append(self, *x: BodyType):
        '''Append lines to the body.'''
        self.body.extend(x)
        return self

    def indent(self, n: int=1, width: int|None=None):
        '''Indent the entire block by ``n`` levels of ``width`` spaces
        (``INDENT_WIDTH`` by default). The indentation never goes below zero.
        '''
        width = self.INDENT_WIDTH if width is None else width
        self.min_indent = max(self.min_indent + n * width, 0)
        return self

    def dedent(self, n: int=1, width: int|None=None):
        '''Dedent the entire block.'''
        return self.indent(-n, width)

    def set_indent(self, indent: int):
        '''Set the absolute indentation of the block. ``None`` leaves it untouched.'''
        if indent is not None:
            self.min_indent = indent
        return self

    # def strip(self): # XXX: THIS SHOULDNT HAPPEN INPLACE !!
    #     '''Strip whitespace from the block.'''
    #     self.leading, self.trailing = [], []
    #     return self

    def section_partition(self, pattern: str, get_kind=None, break_unnamed_sections=None):
        '''Re-split the rendered body into sections using a header regex and
        replace the body with the resulting blocks.
        '''
        self.body = _section_partition(''.join(map(str, self.body)), pattern, get_kind, break_unnamed_sections)
        return self




class Block(BaseBlock[Union[str, BaseBlock[str]]]):
    '''A generic block whose body holds plain text lines and/or nested blocks.'''

    def _prepare_body_lines(self, lines: list[str]) -> list[str]:
        '''Body lines need no conversion here - they are stored as they are given.'''
        return lines
class Param(Block):
    '''Represents the text belonging to a single parameter.

    Most of this code is just to facilitate the parsing, changing, and
    reformatting of the parameter data. Subclasses provide ``pattern`` (a regex
    whose named groups become the parameter fields) and ``format`` (which
    renders the fields back to text).
    '''
    # names of the tracked fields; taken from the pattern's named groups when left empty
    _keys = ()
    _format = None
    # regex that extracts the fields from the parameter text (compiled per instance)
    pattern: re.Pattern|str = ''
    # class-level defaults for fields the text did not provide
    name = dtype = desc = None
    # True when a tracked field was modified and the text has to be re-rendered
    changed = False

    def __init__(self, text, can_be_unnamed=False, block_kind=None, **kw):
        '''Parse a single parameter.

        Arguments:
            text (str): The parameter text.
            can_be_unnamed (bool): Stored on the instance and forwarded by ``replace``.
            block_kind (str): The kind of the section this parameter belongs to.
            **kw: Forwarded to ``Block.__init__``.
        '''
        # the storage must exist first: __setattr__ writes tracked fields into it
        # as soon as the base class assigns e.g. ``self.name``
        self.__data = {}
        self.pattern = p = re.compile(self.pattern)
        # field names in the order their groups appear in the pattern
        self._keys = self._keys or [k for k, i in sorted(p.groupindex.items(), key=lambda x: x[1])]
        self._can_be_unnamed = can_be_unnamed
        self.block_kind = block_kind
        super().__init__(text + '\n' if not text.endswith('\n') else text, kind='PARAM', **kw)
        # parse out the data
        m = re.match(p, ''.join(map(str, self.body)) + '\n')
        self.__data = self.prepare(**(m.groupdict() if m else {}))
        # write straight into __dict__ so that parsing does not count as a change
        self.__dict__.update(self.__data)
        self.changed = False

    def __setattr__(self, k, v):
        '''Set an attribute, remembering modifications of tracked fields.'''
        if k in self._keys:  # track changes
            self.__data[k] = v
            self.changed = True
        object.__setattr__(self, k, v)

    def get(self, k, default=None):
        '''Return the attribute/field ``k`` or ``default`` when it does not exist.

        Accepts ``default`` so the signature stays compatible with ``BaseBlock.get``.
        '''
        return getattr(self, k, default)

    def update(self, **kw):
        '''Update multiple fields at once.

        Raises:
            TypeError: if a keyword is not one of the tracked fields.
        '''
        unknown = set(kw) - set(self._keys)
        if unknown:
            raise TypeError(unknown)
        self.__data.update(kw)
        self.__dict__.update(kw, changed=True)

    def _format_body(self, body=None, *a, **kw):
        '''Render the parameter, re-rendering its text first if a field changed.'''
        if self.changed and body is None:
            # the param text was changed - recompile the text
            self.body = self.format(**self.__data).splitlines(keepends=True)
            self.changed = False
        return super()._format_body(body, *a, **kw)

    def format(self, **kw):
        '''Render the fields as parameter text. Implemented by subclasses.'''
        raise NotImplementedError

    def replace(self, **kw):
        '''Return a new parameter with some fields replaced. ``self`` is not modified.'''
        return self.__class__(
            self.format(**dict(self.__data, **kw)),
            can_be_unnamed=self._can_be_unnamed,
            block_kind=self.block_kind)

    @classmethod
    def new(cls, *a, **kw):
        '''Create a parameter from field values (the arguments of ``format``).

        ``format`` is called with the class standing in for ``self``, so
        ``format`` implementations must not rely on instance state.
        '''
        return cls(cls.format(cls, *a, **kw))

    def prepare(self, **kw):
        '''Post-process the fields extracted by ``pattern``. Returns them unchanged here.'''
        return kw
class Docstring(BaseBlock[Block]):
    '''This represents an entire docstring with each section stored as a
    ``Block`` in the body.

    Arguments:
        doc (str, callable): The docstring text, or an object to read
            ``__doc__`` (and the default name) from.
        name (str): A name for the docstring.
        cleandoc (bool): Clean the indentation with ``inspect.cleandoc`` first.
    '''
    # recognized section titles, grouped by their normalized kind
    HEADER_GROUPS = dict(
        ARGS=["Arguments", "Args", "Parameters", "Params"],
        EXCEPT=["Raises", "Exceptions", "Except"],
        ATTRS=["Attributes"],
        EXAMPLE=["Example", "Examples"],
        RETURN=["Returns"],
        YIELD=["Yields"])
    # section kinds whose body is split into parameters
    SUPPORTS_PARAMS = ('ARGS', 'ATTRS')
    SUPPORTS_UNNAMED_PARAMS = ('RETURN', 'YIELD', 'EXCEPT')
    # name/kind given to the first block when it has none
    FIRST_BLOCK_DEFAULT_NAME = 'Description'
    FIRST_BLOCK_DEFAULT_KIND = 'DESC'
    # regex template for a section header; ``{}`` receives the title alternatives
    header_format = r'^({}):? *\n'

    class Param(Param):
        pass

    def __init__(self, doc=None, name=None, cleandoc=True):
        if callable(doc):
            # not every callable has a __name__ (e.g. functools.partial objects)
            name = name or getattr(doc, '__name__', None)
            doc = getattr(doc, '__doc__', None)
        self.doc = doc
        self.name = name
        super().__init__(doc or '', name=name, cleandoc=cleandoc, raw=True)
        self.parse()

    def __repr__(self):
        '''A string representation that draws a box around every section.'''
        return border(self._format_body(mode='r'), f'{self.__class__.__name__}(name={self.name})')

    def parse(self):
        '''(Re)split the current text into sections and parse their parameters.'''
        # map every known title to its normalized kind
        kinds = {v: k for k, vs in self.HEADER_GROUPS.items() for v in vs}
        self.body = body = _section_partition(
            ''.join(map(str, self.body)),
            self.header_format.format("|".join(kinds)),
            kinds.get, self._break_unnamed_sections)
        for i, b in enumerate(body):
            self.handle_section(b, i)
        return self

    def _break_unnamed_sections(self, body):
        '''Split trailing free text off a section body. No splitting by default.'''
        return [body]

    def handle_section(self, block: Block, i: int):
        '''Post-process section number ``i``: name the first block and turn the
        body of parameter sections into ``Param`` objects.
        '''
        if not i:
            block.name = block.name or self.FIRST_BLOCK_DEFAULT_NAME
            block.kind = block.kind or self.FIRST_BLOCK_DEFAULT_KIND
        if block.kind in self.SUPPORTS_UNNAMED_PARAMS:
            block.body = [
                self.Param(l, can_be_unnamed=True, block_kind=block.kind)
                for l in _group_indents(block.body)]
        elif block.kind in self.SUPPORTS_PARAMS:
            block.body = [
                self.Param(l, block_kind=block.kind)
                for l in _group_indents(block.body)]
        return block

    def children(self, kind=..., name=..., include_self=False):
        '''Yield all (nested) blocks matching the query. See ``BaseBlock.children``.

        The signature mirrors the base class so that passing ``include_self``
        works instead of raising a ``TypeError``.
        '''
        yield from super().children(kind, name, include_self=include_self)

    def first(self, *a, **kw):
        '''Return the first block matching the query, or ``None``.'''
        return next(iter(self.children(*a, **kw)), None)

    def _prepare_body_lines(self, lines: list[str]) -> list[Block]:
        '''Wrap everything that is not already a ``Block`` into one.'''
        return [Block(l) if not isinstance(l, Block) else l for l in lines]
class Google(Docstring):
    '''Google docstring parser.

    .. code-block:: python

        ds = Google("""This is my docstring description.

        Args:
            a (str): this is a
            b (str): this is b
                yes its b
                yes b

        I'm some more docs

        Returns:
            dict: the special thing
            list: other

        alksdfj

        .. code-block:: python

            print("this is something")
            print("hmm nice")

        ksnkdsjkdsksd ljksdfkjl asjkldf

        .. code-block:: python

            print("neat!")
        """)
    '''
    header_format = r'^({}):? *\n'

    def _break_unnamed_sections(self, body):
        '''Split off the free text found between two sections: everything from
        the first line that is indented less than the section's first line.
        '''
        return _break_minimum_indent(body)

    class Param(Param):
        '''The Google parameter format.

        .. code-block::

            {name} ({type}): {description}
        '''
        pattern = r'^ *(?P<name>\w*) *(?:\((?P<dtype>[^)]+)\))?: *(?P<desc>(?:\n|.)*)'

        def format(self, name=None, dtype=None, desc=None):
            '''Render ``name (dtype): desc``; continuation lines of ``desc`` are indented.'''
            if name and dtype:
                label = f'{name} ({dtype})'
            else:
                # only one of the two is available (or neither)
                label = name or dtype
            # indent every line, then pull the first one back up next to the label
            description = indent(desc).lstrip(' ')
            return f'{label}: {description}'

        def prepare(self, name=None, dtype=None, desc=None):
            '''Normalize the parsed fields: the description is dedented and stripped.'''
            cleaned = inspect.cleandoc(desc or '')
            return {'name': name, 'dtype': dtype, 'desc': cleaned}
class Numpy(Docstring):
    '''Numpy docstring parser.

    .. code-block:: python

        ds = Numpy("""Gets and prints the spreadsheet's header columns

        Parameters
        ----------
        file_loc : str
            The file location of the spreadsheet
        print_cols : bool, optional
            A flag used to print the columns to the console (default is False)

        asjdfklasjdfkl

        Returns
        -------
        list
            a list of strings representing the header columns
        """)
    '''
    # a title line followed by an underline made of ``-`` or ``=``
    header_format = r'^({}) *\n[-=]+ *\n'

    def _break_unnamed_sections(self, body):
        '''Split off free text that follows a section after a blank line.'''
        return _break_full_newline(body)

    class Param(Param):
        '''The Numpy parameter format.

        .. code-block::

            {name} : {type}
                {description}
        '''
        pattern = r'^ *(?P<name>\w+) *(?:: *(?P<dtype>[^\n]+))? *\n(?P<desc>(?: +.*\n)*)'

        def format(self, name=None, dtype=None, desc=None):
            '''Render ``name : dtype`` followed by the indented description.'''
            # a missing description renders as nothing instead of the text "None"
            return '{}\n{}'.format(' : '.join(filter(None, [name, dtype])), indent(desc or ''))

        def prepare(self, name=None, dtype=None, desc=None):
            '''Normalize the parsed fields.

            The pattern captures the description together with its indentation.
            ``format`` indents the description again, so it is dedented here -
            otherwise every re-render after a change would double the indentation.
            '''
            return dict(name=name, dtype=dtype, desc=inspect.cleandoc(desc or ''))
# block utils

def _break_sections(doc, pattern):
    '''Break text and body pairs for doc sections with headers.

    Yields ``(title, body)`` tuples. The first pair always has an empty title
    and holds the text found before the first header.
    '''
    # find all matches
    matches = list(re.finditer(pattern, doc, flags=re.M))
    yield '', doc[:matches[0].start() if matches else None]
    for m1, m2 in zip(matches, matches[1:] + [None]):  # type: ignore
        # select the title of the heading
        title = doc[m1.start():m1.end()]
        # the body runs until the next header (or the end of the text)
        body = doc[m1.end():m2 and m2.start()]
        yield title, body


def _section_partition(doc, pattern, get_kind=None, break_unnamed_sections=None):
    '''Split ``doc`` into a list of ``Block`` objects, one per section.

    Arguments:
        doc (str): The text to split.
        pattern (str): Header regex. Group 1 must capture the section name.
        get_kind (callable): Maps a section name to its normalized kind.
        break_unnamed_sections (callable): Splits a section body into
            ``[body, *others]``; every extra part becomes its own untitled block.
    '''
    sections = []
    for title, body in _break_sections(doc, pattern):
        # pull info from title
        match = re.match(pattern, title) if title else None
        name = match.group(1) if match else None
        kind = get_kind(name) if get_kind else None
        body, *others = break_unnamed_sections(body) if break_unnamed_sections else [body]
        sections.append(Block(body, title, name, kind=kind))
        sections.extend(Block(other) for other in others)
    return sections


def _break_minimum_indent(section):
    '''Break indent when the indentation drops below the indentation of the first line.

    Example

    .. code-block:: rst

            arg1 (int): laksdfjklaj
        --- break here ---
        This is some discussion

    Returns a list with one or two lists of lines.
    '''
    lines = section.splitlines(keepends=True)
    # indentation of the first non-blank line
    first_indent = next((
        len(l) - len(l.lstrip()) for l in lines if l.strip()), None)
    if first_indent is None:
        return [lines]
    # first non-blank line that is indented less than that
    i_break = next((
        i for i, l in enumerate(lines)
        if l.strip() and len(l) - len(l.lstrip()) < first_indent
    ), None)
    if i_break is None or i_break >= len(lines):
        return [lines]
    return [lines[:i_break], lines[i_break:]]


def _ibreak_full_newline(section):
    '''Break after an entirely blank line. Yields lists of lines.'''
    lines = section.splitlines(keepends=True)
    i_blank = -1
    offset = 0
    for i, l in enumerate(lines):
        # remember the first blank line seen since the current chunk started
        i_blank = i if i_blank < offset and not l.strip() else i_blank
        # the first non-blank line after that blank line starts a new chunk
        # NOTE(review): a blank line at the chunk's very first index
        # (i_blank == offset) never triggers a break - confirm this is intended
        if i_blank > offset and l.strip():
            yield lines[offset:i]
            offset = i
    if not offset or offset < len(lines):
        yield lines[offset:]


def _break_full_newline(section):
    '''List version of ``_ibreak_full_newline``.'''
    return list(_ibreak_full_newline(section))


def _group_indents(lines):
    '''Break into indentation groups (one for every top level group). e.g. google arguments.

    Example

    .. code-block:: rst

        arg1 (int): laksdfjklaj
            jasflkjdslkf
        --- break here ---
        arg2 (int): lkasdfjlaksdf
        --- break here ---
        arg3 (int): laksfdkjla

    Returns a list of strings, one per group. Lines found before the first
    top level line (e.g. blank lines) form a group of their own.
    '''
    indents = [len(l) - len(l.lstrip()) for l in lines]
    min_indent = min((i for i, l in zip(indents, lines) if l.strip()), default=0)
    # indices of the non-blank lines sitting at the minimum indentation
    tops = [
        i for i, (idt, l) in enumerate(zip(indents, lines))
        if l.strip() and min_indent >= idt]
    groups = []
    # pairing ``[0] + tops`` with ``tops + [None]`` already covers the lines
    # before the first top level line, so they must not be added separately
    # (doing so emitted them twice)
    for i, j in zip([0] + tops, tops + [None]):  # type: ignore
        if i != j:
            groups.append(lines[i:j])
    return [''.join(ls) for ls in groups]


# text utils

def indent(text, n=4):
    '''Indent a block of text'''
    return ''.join([' '*n + l for l in str(text).splitlines(keepends=True)])


def comment(text, ch='#'):
    '''Add a prefix to each line of a text block. Like commenting out code.'''
    return ''.join(f'{ch} {l}' for l in str(text).splitlines(keepends=True))


def border(text, title=None, top_ch='-', side_ch='|', n_top=3):
    '''Adds a border to the left and top sides, with an optional title.'''
    return top_ch * n_top + (f' {title} {top_ch}' if title else '') + '\n' + comment(text, side_ch)


def nocaseeq(a, b):
    '''Compare two values case-insensitively.'''
    return (a.lower() if isinstance(a, str) else a) == (b.lower() if isinstance(b, str) else b)


def aslines(text, end_newline=True) -> list[str]:
    '''Convert a block of text to lines, preserving new lines.

    Arguments:
        text (str, list, None): A string to split, or a sequence of lines/blocks.
        end_newline (bool): Make sure the last line ends with a newline
            (only applies when the last item is a string).

    Returns:
        list: always a new list - a list passed in is never modified.
    '''
    # copy sequences so that terminating the last line does not leak to the caller
    lines = text.splitlines(keepends=True) if isinstance(text, str) else list(text or [])
    if end_newline and lines and isinstance(lines[-1], str) and not lines[-1].endswith('\n'):
        lines[-1] = lines[-1] + '\n'
    return lines  # type: ignore
def separate_whitespace(lines):
    '''Separate whitespace above, below, and common indentation for a block of text.

    Returns ``(leading, body, trailing, min_indent)``: the blank lines before the
    content, the content with the common indentation removed, the blank lines
    after the content and the width of the removed indentation (``None`` when
    there is no content). Nested ``Block`` items always count as content.
    '''
    # classify every line first
    block_flags = [isinstance(line, Block) for line in lines]
    content_flags = [
        flagged or bool(line.strip())
        for flagged, line in zip(block_flags, lines)]

    # break off the leading and trailing blank lines
    filled = [idx for idx, has_text in enumerate(content_flags) if has_text]
    if filled:
        start, stop = filled[0], filled[-1] + 1
    else:
        start = stop = 0
    leading, body, trailing = lines[:start], lines[start:stop], lines[stop:]
    block_flags = block_flags[start:stop]
    content_flags = content_flags[start:stop]

    # find the indentation shared by everything that has content
    widths = []
    for line, nested, has_text in zip(body, block_flags, content_flags):
        if not has_text:
            continue
        widths.append(line.min_indent if nested else len(line) - len(line.lstrip()))
    min_indent = min(widths, default=None)

    # and take it off: blocks are told their indent, text lines are sliced
    dedented = []
    for line, nested, has_text in zip(body, block_flags, content_flags):
        if nested:
            dedented.append(line.set_indent(min_indent))
        elif has_text:
            dedented.append(line[min_indent:])
        else:
            dedented.append(line)
    return leading, dedented, trailing, min_indent


# the available docstring styles, by name
STYLES = {
    'google': Google,
    'numpy': Numpy,
}


def parse(func, style: str|None=None, **kw):
    '''Parse a docstring (or the docstring of ``func``).

    Arguments:
        func: Passed to the docstring class as its first argument.
        style (str, list): A style name or a sequence of style names. Only the
            first candidate is used. Defaults to the registered styles.
        **kw: Forwarded to the docstring class.

    Raises:
        KeyError: if the selected style is not registered in ``STYLES``.
    '''
    if isinstance(style, str):
        candidates = [style]
    else:
        candidates = style or list(STYLES)
    nothing = object()
    chosen = next(iter(candidates), nothing)
    if chosen is nothing:
        return None
    return STYLES[chosen](func, **kw).parse()


if __name__ == '__main__':
    # --- Google style: parse, add parameters, print ---
    ds = Google('''This is my docstring description.

    Args:
        a (str): this is a
        b (str): this is b
            yes its b
            yes b

    I'm some more docs

    Returns:
        dict: the special thing
        list: other

    alksdfj

    .. code-block:: python

        print("this is something")
        print("hmm nice")

    ksnkdsjkdsksd ljksdfkjl asjkldf

    .. code-block:: python

        print("neat!")
    ''').parse()

    google_args = ds['args']
    nested = Block([
        'nested parameters that go to somewhere else\n',
        ds.Param.new('a', 'int', 'aaa'),
        ds.Param.new('b', 'int', 'aaa'),
    ])
    google_args.append(
        ds.Param.new('somethingggg', None, 'kjlsadjkfdsajkl\ndfgad\nasdf'),
        ds.Param.new(None, 'dict', '819723789'),
        ds.Param.new('nested_kw', 'dict', nested),
    )
    print(repr(ds))
    print(ds)

    # --- merge the arguments of a second docstring and rename a parameter ---
    ds2 = Google('''This is my docstring description.

    Args:
        x (str): this is x
        y: im y yes hi

    alksdfj
    ''').parse()
    google_args.append(*ds2['args'].body)
    google_args['somethingggg'].name = 'bsak'
    print(repr(ds))
    print(ds)

    # --- Numpy style: parse, add parameters, edit fields, print ---
    ds = Numpy('''Gets and prints the spreadsheet's header columns

    Parameters
    ----------
    file_loc : str
        The file location of the spreadsheet
    print_cols : bool, optional
        A flag used to print the columns to the console (default is False)

    asjdfklasjdfkl

    Returns
    -------
    list
        a list of strings representing the header columns
    ''').parse()

    numpy_params = ds['Parameters']
    numpy_params.append(
        ds.Param.new('somethingggg', None, 'kjlsadjkfdsajkl\n dfgad\n asdf'),
        ds.Param.new(None, 'dict', '819723789'),
    )
    print(repr(ds))

    numpy_params['somethingggg'].name = 'bsak'
    p = numpy_params['dict']
    p.name = 'bloop'
    p.desc = 'hello\nasdf'
    print(repr(p))
    print(ds)