"""because list is complex, split list parser in a new file""" import re from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Match from .util import expand_leading_tab, expand_tab, strip_end if TYPE_CHECKING: from .block_parser import BlockParser from .core import BlockState LIST_PATTERN = ( r"^(?P {0,3})" r"(?P[\*\+-]|\d{1,9}[.)])" r"(?P[ \t]*|[ \t].+)$" ) _LINE_HAS_TEXT = re.compile(r"(\s*)\S") def parse_list(block: "BlockParser", m: Match[str], state: "BlockState") -> int: """Parse tokens for ordered and unordered list.""" text = m.group("list_3") if not text.strip(): # Example 285 # an empty list item cannot interrupt a paragraph end_pos = state.append_paragraph() if end_pos: return end_pos marker = m.group("list_2") ordered = len(marker) > 1 depth = state.depth() token: Dict[str, Any] = { "type": "list", "children": [], "tight": True, "bullet": marker[-1], "attrs": { "depth": depth, "ordered": ordered, }, } if ordered: start = int(marker[:-1]) if start != 1: # Example 304 # we allow only lists starting with 1 to interrupt paragraphs end_pos = state.append_paragraph() if end_pos: return end_pos token["attrs"]["start"] = start state.cursor = m.end() + 1 groups: Optional[Tuple[str, str, str]] = (m.group("list_1"), marker, text) if depth >= block.max_nested_level - 1: rules = list(block.list_rules) rules.remove("list") else: rules = block.list_rules bullet = _get_list_bullet(marker[-1]) while groups: groups = _parse_list_item(block, bullet, groups, token, state, rules) end_pos = token.pop("_end_pos", None) _transform_tight_list(token) if end_pos: index = token.pop("_tok_index") state.tokens.insert(index, token) return end_pos state.append_token(token) return state.cursor def _transform_tight_list(token: Dict[str, Any]) -> None: if token["tight"]: # reset tight list item for list_item in token["children"]: for tok in list_item["children"]: if tok["type"] == "paragraph": tok["type"] = "block_text" elif tok["type"] == "list": _transform_tight_list(tok) def _parse_list_item( block: "BlockParser", bullet: str, groups: Tuple[str, str, str], token: Dict[str, Any], state: "BlockState", rules: List[str], ) -> Optional[Tuple[str, str, str]]: spaces, marker, text = groups leading_width = len(spaces) + len(marker) text, continue_width = _compile_continue_width(text, leading_width) item_pattern = _compile_list_item_pattern(bullet, leading_width) list_item_breaks = [ "thematic_break", "fenced_code", "atx_heading", "block_quote", "block_html", "list", ] if "fenced_directive" in block.specification: list_item_breaks.insert(1, "fenced_directive") pairs = [(name, block.specification[name]) for name in list_item_breaks] if leading_width < 3: _repl_w = str(leading_width) pairs = [(n, p.replace("3", _repl_w, 1)) for n, p in pairs] pairs.insert(1, ("list_item", item_pattern)) regex = "|".join(r"(?P<%s>(?<=\n)%s)" % pair for pair in pairs) sc = re.compile(regex, re.M) src = "" next_group = None prev_blank_line = False pos = state.cursor continue_space = " " * continue_width while pos < state.cursor_max: pos = state.find_line_end() line = state.get_text(pos) if block.BLANK_LINE.match(line): src += "\n" prev_blank_line = True state.cursor = pos continue line = expand_leading_tab(line) if line.startswith(continue_space): if prev_blank_line and not text and not src.strip(): # Example 280 # A list item can begin with at most one blank line break src += line prev_blank_line = False state.cursor = pos continue m = sc.match(state.src, state.cursor) if m: tok_type = m.lastgroup if tok_type == "list_item": if prev_blank_line: token["tight"] = False next_group = (m.group("listitem_1"), m.group("listitem_2"), m.group("listitem_3")) state.cursor = m.end() + 1 break if tok_type == "list": break tok_index = len(state.tokens) end_pos = block.parse_method(m, state) if end_pos: token["_tok_index"] = tok_index token["_end_pos"] = end_pos break if prev_blank_line and not line.startswith(continue_space): # not a continue line, and previous line is blank break src += line state.cursor = pos text += _clean_list_item_text(src, continue_width) child = state.child_state(strip_end(text)) block.parse(child, rules) if token["tight"] and _is_loose_list(child.tokens): token["tight"] = False token["children"].append( { "type": "list_item", "children": child.tokens, } ) if next_group: return next_group return None def _get_list_bullet(c: str) -> str: if c == ".": bullet = r"\d{0,9}\." elif c == ")": bullet = r"\d{0,9}\)" elif c == "*": bullet = r"\*" elif c == "+": bullet = r"\+" else: bullet = "-" return bullet def _compile_list_item_pattern(bullet: str, leading_width: int) -> str: if leading_width > 3: leading_width = 3 return ( r"^(?P {0," + str(leading_width) + "})" r"(?P" + bullet + ")" r"(?P[ \t]*|[ \t][^\n]+)$" ) def _compile_continue_width(text: str, leading_width: int) -> Tuple[str, int]: text = expand_leading_tab(text, 3) text = expand_tab(text) m2 = _LINE_HAS_TEXT.match(text) if m2: # indent code, startswith 5 spaces if text.startswith(" "): space_width = 1 else: space_width = len(m2.group(1)) text = text[space_width:] + "\n" else: space_width = 1 text = "" continue_width = leading_width + space_width return text, continue_width def _clean_list_item_text(src: str, continue_width: int) -> str: # according to Example 7, tab should be treated as 3 spaces rv = [] trim_space = " " * continue_width lines = src.split("\n") for line in lines: if line.startswith(trim_space): line = line.replace(trim_space, "", 1) # according to CommonMark Example 5 # tab should be treated as 4 spaces line = expand_tab(line) rv.append(line) else: rv.append(line) return "\n".join(rv) def _is_loose_list(tokens: Iterable[Dict[str, Any]]) -> bool: paragraph_count = 0 for tok in tokens: if tok["type"] == "blank_line": return True if tok["type"] == "paragraph": paragraph_count += 1 if paragraph_count > 1: return True return False