Source code for markdown_environments.thms

import re
import xml.etree.ElementTree as etree

from bs4 import BeautifulSoup
from markdown.extensions import Extension
from markdown.postprocessors import Postprocessor
from markdown.treeprocessors import Treeprocessor

from . import utils


# the only reason this is a `Treeprocessor` and not a `Preprocessor`, `InlineProcessor`, or `Postprocessor`, all of
# which make more sense, is because we need this to run after `thms` (`BlockProcessor`) and before the TOC extension
# (`Treeprocessor` with low priority): `thms` generates `counter` syntax, while TOC will duplicate unparsed
# `counter` syntax from headings into the TOC and cause `counter` later to increment twice as much
class ThmCounterProcessor(Treeprocessor):

    PATTERN = re.compile(r"{{([0-9,]+)}}", flags=re.MULTILINE)

    def __init__(self, *args, add_html_elem: bool, html_id_prefix: str, html_class: str, **kwargs):
        super().__init__(*args, **kwargs)
        self.add_html_elem = add_html_elem
        self.html_id_prefix = html_id_prefix
        self.html_class = html_class
        self.counter = []

    def run(self, root):
        for child in root.iter():
            text = child.text
            if text is None:
                continue
            new_text = ""
            prev_match_end = 0
            for m in self.PATTERN.finditer(text):
                input_counter = m.group(1)
                parsed_counter = input_counter.split(",")
                # make sure we have enough room to parse counter into `self.counter`
                while len(parsed_counter) > len(self.counter):
                    self.counter.append(0)

                # parse counter
                for i, parsed_item in enumerate(parsed_counter):
                    try:
                        parsed_item = int(parsed_item)
                    except:
                        return False
                    self.counter[i] += parsed_item
                    # if changing current counter segment, reset all child segments back to 0
                    if parsed_item != 0 and len(parsed_counter) >= i + 1:
                        self.counter[i+1:] = [0] * (len(self.counter) - (i+1))

                # only output as many counter segments as were inputted
                output_counter = list(map(str, self.counter[:len(parsed_counter)]))
                output_counter_text = ".".join(output_counter)
                if self.add_html_elem:
                    elem = etree.Element("span")
                    elem.set("id", self.html_id_prefix + '-'.join(output_counter))
                    if self.html_class != "":
                        elem.set("class", self.html_class)
                    elem.text = output_counter_text
                    output_counter_text = etree.tostring(elem, encoding="unicode")

                # put changes into final output text
                new_text += text[prev_match_end:m.start()] + output_counter_text
                prev_match_end = m.end()
            new_text += text[prev_match_end:] # fill in remaining text after last regex match
            child.text = new_text


# `Postprocessor` instead of `Treeprocessor` to avoid placeholders for Markdown syntax in thm heading
class ThmHeadingProcessor(Postprocessor):

    PATTERN = re.compile(r"{\[(.+?)\]}(?:\[(.+?)\])?(?:{(.+?)})?\n", flags=re.MULTILINE)
    FORMAT_FOR_HTML_HYPHEN_PATTERN = re.compile(r"[ \./\u2013\u2014]", flags=re.MULTILINE)
    FORMAT_FOR_HTML_REMOVE_PATTERN = re.compile(r"[^A-Za-z0-9-]", flags=re.MULTILINE)

    def __init__(self, *args, html_id_prefix: str, html_class: str, emph_html_class: str, **kwargs):
        super().__init__(*args, **kwargs)
        self.html_id_prefix = html_id_prefix
        self.html_class = html_class
        self.emph_html_class = emph_html_class

    def run(self, text):
        def format_for_html(s: str) -> str:
            soup = BeautifulSoup(s, "html.parser") # remove any HTML tags
            s = soup.get_text()
            s = s.lower()
            s = self.FORMAT_FOR_HTML_HYPHEN_PATTERN.sub("-", s[:-1]) + s[-1] # don't have trailing hyphens since ugly
            s = self.FORMAT_FOR_HTML_REMOVE_PATTERN.sub("", s)
            return s

        new_text = ""
        prev_match_end = 0
        for m in self.PATTERN.finditer(text):
            thm_type = m.group(1)
            thm_name = m.group(2)
            thm_hidden_name = m.group(3)
            thm_punct = "."

            # create theorem heading element
            elem = etree.Element("span")
            if self.html_class != "":
                elem.set("class", self.html_class)
            # fill in theorem type + counter, and apply `emph` styling to this
            emph_elem = etree.SubElement(elem, "span")
            if self.emph_html_class != "":
                emph_elem.set("class", self.emph_html_class)
            emph_elem.text = f"{thm_type}"
            # fill in theorem name and hidden name
            if thm_name is not None:
                emph_elem.tail = f" ({thm_name})"
                elem.set("id", self.html_id_prefix + format_for_html(thm_name))
            elif thm_hidden_name is not None:
                elem.set("id", self.html_id_prefix + format_for_html(thm_hidden_name))
            # generate theorem punct HTML, applying `emph` styling to it as well (even if separated from
            # main `emph` section of thm type + counter by theorem name; this is default LaTeX behavior)
            thm_punct_elem = etree.SubElement(elem, "span")
            if self.emph_html_class != "":
                thm_punct_elem.set("class", self.emph_html_class)
            thm_punct_elem.text = thm_punct

            # convert all this to HTML and insert into final output, replacing the original match
            # unescape HTML that `tostring()` escapes to allow HTML and previously-rendered Markdown in thm heading
            new_text += text[prev_match_end:m.start()] \
                    + etree.tostring(elem, encoding="unicode").replace("&lt;", "<").replace("&gt;", ">")
            prev_match_end = m.end()
        new_text += text[prev_match_end:] # fill in remaining text after last regex match
        return new_text



[docs]
class ThmsExtension(Extension):
    r"""
    A wrapper around divs and dropdowns that provides more options to mimic the theorem capabilities of LaTeX.

    In particular, this extension introduces theorem headings and theorem counters, which are used in theorem
    environments but can also be used standalone as described below.

    Theorem headings:
        The terminology I use for the parts of a theorem heading throughout the documentation is as follows:
            
           .. code-block:: text

               Lemma 2.1.3 (Euler's theorem).
                 ^     ^           ^        ^
                thm   thm         thm      thm
               type counter       name    punct

        Markdown usage:
            .. code-block:: md

                {[<thm type><thm counter>]}[<optional thm name>]{<optional hidden thm name>}

            becomes…

            .. code-block:: html

                <span id="[thm name/hidden thm name]" class="[thm_heading_config's html_class]">
                  <span class="[thm_heading_config's emph_html_class]">[thm type][thm counter]</span>
                  [thm name]<span class="[thm_heading_config's emph_html_class]">.</span>
                </span>

        Note:
            `<optional hidden thm name>` is only used for the HTML `id`, and it is ignored if `<optional thm name>` is
            provided.

    Theorem counters:
        Theorem counters are specified as a (positive) offset from the previous theorem counter, similar to how
        `\\newtheorem` in LaTeX lets you define the counter (but hopefully in a slightly less janky way). Offsets are
        specified per segment, and incrementing a segment resets all following segments to 0. In addition, each counter
        will display only as many segments as provided in its Markdown.

        Usage:
            .. code-block:: md

                Section {{1}}
                Subsection {{0,1,0,0,0,0,0}} (displays as many segments as given)
                Lemma {{0,0,0,1}}
                Theorem {{0,0,1}} (the fourth counter segment is reset here). Let x be a lorem ipsum.
                Reevaluating Life Choices {{0,0,0,3}}
                What even is this {{1,2,0,3,9}} (first counter segment resets next ones, and so on)

            becomes…

            .. code-block:: html

                <p>Section 1</p>
                <p>Subsection 1.1.0.0.0.0.0 (displays as many segments as given)</p>
                <p>Lemma 1.1.0.1</p>
                <p>Theorem 1.1.1 (the fourth counter segment is reset here). Let x be a lorem ipsum.</p>
                <p>Reevaluating Life Choices 1.1.1.3</p>
                <p>What even is this 2.2.0.3.9 (first counter segment resets next ones, and so on)</p>

    Important:
        There cannot be spaces within the Markdown `{{}}` syntax for theorem counters.

    Usage:
        .. code-block:: py

            import markdown
            from markdown_environments import ThmsExtension

            input_text = ...
            output_text = markdown.markdown(input_text, extensions=[
                ThmsExtension(
                    div_config={
                        "types": {
                            "thm": {
                                "thm_type": "Theorem",
                                "html_class": "md-thm",
                                "thm_counter_incr": "0,0,1"
                            }
                        },
                        "html_class": "md-div"
                    },
                    dropdown_config={
                        "types": {
                            "exer": {
                                "thm_type": "Exercise",
                                "html_class": "md-exer",
                                "thm_counter_incr": "0,0,1"
                            }
                        },
                        "html_class": "md-dropdown",
                        "content_html_class": "md-dropdown__content"
                    },
                    thm_counter_config={
                        "add_html_elem": True,
                        "html_id_prefix": "spanish-inquisition"
                    },
                    thm_heading_config={
                        "html_class": "md-thm-heading"
                    }
                )
            ])

    Markdown usage (div-based):
        .. code-block:: md

            \begin{<type>}[<optional thm name>]{<optional hidden thm name>}
            <content>
            \end{<type>}

        becomes, with theorem heading and counter syntax…

        .. code-block:: md

            \begin{<type>}
            {[<type's thm type> {{<type's thm_counter_incr>}}]}[<thm name>]{<hidden thm name>}
            <content>
            \end{<type>}

        becomes…

        .. code-block:: html

            <div class="[html_class] [type's html_class]">
              <span id="[thm name/hidden thm name]" class="[thm_heading_config's html_class]">
                <span class="[thm_heading_config's emph_html_class]">[thm type][thm counter]</span>
                [thm name]<span class="[thm_heading_config's emph_html_class]">.</span>
              </span>
              [content]
            </div>

    Markdown usage (dropdown-based):
        .. code-block:: md

            \begin{<type>}[<optional thm name>]{<optional hidden thm name>}
            
            \begin{summary}
            <summary>
            \end{summary}

            <collapsible content>
            \end{<type>}

        becomes, with theorem heading and counter syntax…

        .. code-block:: md

            \begin{<type>}
            
            \begin{summary}
            {[<type's thm type> {{<type's thm_counter_incr>}}]}[<thm name>]{<hidden thm name>}
            <summary>
            \end{summary}

            <collapsible content>
            \end{<type>}

        becomes…

        .. code-block:: html

            <details class="[html_class] [type's html_class]">
              <summary class="[summary_html_class]">
                <span id="[thm name/hidden thm name]" class="[thm_heading_config's html_class]">
                  <span class="[thm_heading_config's emph_html_class]">[thm type][thm counter]</span>
                  [thm name]<span class="[thm_heading_config's emph_html_class]">.</span>
                </span>
                [summary]
              </summary>

              <div class="[content_html_class]">
                [collapsible content]
              </div>
            </details>

        Notice that with dropdowns, the theorem heading is prepended to the summary of the dropdown. In addition, the
        `\\begin{summary}` block is optional with theorems; if omitted, the summary will only include the theorem
        heading.
    """


[docs]
    def __init__(self, **kwargs):
        r"""
        Initialize dropdown extension, with configuration options passed as the following keyword arguments:

            - **div_config** (*dict*) -- configs for divs. Possible config keys are:

                - **types** (*dict*) -- Types of div-based theorem environments to define. Defaults to `{}`.
                - **html_class** (*str*) -- HTML `class` attribute to add to div-based theorem environments.
                  Defaults to `""`.

            - **dropdown_config** (*dict*) -- configs for dropdowns. Possible config keys are:

                - **types** (*dict*) -- Types of dropdown-based theorem environments to define. Defaults to `{}`.
                - **html_class** (*str*) -- HTML `class` attribute to add to dropdown-based theorem environments.
                  Defaults to `""`.
                - **summary_html_class** (*str*) -- HTML `class` attribute to add to dropdown summaries.
                  Defaults to `""`.
                - **content_html_class** (*str*) -- HTML `class` attribute to add to dropdown contents.
                  Defaults to `""`.

            - **thm_counter_config** (*dict*) -- configs for theorem counter. Possible config keys are:

                - **add_html_elem** (*bool*) -- Whether theorem counters are contained in their own HTML element.
                  Defaults to `False`.
                - **html_id_prefix** (*str*) -- Text to prepend to HTML `id` attribute of theorem counters if
                  `add_html_elem` is `True`; usually useful for linking. Defaults to `""`.
                - **html_class** (*str*) -- HTML `class` attribute to add to theorem counters if `add_html_elem` is
                  `True`. Defaults to `""`.

            - **thm_heading_config** (*dict*) -- configs for theorem headings. Possible config keys are:

                - **html_id_prefix** (*str*) -- Text to prepend to HTML `id` attribute of theorem headings (for all
                  theorem heading elements with `id` attributes). Defaults to `""`.
                - **html_class** (*str*) -- HTML `class` attribute to add to theorem headings. Defaults to `""`.
                - **emph_html_class** (*str*) -- HTML `class` attribute to add to theorem types in theorem headings.
                  Defaults to `""`.

        The key for each type defined in both `div_config`'s and `dropdown_config`'s `types` is inserted directly into
        the regex patterns that search for `\\begin{<type>}` and `\\end{<type>}`, so anything you specify will be
        interpreted as regex. However, if the key is an empty string, its regex will never be matched against, so it
        is effectively useless. In addition, each type's value in `types` is itself a dictionary with the following
        possible options:

            - **thm_type** (*str*) -- Theorem type actually displayed in theorem headings. Defaults to `""`.
            - **html_class** (*str*) -- HTML `class` attribute to add to theorems of that type. Defaults to `""`.
            - **thm_counter_incr** (*str*) -- Theorem counter inserted into theorem headings (again, no spaces!).
              Defaults to `""`; leave default to produce an unnumbered theorem type.
            - **thm_name_overrides_thm_heading** (*bool*) -- Whether the entire theorem heading besides the theorem
              punct should just be theorem name if a theorem name is provided, like the default behavior of
              `\\begin{proof}` environments in LaTeX. Defaults to `False`.
        """

        self.config = {
            "div_config": [
                {},
                "Config for div"
            ],
            "dropdown_config": [
                {},
                "Config for dropdown"
            ],
            "thm_counter_config": [
                {},
                "Config for theorem counter"
            ],
            "thm_heading_config": [
                {},
                "Config for theorem heading"
            ]
        }
        utils.init_extension_with_configs(self, **kwargs)

        # set default configs for each extension, since we no longer have the top-level `self.config` functionality
        # to set defaults for us
        div_config = self.getConfig("div_config")
        div_config.setdefault("types", {})
        div_config.setdefault("html_class", "")

        dropdown_config = self.getConfig("dropdown_config")
        dropdown_config.setdefault("types", {})
        dropdown_config.setdefault("html_class", "")
        dropdown_config.setdefault("summary_html_class", "")
        dropdown_config.setdefault("content_html_class", "")

        thm_counter_config = self.getConfig("thm_counter_config")
        thm_counter_config.setdefault("add_html_elem", False)
        thm_counter_config.setdefault("html_id_prefix", "")
        thm_counter_config.setdefault("html_class", "")

        thm_heading_config = self.getConfig("thm_heading_config")
        thm_heading_config.setdefault("html_id_prefix", "")
        thm_heading_config.setdefault("html_class", "")
        thm_heading_config.setdefault("emph_html_class", "")


    def extendMarkdown(self, md):
        # registering resets state between uses of `markdown.Markdown` object for things like the `ThmCounter` extension
        md.registerExtension(self)

        div_config = self.getConfig("div_config")
        dropdown_config = self.getConfig("dropdown_config")
        thm_counter_config = self.getConfig("thm_counter_config")
        thm_heading_config = self.getConfig("thm_heading_config")
        # remember `ThmCounter`'s priority must be higher than TOC extension
        md.treeprocessors.register(
            ThmCounterProcessor(
                md, add_html_elem=thm_counter_config.get("add_html_elem"),
                html_id_prefix=thm_counter_config.get("html_id_prefix"),
                html_class=thm_counter_config.get("html_class")
            ),
            "thm_counter", 999
        )
        md.postprocessors.register(
            ThmHeadingProcessor(
                md, html_id_prefix=thm_heading_config.get("html_id_prefix"),
                html_class=thm_heading_config.get("html_class"),
                emph_html_class=thm_heading_config.get("emph_html_class")
            ),
            "thm_heading", 105
        )

        if len(div_config.get("types", {})) > 0:
            from .div import DivProcessor
            md.parser.blockprocessors.register(
                DivProcessor(
                    md.parser, types=div_config.get("types"), html_class=div_config.get("html_class"), is_thm=True
                ),
                "thms_div", 105
            )
        if len(dropdown_config.get("types", {})) > 0:
            from .dropdown import DropdownProcessor
            md.parser.blockprocessors.register(
                DropdownProcessor(
                    md.parser, types=dropdown_config.get("types"), 
                    html_class=dropdown_config.get("html_class"),
                    summary_html_class=dropdown_config.get("summary_html_class"),
                    content_html_class=dropdown_config.get("content_html_class"),
                    is_thm=True
                ),
                "thms_dropdown", 999
            )



def makeExtension(**kwargs):
    return ThmsExtension(**kwargs)