Commit 81cf1732 authored by pennae's avatar pennae Committed by pennae
Browse files

nixos-render-docs: use multiprocessing for options

options processing is pretty slow right now, mostly because the
markdown-it-py parser is pure python (and with performance
pessimizations at that). options parsing *is* embarrassingly parallel
though, so we can just fork out all the work to worker processes and
collect the results.

multiprocessing probably has a greater benefit on linux than on darwin
since the worker spawning method darwin uses is less efficient than
fork() on linux. this hasn't been tested on darwin, only on linux, but
if anything darwin will be faster with its preferred method.
parent 5f4e07de
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -158,7 +158,7 @@ let
          '@NIXOS_TEST_OPTIONS_JSON@' \
          ${testOptionsDoc.optionsJSON}/share/doc/nixos/options.json

      nixos-render-docs manual docbook \
      nixos-render-docs -j $NIX_BUILD_CORES manual docbook \
        --manpage-urls ${manpageUrls} \
        --revision ${lib.escapeShellArg revision} \
        ./manual.md \
@@ -285,7 +285,7 @@ in rec {
        ''
        else ''
          mkdir -p $out/share/man/man5
          nixos-render-docs options manpage \
          nixos-render-docs -j $NIX_BUILD_CORES options manpage \
            --revision ${lib.escapeShellArg revision} \
            ${optionsJSON}/share/doc/nixos/options.json \
            $out/share/man/man5/configuration.nix.5
+1 −1
Original line number Diff line number Diff line
@@ -152,7 +152,7 @@ in rec {
      pkgs.nixos-render-docs
    ];
  } ''
    nixos-render-docs options docbook \
    nixos-render-docs -j $NIX_BUILD_CORES options docbook \
      --manpage-urls ${pkgs.path + "/doc/manpage-urls.json"} \
      --revision ${lib.escapeShellArg revision} \
      --document-type ${lib.escapeShellArg documentType} \
+3 −0
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ from typing import Any, Dict
from .md import Converter
from . import manual
from . import options
from . import parallel

def pretty_print_exc(e: BaseException, *, _desc_text: str = "error") -> None:
    print(f"\x1b[1;31m{_desc_text}:\x1b[0m", file=sys.stderr)
@@ -35,6 +36,7 @@ def pretty_print_exc(e: BaseException, *, _desc_text: str = "error") -> None:

def main() -> None:
    parser = argparse.ArgumentParser(description='render nixos manual bits')
    parser.add_argument('-j', '--jobs', type=int, default=None)

    commands = parser.add_subparsers(dest='command', required=True)

@@ -43,6 +45,7 @@ def main() -> None:

    args = parser.parse_args()
    try:
        parallel.pool_processes = args.jobs
        if args.command == 'options':
            options.run_cli(args)
        elif args.command == 'manual':
+41 −4
Original line number Diff line number Diff line
from __future__ import annotations

import argparse
import json

@@ -10,6 +12,7 @@ from xml.sax.saxutils import escape, quoteattr

import markdown_it

from . import parallel
from .docbook import DocBookRenderer, make_xml_id
from .manpage import ManpageRenderer, man_escape
from .md import Converter, md_escape
@@ -148,15 +151,33 @@ class BaseConverter(Converter):

        return [ l for part in blocks for l in part ]

    # this could return a TState parameter, but that does not allow dependent types and
    # will cause headaches when using BaseConverter as a type bound anywhere. Any is the
    # next best thing we can use, and since this is internal it will be mostly safe.
    @abstractmethod
    # builds the (pickle-able) constructor arguments that each worker passes to
    # _parallel_render_init_worker to recreate an equivalent converter.
    def _parallel_render_prepare(self) -> Any: raise NotImplementedError()
    # this should return python 3.11's Self instead to ensure that a prepare+finish
    # round-trip ends up with an object of the same type. for now we'll use BaseConverter
    # since it's good enough so far.
    @classmethod
    @abstractmethod
    # reconstructs a converter inside a worker process from the value produced
    # by _parallel_render_prepare.
    def _parallel_render_init_worker(cls, a: Any) -> BaseConverter: raise NotImplementedError()

    def _render_option(self, name: str, option: dict[str, Any]) -> RenderedOption:
        # render a single option to a RenderedOption, wrapping any conversion
        # failure in an exception that names the failing option for diagnosis.
        try:
            return RenderedOption(option['loc'], self._convert_one(option))
        except Exception as e:
            # chain the cause so the original markdown/conversion error stays visible
            raise Exception(f"Failed to render option {name}") from e

    @classmethod
    # worker step function handed to parallel.map: `s` is the worker-local
    # converter built by _parallel_render_init_worker, `a` is a (name, option)
    # pair unpacked into _render_option. classmethod so it is pickle-able.
    def _parallel_render_step(cls, s: BaseConverter, a: Any) -> RenderedOption:
        return s._render_option(*a)

    def add_options(self, options: dict[str, Any]) -> None:
        for (name, option) in options.items():
            self._options[name] = self._render_option(name, option)
        mapped = parallel.map(self._parallel_render_step, options.items(), 100,
                              self._parallel_render_init_worker, self._parallel_render_prepare())
        for (name, option) in zip(options.keys(), mapped):
            self._options[name] = option

    @abstractmethod
    def finalize(self) -> str: raise NotImplementedError()
@@ -194,6 +215,13 @@ class DocBookConverter(BaseConverter):
        self._varlist_id = varlist_id
        self._id_prefix = id_prefix

    def _parallel_render_prepare(self) -> Any:
        # positional constructor arguments, in __init__ order, used by
        # _parallel_render_init_worker to rebuild this converter in a worker.
        return (self._manpage_urls, self._revision, self._markdown_by_default, self._document_type,
                self._varlist_id, self._id_prefix)
    @classmethod
    def _parallel_render_init_worker(cls, a: Any) -> DocBookConverter:
        # `a` is the tuple produced by _parallel_render_prepare
        return cls(*a)

    def _render_code(self, option: dict[str, Any], key: str) -> list[str]:
        if lit := option_is(option, key, 'literalDocBook'):
            return [ f"<para><emphasis>{key.capitalize()}:</emphasis> {lit['text']}</para>" ]
@@ -283,10 +311,19 @@ class ManpageConverter(BaseConverter):
    _options_by_id: dict[str, str]
    _links_in_last_description: Optional[list[str]] = None

    def __init__(self, revision: str, markdown_by_default: bool):
        self._options_by_id = {}
    def __init__(self, revision: str, markdown_by_default: bool,
                 *,
                 # only for parallel rendering
                 _options_by_id: Optional[dict[str, str]] = None):
        self._options_by_id = _options_by_id or {}
        super().__init__({}, revision, markdown_by_default)

    def _parallel_render_prepare(self) -> Any:
        # (positional args, keyword args) for _parallel_render_init_worker;
        # _options_by_id is shared with workers so link rendering sees the full map
        return ((self._revision, self._markdown_by_default), { '_options_by_id': self._options_by_id })
    @classmethod
    def _parallel_render_init_worker(cls, a: Any) -> ManpageConverter:
        # `a` is the (args, kwargs) pair produced by _parallel_render_prepare
        return cls(*a[0], **a[1])

    def _render_option(self, name: str, option: dict[str, Any]) -> RenderedOption:
        assert isinstance(self._md.renderer, OptionsManpageRenderer)
        links = self._md.renderer.link_footnotes = []
+58 −0
Original line number Diff line number Diff line
# this module only has to exist because cpython has a global interpreter lock
# and markdown-it is pure python code. ideally we'd just use thread pools, but
# the GIL prohibits this.

import multiprocessing

from typing import Any, Callable, ClassVar, Iterable, Optional, TypeVar

R = TypeVar('R')
S = TypeVar('S')
T = TypeVar('T')
A = TypeVar('A')

# number of worker processes to use. None disables multiprocessing entirely and
# makes `map` run sequentially in the calling process.
pool_processes: Optional[int] = None

# this thing is impossible to type because there's so much global state involved.
# wrapping in a class to get access to Generic[] parameters is not sufficient
# because mypy is too weak, and unnecessarily obscures how much global state is
# needed in each worker to make this whole brouhaha work.
_map_worker_fn: Any = None
_map_worker_state_fn: Any = None
_map_worker_state_arg: Any = None

def _map_worker_init(*args: Any) -> None:
    # Pool initializer: stash the step function and the state constructor (plus
    # its argument) into worker-process globals for _map_worker_step to use.
    global _map_worker_fn, _map_worker_state_fn, _map_worker_state_arg
    (_map_worker_fn, _map_worker_state_fn, _map_worker_state_arg) = args

# NOTE: the state argument is never passed by any caller, we only use it as a localized
# cache for the created state in lieu of another global. it is effectively a global though.
def _map_worker_step(arg: Any, state: Any = []) -> Any:
    # no `global` declaration needed here: the worker globals are only read.
    # if a Pool initializer throws it'll just be retried, leading to endless loops.
    # doing the proper initialization only on first use avoids this.
    if not state:
        state.append(_map_worker_state_fn(_map_worker_state_arg))
    return _map_worker_fn(state[0], arg)

def map(fn: Callable[[S, T], R], d: Iterable[T], chunk_size: int,
        state_fn: Callable[[A], S], state_arg: A) -> list[R]:
    """
    `[ fn(state, i) for i in d ]` where `state = state_fn(state_arg)`, but using
    multiprocessing if `pool_processes` is not `None`. when multiprocessing is used
    the state function will be run once in every worker process and
    `multiprocessing.Pool.imap` will be used, consuming `d` in chunks of `chunk_size`.

    **NOTE:** neither `state_fn` nor `fn` are allowed to mutate global state! doing so will cause
    discrepancies if `pool_processes` is not None, since each worker will have its own copy.

    **NOTE**: all data types that potentially cross a process boundary (so, all of them) must be
    pickle-able. this excludes lambdas, bound functions, local functions, and a number of other
    types depending on their exact internal structure. *theoretically* the pool constructor
    can transfer non-pickleable data to worker processes, but this only works when using the
    `fork` spawn method (and is thus not available on darwin or windows).
    """
    if pool_processes is None:
        # sequential fallback: build the state once and run everything in-process.
        state = state_fn(state_arg)
        return [ fn(state, i) for i in d ]
    with multiprocessing.Pool(pool_processes, _map_worker_init, (fn, state_fn, state_arg)) as p:
        return list(p.imap(_map_worker_step, d, chunk_size))