Unverified Commit 2a50c1a6 authored by Jeremy Fleischman's avatar Jeremy Fleischman
Browse files

nixos/test-driver: stop blackholing `vde_switch` stderr

The code seems to have been ported from the old Perl code. It worked, but
had some smells:

1. It swallowed all `stderr` into a pipe we never read. In addition to
   making things difficult to debug, it could potentially result in
   `vde_switch` hanging if it gets blocked trying to write to a pipe
   that's full. Solution: just let `vde_switch` inherit our `stderr`.
2. If `vde_switch` failed to start, we just logged a warning and
   continued on. Now we crash.
3. We allocated a PTY to communicate with `vde_switch`'s management
   REPL. This was unnecessary: non-TTY stdin/stdout works fine.
4. There was an old TODO here about not blocking forever trying to read
   `vde_switch`'s stdout. I've addressed that by using `select` with a
   timeout.

NOTE: this does change the API surface area of the `VLan` class a bit:
there's no longer a `fd` attribute that points at a PTY master. I did an
unscientific grep through `nixos/tests` and couldn't find references to
`vlan[0-9].fd` nor `vlans[...].fd`.
parent a377fb95
Loading
Loading
Loading
Loading
+78 −18
Original line number Diff line number Diff line
import datetime as dt
import fcntl
import io
import os
import pty
import select
import subprocess
import typing
from pathlib import Path

from test_driver.logger import AbstractLogger


def readline_with_timeout(
    readable: typing.IO[str], timeout: dt.timedelta
) -> typing.Generator[str]:
    """
    Read a line from `readable` within the given `timeout`, otherwise raises `TimeoutError`.

    Note: while the generator is running, `readable` will be in nonblocking mode.
    """
    fd = readable.fileno()
    og_flags = fcntl.fcntl(fd, fcntl.F_GETFL)
    fcntl.fcntl(fd, fcntl.F_SETFL, og_flags | os.O_NONBLOCK)

    try:
        while True:
            ready, _, _ = select.select([readable], [], [], timeout.total_seconds())
            if len(ready) == 0:
                raise TimeoutError()

            # Under the hood, `readline` may read more than one line from the file descriptor,
            # so we cannot just return to the `select`, as it may block, despite there being more
            # lines buffered. So, read all the lines before returning to the select. This only
            # works if the file descriptor is in non-blocking mode!
            while line := readable.readline():
                yield line
    finally:
        fcntl.fcntl(fd, fcntl.F_SETFL, og_flags)


class VLan:
    """This class handles a VLAN that the run-vm scripts identify via its
    number handles. The network's lifetime equals the object's lifetime.
@@ -33,33 +64,62 @@ class VLan:
        os.environ[f"QEMU_VDE_SOCKET_{self.nr}"] = str(self.socket_dir)

        self.logger.info("start vlan")
        pty_master, pty_slave = pty.openpty()

        self.process = subprocess.Popen(
            [
                "vde_switch",
                "--sock",
                self.socket_dir,
                "--dirmode",
                "0700",
                # The --hub is required for the scenario determined by
        # nixos/tests/networking.nix vlan-ping.
        # VLAN Tagged traffic (802.1Q) seams to be blocked if a vde_switch is
                # nixos/tests/networkd-and-scripted.nix vlan-ping.
                # VLAN Tagged traffic (802.1Q) seems to be blocked if a vde_switch is
                # used without the hub mode (flood packets to all ports).
        self.process = subprocess.Popen(
            ["vde_switch", "-s", self.socket_dir, "--dirmode", "0700", "--hub"],
            stdin=pty_slave,
                "--hub",
            ],
            bufsize=1,  # Line buffered.
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=False,
            stderr=None,  # Do not swallow stderr.
            text=True,
        )
        self.pid = self.process.pid
        self.fd = os.fdopen(pty_master, "w")
        self.fd.write("version\n")

        # TODO: perl version checks if this can be read from
        # an if not, dies. we could hang here forever. Fix it.
        assert self.process.stdin is not None
        self.process.stdin.write("showinfo\n")

        # showinfo's output looks like this:
        #
        # ```
        # vde$ showinfo
        # 0000 DATA END WITH '.'
        # VDE switch V.2.3.3
        # (C) Virtual Square Team (coord. R. Davoli) 2005,2006,2007 - GPLv2
        #
        # pid 82406 MAC 00:ff:62:25:47:55 uptime 45
        # .
        # 1000 Success
        # ```
        #
        # We read past all the output until we get to the `1000 Success`.
        # This serves 2 purposes:
        #   1. It's a nice sanity check that `vde_switch` is actually working.
        #   2. By the time we're done, `vde_switch` will have created the
        #      `ctl` socket in `socket_dir`, so we don't have to wait for it to exist.
        assert self.process.stdout is not None
        self.process.stdout.readline()
        if not (self.socket_dir / "ctl").exists():
            self.logger.error("cannot start vde_switch")
        for line in readline_with_timeout(
            self.process.stdout, timeout=dt.timedelta(seconds=5)
        ):
            if "1000 Success" in line:
                break

        assert (self.socket_dir / "ctl").exists(), "cannot start vde_switch"

        self.logger.info(f"running vlan (pid {self.pid}; ctl {self.socket_dir})")

    def stop(self) -> None:
        self.logger.info(f"kill vlan (pid {self.pid})")
        self.fd.close()
        assert self.process.stdin is not None
        self.process.stdin.close()
        self.process.terminate()