Commit 75f58779 authored by mvdbeek's avatar mvdbeek
Browse files

Log exceptions if IT port monitor fails and stop job

That should make it significantly easier to debug issues when the
container monitor dies unexpectedly (which I sort of expect in the
future given that we parse output from the docker CLI).
parent a1ececdc
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -2125,6 +2125,15 @@ class JobWrapper(HasResourceParameters):
            log.debug("found container runtime %s" % container_runtime)
            self.app.interactivetool_manager.configure_entry_points(job, container_runtime)
            return True
        container_exception_path = os.path.join(working_directory, "container_monitor_exception.txt")
        if os.path.exists(container_exception_path):
            with open(container_exception_path) as fh:
                exception_string = fh.read()
            error_message = "Monitoring interactive tool entry point failed"
            log.error("Monitoring interactive tool entry point for job {} failed: {}".format(self.job_id, exception_string))
            self.fail(error_message)
            # local job runner uses return value to determine if we're done polling
            return True

    def container_monitor_command(self, container, **kwds):
        if not container or not self.tool.produces_entry_points:
+8 −3
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ import subprocess
import sys
import tempfile
import time
import traceback

# insert *this* galaxy before all others on sys.path
sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)))
@@ -44,6 +45,7 @@ def main():
        raise Exception("Monitoring container type [%s], not yet implemented." % container_type)

    ports_raw = None
    exc_traceback = ""
    for i in range(10):
        try:
            ports_raw = parse_ports(container_name, connection_configuration)
@@ -62,10 +64,13 @@ def main():
                break
            else:
                raise Exception("Failed to recover ports...")
        except Exception as e:
            with open("container_monitor_exception.txt", "a") as f:
                f.write(str(e) + "\n\n\n")
        except Exception:
            exc_info = sys.exc_info()
            exc_traceback = "".join(traceback.format_exception(*exc_info))
        time.sleep(i * 2)
    else:
        with open("container_monitor_exception.txt", "a") as f:
            f.write(exc_traceback)


if __name__ == "__main__":
+6 −1
Original line number Diff line number Diff line
@@ -70,11 +70,16 @@ class BaseInteractiveToolsIntegrationTestCase(ContainerizedIntegrationTestCase):
            if len(entry_points) != expected_num:
                return None
            elif any([not e["active"] for e in entry_points]):
                job_json = self._get("jobs/%s?full=true" % job_id).json()
                if job_json['state'] == 'error':
                    raise Exception("Interactive tool job {} failed: {}".format(job_id, job_json))
                return None
            else:
                return entry_points

        return wait_on(active_entry_points, "entry points to become active")
        # It currently takes at least 90 seconds until we can be sure the container monitor failed.
        # Can be decreased when galaxy_ext/container_monitor/monitor.py changes
        return wait_on(active_entry_points, "entry points to become active", timeout=120)

    def entry_points_for_job(self, job_id):
        entry_points_response = self._get("entry_points?job_id=%s" % job_id)