Commit bd6fe02a authored by mvdbeek

Fix set metadata for primary discovered outputs

We were previously setting metadata on the path that `filename_override`
points to, which should be an output in the job working directory or an
output in the object store.
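
For context, each entry in the `outputs` mapping that `set_metadata_portable`
iterates over looks roughly like this (an illustrative sketch; only `id`,
`model_class`, and `filename_override` appear in the code below, the values
are made up):

    # Hypothetical shape of one metadata_params["outputs"] entry:
    outputs = {
        "output1": {
            "id": 42,  # database id of the dataset instance
            "model_class": "HistoryDatasetAssociation",
            # the path metadata was unconditionally set on before this fix:
            "filename_override": "/objects/d/a/t/dataset_42.dat",
        },
    }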

That does not work for `unnamed_outputs` that are inferred via
galaxy.json and discovered using collect_dynamic_outputs. In most cases
that doesn't lead to an exception, but it does fail for bam files (and,
I would guess, a small subset of other datatypes) if you specify the
extension explicitly (if the extension is not set, sniffing on the empty
dataset results in `data`, which prevents the exception). For datatypes
that don't fail in `set_metadata` on empty input files, metadata would
then be set again when collecting dynamic outputs, this time using the
correct path.
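
For reference, a record returned by `get_unnamed_outputs()` for such a
galaxy.json-described output carries roughly this shape (inferred from the
keys the fix below reads; the values are made up):

    # Hypothetical parsed galaxy.json record for a discovered (unnamed) output:
    unnamed_output_dict = {
        "destination": {"type": "hdas"},
        "elements": [
            {
                "object_id": 42,  # database id of the discovered dataset
                "filename": "outputs/sample1.bam",  # relative to the job working directory
                "ext": "bam",  # explicitly specified extension, the failing case above
            },
        ],
    }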

The fix here is to check whether the file we're setting metadata for is
actually an unnamed output, and to override the path with the path to
the file in the working directory. To prevent setting metadata twice, we
also skip setting metadata on files that already have a database id
(which should only be the case for these primary discovered outputs).
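
Condensed to its essentials, the change looks like this (a minimal sketch;
`build_unnamed_id_to_path` is a hypothetical helper, the real change inlines
this logic as shown in the diff below):

    import os

    def build_unnamed_id_to_path(tool_provided_metadata, job_working_directory):
        # Map discovered-output database ids to their working-directory paths.
        unnamed_id_to_path = {}
        for unnamed_output_dict in tool_provided_metadata.get_unnamed_outputs():
            if unnamed_output_dict["destination"]["type"] == 'hdas':
                for element in unnamed_output_dict["elements"]:
                    filename = element.get('filename')
                    if filename:
                        unnamed_id_to_path[element['object_id']] = os.path.join(job_working_directory, filename)
        return unnamed_id_to_path

    # In the per-output loop the mapping wins over filename_override:
    #     dataset.dataset.external_filename = unnamed_id_to_path.get(
    #         dataset_instance_id, dataset_filename_override)
    #
    # And ModelPersistenceContext skips set_metadata for datasets that already
    # have a database id, since they were handled in the loop above:
    #     if filename and not primary_data.id:
    #         self.set_datasets_metadata(...)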
parent 572584cb
+26 −15
@@ -139,6 +139,7 @@ def set_metadata_portable():
    version_string = ""

    export_store = None
+    final_job_state = 'ok'
    if extended_metadata_collection:
        tool_dict = metadata_params["tool"]
        stdio_exit_code_dicts, stdio_regex_dicts = tool_dict["stdio_exit_codes"], tool_dict["stdio_regexes"]
@@ -185,7 +186,7 @@ def set_metadata_portable():
        if os.path.exists(COMMAND_VERSION_FILENAME):
            version_string = open(COMMAND_VERSION_FILENAME).read()

-        job_context = ExpressionContext(dict(stdout=tool_stdout, stderr=tool_stderr))
+        expression_context = ExpressionContext(dict(stdout=tool_stdout, stderr=tool_stderr))

        # Load outputs.
        export_store = store.DirectoryModelExportStore('metadata/outputs_populated', serialize_dataset_objects=True, for_edit=True, strip_metadata_files=False, serialize_jobs=False)
@@ -195,6 +196,27 @@ def set_metadata_portable():
        # Remove in 21.09, this should only happen for jobs that started on <= 20.09 and finish now
        import_model_store = None

+    job_context = SessionlessJobContext(
+        metadata_params,
+        tool_provided_metadata,
+        object_store,
+        export_store,
+        import_model_store,
+        os.path.join(tool_job_working_directory, "working"),
+        final_job_state=final_job_state,
+    )
+
+    unnamed_id_to_path = {}
+    for unnamed_output_dict in job_context.tool_provided_metadata.get_unnamed_outputs():
+        destination = unnamed_output_dict["destination"]
+        elements = unnamed_output_dict["elements"]
+        destination_type = destination["type"]
+        if destination_type == 'hdas':
+            for element in elements:
+                filename = element.get('filename')
+                if filename:
+                    unnamed_id_to_path[element['object_id']] = os.path.join(job_context.job_working_directory, filename)
+
    for output_name, output_dict in outputs.items():
        dataset_instance_id = output_dict["id"]
        klass = getattr(galaxy.model, output_dict.get('model_class', 'HistoryDatasetAssociation'))
@@ -219,7 +241,7 @@ def set_metadata_portable():
        # Same block as below...
        set_meta_kwds = stringify_dictionary_keys(json.load(open(filename_kwds)))  # load kwds; need to ensure our keywords are not unicode
        try:
-            dataset.dataset.external_filename = dataset_filename_override
+            dataset.dataset.external_filename = unnamed_id_to_path.get(dataset_instance_id, dataset_filename_override)
            store_by = output_dict.get("object_store_store_by", legacy_object_store_store_by)
            extra_files_dir_name = "dataset_%s_files" % getattr(dataset.dataset, store_by)
            files_path = os.path.abspath(os.path.join(tool_job_working_directory, "working", extra_files_dir_name))
@@ -240,9 +262,9 @@ def set_metadata_portable():
            if extended_metadata_collection:
                meta = tool_provided_metadata.get_dataset_meta(output_name, dataset.dataset.id, dataset.dataset.uuid)
                if meta:
-                    context = ExpressionContext(meta, job_context)
+                    context = ExpressionContext(meta, expression_context)
                else:
-                    context = job_context
+                    context = expression_context

                # Lazy and unattached
                # if getattr(dataset, "hidden_beneath_collection_instance", None):
@@ -301,17 +323,6 @@ def set_metadata_portable():

    if extended_metadata_collection:
        # discover extra outputs...
-
-        job_context = SessionlessJobContext(
-            metadata_params,
-            tool_provided_metadata,
-            object_store,
-            export_store,
-            import_model_store,
-            os.path.join(tool_job_working_directory, "working"),
-            final_job_state=final_job_state,
-        )
-
        output_collections = {}
        for name, output_collection in metadata_params["output_collections"].items():
            output_collections[name] = import_model_store.sa_session.query(HistoryDatasetCollectionAssociation).find(output_collection["id"])
+2 −1
@@ -171,7 +171,8 @@ class ModelPersistenceContext(metaclass=abc.ABCMeta):
        if info is not None:
            primary_data.info = info

-        if filename:
+        if filename and not primary_data.id:
+            # If primary data has an id it should already have gone through the metadata process
            self.set_datasets_metadata(datasets=[primary_data], datasets_attributes=[dataset_attributes])

        return primary_data