Unverified Commit 28d25829 authored by John Chilton's avatar John Chilton Committed by GitHub
Browse files

Merge pull request #14181 from mvdbeek/deferred_data_fixes

[22.05] Deferred data fixes
parents e2e223a8 df6551da
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ export const STATES = {
    },
    deferred: {
        status: "info",
        text: "This dataset is remote, has not be ingested by Galaxy, and full metadata may not be available.",
        text: "This dataset is remote, has not been ingested by Galaxy, and full metadata may not be available.",
        icon: "cloud",
    },
    /** has no data */
+2 −2
Original line number Diff line number Diff line
@@ -434,7 +434,7 @@ def set_metadata_portable(
                if not object_store or not export_store:
                    # Can't happen, but type system doesn't know
                    raise Exception("object_store not built")
                if not link_data_only and os.path.getsize(external_filename):
                if not is_deferred and not link_data_only and os.path.getsize(external_filename):
                    # Here we might be updating a disk based objectstore when outputs_to_working_directory is used,
                    # or a remote object store from its cache path.
                    object_store.update_from_file(dataset.dataset, file_name=external_filename, create=True)
@@ -469,7 +469,7 @@ def set_metadata_portable(
                        context_value = context[context_key]
                        setattr(dataset, context_key, context_value)
                # We only want to persist the external_filename if the dataset has been linked in.
                if not link_data_only:
                if not is_deferred and not link_data_only:
                    dataset.dataset.external_filename = None
                    dataset.dataset.extra_files_path = None
                export_store.add_dataset(dataset)
+10 −6
Original line number Diff line number Diff line
@@ -40,6 +40,7 @@ log = logging.getLogger(__name__)
class TransientDatasetPaths(NamedTuple):
    """Filesystem locations used when staging a dataset's files transiently."""

    # Path where the dataset's primary file is written.
    external_filename: str
    # Path for the dataset's extra files directory.
    external_extra_files_path: str
    # Staging directory handed to metadata generation as metadata_tmp_files_dir
    # (see DatasetInstanceMaterializer) so metadata temp files land here.
    metadata_files_dir: str


class TransientPathMapper:
@@ -61,7 +62,7 @@ class SimpleTransientPathMapper(TransientPathMapper):
        external_filename = os.path.join(self._staging_directory, external_filename_basename)
        external_extras_basename = "dataset_%s_files" % str(old_dataset.uuid)
        external_extras = os.path.join(self._staging_directory, external_extras_basename)
        return TransientDatasetPaths(external_filename, external_extras)
        return TransientDatasetPaths(external_filename, external_extras, self._staging_directory)


class DatasetInstanceMaterializer:
@@ -110,6 +111,7 @@ class DatasetInstanceMaterializer:
        materialized_dataset.hashes = [h.copy() for h in dataset.hashes]

        target_source = self._find_closest_dataset_source(dataset)
        transient_paths = None
        if attached:
            object_store_populator = self._object_store_populator
            assert object_store_populator
@@ -157,12 +159,14 @@ class DatasetInstanceMaterializer:
            materialized_dataset_instance.has_metadata_files or materialized_dataset_instance.metadata_deferred
        )
        if require_metadata_regeneration:
            if attached and self._sa_session:
                # as of mid April 2022, we now get JSON encoding errors if this
                # isn't bound to the session before metadata generation.
                self._sa_session.add(materialized_dataset_instance)
            materialized_dataset_instance.init_meta()
            materialized_dataset_instance.set_meta()
            if transient_paths:
                metadata_tmp_files_dir = transient_paths.metadata_files_dir
            else:
                # If metadata_tmp_files_dir is set we generate a MetadataTempFile,
                # which we don't want when we're generating an attached materialized dataset instance
                metadata_tmp_files_dir = None
            materialized_dataset_instance.set_meta(metadata_tmp_files_dir=metadata_tmp_files_dir)
            materialized_dataset_instance.metadata_deferred = False
        return materialized_dataset_instance

+48 −0
Original line number Diff line number Diff line
@@ -2729,6 +2729,54 @@ class ToolsTestCase(ApiTestCase, TestsTools):
        output_content = self.dataset_populator.get_history_dataset_content(history_id, dataset=output)
        assert output_content.startswith("chr1	147962192	147962580	CCDS989.1_cds_0_0_chr1_147962193_r	0	-")

    @skip_without_tool("metadata_bam")
    @uses_test_history(require_new=False)
    def test_run_deferred_dataset_with_metadata_options_filter(self, history_id):
        """Run metadata_bam against a deferred BAM and check the filtered output."""
        deferred_details = self.dataset_populator.create_deferred_hda(
            history_id, "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/1.bam", ext="bam"
        )
        tool_inputs = {"input_bam": dataset_to_param(deferred_details), "ref_names": "chrM"}
        run_response = self.dataset_populator.run_tool(
            tool_id="metadata_bam", inputs=tool_inputs, history_id=history_id
        )
        first_output = run_response["outputs"][0]
        # Wait for the run and verify the output dataset reached the ok state.
        output_details = self.dataset_populator.get_history_dataset_details(
            history_id, dataset=first_output, wait=True, assert_ok=True
        )
        assert output_details["state"] == "ok"
        content = self.dataset_populator.get_history_dataset_content(history_id, dataset=first_output)
        assert content.startswith("chrM")

    @skip_without_tool("pileup")
    @uses_test_history(require_new=False)
    def test_metadata_validator_on_deferred_input(self, history_id):
        """A metadata validator on a deferred input should not block the job.

        Runs the pileup tool with a deferred BAM input plus an uploaded FASTA
        reference and expects the job to complete successfully.
        """
        deferred_bam_details = self.dataset_populator.create_deferred_hda(
            history_id, "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/1.bam", ext="bam"
        )
        # Use a context manager so the fixture file handle is closed promptly
        # rather than leaked until garbage collection.
        with open(self.get_filename("1.fasta")) as fasta_file:
            fasta1_contents = fasta_file.read()
        fasta = self.dataset_populator.new_dataset(history_id, content=fasta1_contents)
        inputs = {"input1": dataset_to_param(deferred_bam_details), "reference": dataset_to_param(fasta)}
        run_response = self.dataset_populator.run_tool(tool_id="pileup", inputs=inputs, history_id=history_id)
        self.dataset_populator.wait_for_job(run_response["jobs"][0]["id"], assert_ok=True)

    @pytest.mark.xfail
    @skip_without_tool("pileup")
    @uses_test_history(require_new=False)
    def test_metadata_validator_can_fail_on_deferred_input(self, history_id):
        """The bam_index validator should fail the job for an unsorted deferred BAM.

        Marked xfail: the validator is currently skipped for deferred inputs,
        so the job does not fail yet. Fixing this is a TODO.
        """
        deferred_bam_details = self.dataset_populator.create_deferred_hda(
            history_id,
            "https://github.com/galaxyproject/galaxy/blob/dev/test-data/3unsorted.bam?raw=true",
            ext="unsorted.bam",
        )
        # Use a context manager so the fixture file handle is closed promptly
        # rather than leaked until garbage collection.
        with open(self.get_filename("1.fasta")) as fasta_file:
            fasta1_contents = fasta_file.read()
        fasta = self.dataset_populator.new_dataset(history_id, content=fasta1_contents)
        inputs = {"input1": dataset_to_param(deferred_bam_details), "reference": dataset_to_param(fasta)}
        run_response = self.dataset_populator.run_tool(tool_id="pileup", inputs=inputs, history_id=history_id)
        # Compute the job id once instead of re-indexing run_response twice.
        job_id = run_response["jobs"][0]["id"]
        self.dataset_populator.wait_for_job(job_id, assert_ok=False)
        job_details = self.dataset_populator.get_job_details(job_id=job_id).json()
        assert job_details["state"] == "failed"

    @skip_without_tool("cat1")
    @uses_test_history(require_new=False)
    def test_run_deferred_mapping(self, history_id: str):
+1 −1
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@
printf 'Summary' > '$out_file1'
    ]]></command>
    <inputs>
        <param name="input1" type="data" format="bam" multiple="true" min="1" label="BAM Inputs">
        <param name="input1" type="data" format="unsorted.bam" multiple="true" min="1" label="BAM Inputs">
            <validator check="bam_index" message="Metadata missing, click the pencil icon in the history item and use the auto-detect feature to correct this issue." type="metadata" />
        </param>
        <param name="reference" type="data" format="fasta" label="Fasta Input"/>
Loading