Unverified Commit 28d25829 authored by John Chilton's avatar John Chilton Committed by GitHub
Browse files

Merge pull request #14181 from mvdbeek/deferred_data_fixes

[22.05] Deferred data fixes
parents e2e223a8 df6551da
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ export const STATES = {
    },
    deferred: {
        status: "info",
        text: "This dataset is remote, has not be ingested by Galaxy, and full metadata may not be available.",
        text: "This dataset is remote, has not been ingested by Galaxy, and full metadata may not be available.",
        icon: "cloud",
    },
    /** has no data */
+2 −2
Original line number Diff line number Diff line
@@ -434,7 +434,7 @@ def set_metadata_portable(
                if not object_store or not export_store:
                    # Can't happen, but type system doesn't know
                    raise Exception("object_store not built")
                if not link_data_only and os.path.getsize(external_filename):
                if not is_deferred and not link_data_only and os.path.getsize(external_filename):
                    # Here we might be updating a disk based objectstore when outputs_to_working_directory is used,
                    # or a remote object store from its cache path.
                    object_store.update_from_file(dataset.dataset, file_name=external_filename, create=True)
@@ -469,7 +469,7 @@ def set_metadata_portable(
                        context_value = context[context_key]
                        setattr(dataset, context_key, context_value)
                # We only want to persist the external_filename if the dataset has been linked in.
                if not link_data_only:
                if not is_deferred and not link_data_only:
                    dataset.dataset.external_filename = None
                    dataset.dataset.extra_files_path = None
                export_store.add_dataset(dataset)
+10 −6
Original line number Diff line number Diff line
@@ -40,6 +40,7 @@ log = logging.getLogger(__name__)
class TransientDatasetPaths(NamedTuple):
    """Filesystem locations used when staging a dataset's files transiently."""

    # Path where the dataset's primary file is written.
    external_filename: str
    # Path for the dataset's extra files directory.
    external_extra_files_path: str
    # Staging directory handed to metadata generation as metadata_tmp_files_dir
    # (see DatasetInstanceMaterializer) so metadata temp files land here.
    metadata_files_dir: str


class TransientPathMapper:
@@ -61,7 +62,7 @@ class SimpleTransientPathMapper(TransientPathMapper):
        external_filename = os.path.join(self._staging_directory, external_filename_basename)
        external_extras_basename = "dataset_%s_files" % str(old_dataset.uuid)
        external_extras = os.path.join(self._staging_directory, external_extras_basename)
        return TransientDatasetPaths(external_filename, external_extras)
        return TransientDatasetPaths(external_filename, external_extras, self._staging_directory)


class DatasetInstanceMaterializer:
@@ -110,6 +111,7 @@ class DatasetInstanceMaterializer:
        materialized_dataset.hashes = [h.copy() for h in dataset.hashes]

        target_source = self._find_closest_dataset_source(dataset)
        transient_paths = None
        if attached:
            object_store_populator = self._object_store_populator
            assert object_store_populator
@@ -157,12 +159,14 @@ class DatasetInstanceMaterializer:
            materialized_dataset_instance.has_metadata_files or materialized_dataset_instance.metadata_deferred
        )
        if require_metadata_regeneration:
            if attached and self._sa_session:
                # as of mid April 2022, we now get JSON encoding errors if this
                # isn't bound to the session before metadata generation.
                self._sa_session.add(materialized_dataset_instance)
            materialized_dataset_instance.init_meta()
            materialized_dataset_instance.set_meta()
            if transient_paths:
                metadata_tmp_files_dir = transient_paths.metadata_files_dir
            else:
                # If metadata_tmp_files_dir is set we generate a MetadataTempFile,
                # which we don't want when we're generating an attached materialized dataset instance
                metadata_tmp_files_dir = None
            materialized_dataset_instance.set_meta(metadata_tmp_files_dir=metadata_tmp_files_dir)
            materialized_dataset_instance.metadata_deferred = False
        return materialized_dataset_instance

+48 −0
Original line number Diff line number Diff line
@@ -2729,6 +2729,54 @@ class ToolsTestCase(ApiTestCase, TestsTools):
        output_content = self.dataset_populator.get_history_dataset_content(history_id, dataset=output)
        assert output_content.startswith("chr1	147962192	147962580	CCDS989.1_cds_0_0_chr1_147962193_r	0	-")

    @skip_without_tool("metadata_bam")
    @uses_test_history(require_new=False)
    def test_run_deferred_dataset_with_metadata_options_filter(self, history_id):
        """Run metadata_bam against a deferred BAM and check the filtered output."""
        deferred_details = self.dataset_populator.create_deferred_hda(
            history_id, "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/1.bam", ext="bam"
        )
        tool_inputs = {"input_bam": dataset_to_param(deferred_details), "ref_names": "chrM"}
        run_response = self.dataset_populator.run_tool(
            tool_id="metadata_bam", inputs=tool_inputs, history_id=history_id
        )
        first_output = run_response["outputs"][0]
        # Wait for the run and verify the output dataset reached the ok state.
        output_details = self.dataset_populator.get_history_dataset_details(
            history_id, dataset=first_output, wait=True, assert_ok=True
        )
        assert output_details["state"] == "ok"
        content = self.dataset_populator.get_history_dataset_content(history_id, dataset=first_output)
        assert content.startswith("chrM")

    @skip_without_tool("pileup")
    @uses_test_history(require_new=False)
    def test_metadata_validator_on_deferred_input(self, history_id):
        """A metadata validator on a deferred input should not block the job.

        Runs the pileup tool with a deferred BAM input plus an uploaded FASTA
        reference and expects the job to complete successfully.
        """
        deferred_bam_details = self.dataset_populator.create_deferred_hda(
            history_id, "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/1.bam", ext="bam"
        )
        # Use a context manager so the fixture file handle is closed promptly
        # rather than leaked until garbage collection.
        with open(self.get_filename("1.fasta")) as fasta_file:
            fasta1_contents = fasta_file.read()
        fasta = self.dataset_populator.new_dataset(history_id, content=fasta1_contents)
        inputs = {"input1": dataset_to_param(deferred_bam_details), "reference": dataset_to_param(fasta)}
        run_response = self.dataset_populator.run_tool(tool_id="pileup", inputs=inputs, history_id=history_id)
        self.dataset_populator.wait_for_job(run_response["jobs"][0]["id"], assert_ok=True)

    @pytest.mark.xfail
    @skip_without_tool("pileup")
    @uses_test_history(require_new=False)
    def test_metadata_validator_can_fail_on_deferred_input(self, history_id):
        """The bam_index validator should fail the job for an unsorted deferred BAM.

        Marked xfail: the validator is currently skipped for deferred inputs,
        so the job does not fail yet. Fixing this is a TODO.
        """
        deferred_bam_details = self.dataset_populator.create_deferred_hda(
            history_id,
            "https://github.com/galaxyproject/galaxy/blob/dev/test-data/3unsorted.bam?raw=true",
            ext="unsorted.bam",
        )
        # Use a context manager so the fixture file handle is closed promptly
        # rather than leaked until garbage collection.
        with open(self.get_filename("1.fasta")) as fasta_file:
            fasta1_contents = fasta_file.read()
        fasta = self.dataset_populator.new_dataset(history_id, content=fasta1_contents)
        inputs = {"input1": dataset_to_param(deferred_bam_details), "reference": dataset_to_param(fasta)}
        run_response = self.dataset_populator.run_tool(tool_id="pileup", inputs=inputs, history_id=history_id)
        # Compute the job id once instead of re-indexing run_response twice.
        job_id = run_response["jobs"][0]["id"]
        self.dataset_populator.wait_for_job(job_id, assert_ok=False)
        job_details = self.dataset_populator.get_job_details(job_id=job_id).json()
        assert job_details["state"] == "failed"

    @skip_without_tool("cat1")
    @uses_test_history(require_new=False)
    def test_run_deferred_mapping(self, history_id: str):
+1 −1
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@
printf 'Summary' > '$out_file1'
    ]]></command>
    <inputs>
        <param name="input1" type="data" format="bam" multiple="true" min="1" label="BAM Inputs">
        <param name="input1" type="data" format="unsorted.bam" multiple="true" min="1" label="BAM Inputs">
            <validator check="bam_index" message="Metadata missing, click the pencil icon in the history item and use the auto-detect feature to correct this issue." type="metadata" />
        </param>
        <param name="reference" type="data" format="fasta" label="Fasta Input"/>
Loading