Unverified commit d1213243 authored by Marius van den Beek, committed by GitHub

Merge pull request #16595 from mvdbeek/download_directory_fix

[23.0] Fixes for extra files handling and cached object stores 
parents 72070dbd 99ffa0e5
+4 −3
@@ -727,18 +727,19 @@ def collect_extra_files(object_store, dataset, job_working_directory):
         # Fall back to working dir, remove in 23.2
         output_location = "working"
         temp_file_path = os.path.join(job_working_directory, output_location, file_name)
-    extra_dir = None
+    if not os.path.exists(temp_file_path):
+        # no outputs to working directory, but may still need to push from cache to backend
+        temp_file_path = dataset.extra_files_path
     try:
         # This skips creation of directories - object store
         # automatically creates them.  However, empty directories will
         # not be created in the object store at all, which might be a
         # problem.
         for root, _dirs, files in os.walk(temp_file_path):
-            extra_dir = root.replace(os.path.join(job_working_directory, output_location), "", 1).lstrip(os.path.sep)
             for f in files:
                 object_store.update_from_file(
                     dataset.dataset,
-                    extra_dir=extra_dir,
+                    extra_dir=os.path.normpath(os.path.join(file_name, os.path.relpath(root, temp_file_path))),
                     alt_name=f,
                     file_name=os.path.join(root, f),
                     create=True,
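
The removed extra_dir = root.replace(...) derivation assumed every walked root sat under the job working directory, which no longer holds once temp_file_path falls back to dataset.extra_files_path (e.g. an object store cache directory outside the job). The new expression derives extra_dir relative to temp_file_path instead. A minimal sketch with hypothetical paths:

import os

file_name = "dataset_1_files"                      # hypothetical extra-files directory name
temp_file_path = "/cache/000/dataset_1_files"      # e.g. dataset.extra_files_path after the fallback
root = os.path.join(temp_file_path, "sub", "dir")  # one directory visited by os.walk

extra_dir = os.path.normpath(os.path.join(file_name, os.path.relpath(root, temp_file_path)))
print(extra_dir)  # dataset_1_files/sub/dir -- correct even when root is outside the job directory
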
+18 −0
@@ -35,6 +35,24 @@ GALAXY_LIB="$galaxy_lib"
 _galaxy_setup_environment "$PRESERVE_GALAXY_ENVIRONMENT"
 export _GALAXY_JOB_HOME_DIR
 export _GALAXY_JOB_TMP_DIR
+
+TEMP="${TEMP:-$TMP}"
+TMPDIR="${TMPDIR:-$TMP}"
+
+TMP="${TMP:-$TEMP}"
+TMPDIR="${TMPDIR:-$TEMP}"
+
+TMP="${TMP:-$TMPDIR}"
+TEMP="${TEMP:-$TMPDIR}"
+
+TMP="${TMP:-$_GALAXY_JOB_TMP_DIR}"
+TEMP="${TEMP:-$_GALAXY_JOB_TMP_DIR}"
+TMPDIR="${TMPDIR:-$_GALAXY_JOB_TMP_DIR}"
+
+export TMP
+export TEMP
+export TMPDIR
+
 GALAXY_PYTHON=`command -v python`
 cd $working_directory
 $memory_statement
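
The added block cross-fills TMP, TEMP, and TMPDIR from whichever of them is already set, then falls back to the job-specific temporary directory for any that remain empty. A Python sketch of the same resolution order (a model of the shell logic above, not Galaxy code):

def resolve_tmp_vars(env, job_tmp_dir):
    # Mirror the shell fallbacks: ${VAR:-$OTHER} keeps VAR only if set and non-empty.
    tmp, temp, tmpdir = env.get("TMP"), env.get("TEMP"), env.get("TMPDIR")
    temp = temp or tmp
    tmpdir = tmpdir or tmp
    tmp = tmp or temp
    tmpdir = tmpdir or temp
    tmp = tmp or tmpdir
    temp = temp or tmpdir
    # Last resort: the per-job temporary directory.
    return {
        "TMP": tmp or job_tmp_dir,
        "TEMP": temp or job_tmp_dir,
        "TMPDIR": tmpdir or job_tmp_dir,
    }

print(resolve_tmp_vars({"TMPDIR": "/scratch"}, "/jobs/123/tmp"))
# {'TMP': '/scratch', 'TEMP': '/scratch', 'TMPDIR': '/scratch'}
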
+1 −1
@@ -528,7 +528,7 @@ def write_job_metadata(tool_job_working_directory, job_metadata, set_meta, tool_
         new_dataset = Dataset(id=-i, external_filename=new_dataset_filename)
         extra_files = file_dict.get("extra_files", None)
         if extra_files is not None:
-            new_dataset._extra_files_path = os.path.join(tool_job_working_directory, "working", extra_files)
+            new_dataset._extra_files_path = os.path.join(tool_job_working_directory, "outputs", extra_files)
         new_dataset.state = new_dataset.states.OK
         new_dataset_instance = HistoryDatasetAssociation(
             id=-i, dataset=new_dataset, extension=file_dict.get("ext", "data")
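
With hypothetical values, the relocated extra-files path now resolves under the job's outputs directory rather than working:

import os

tool_job_working_directory = "/jobs/123"  # hypothetical
extra_files = "dataset_2_files"           # hypothetical
print(os.path.join(tool_job_working_directory, "outputs", extra_files))
# /jobs/123/outputs/dataset_2_files (previously /jobs/123/working/dataset_2_files)
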
+17 −1
@@ -47,6 +47,21 @@ log = logging.getLogger(__name__)
 logging.getLogger("boto").setLevel(logging.INFO)  # Otherwise boto is quite noisy
 
 
+def download_directory(bucket, remote_folder, local_path):
+    # List objects in the specified S3 folder
+    objects = bucket.list(prefix=remote_folder)
+
+    for obj in objects:
+        remote_file_path = obj.key
+        local_file_path = os.path.join(local_path, os.path.relpath(remote_file_path, remote_folder))
+
+        # Create directories if they don't exist
+        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+        # Download the file
+        obj.get_contents_to_filename(local_file_path)
+
+
 def parse_config_xml(config_xml):
     try:
         a_xml = config_xml.findall("auth")[0]
@@ -720,7 +735,8 @@ class S3ObjectStore(ConcreteObjectStore, CloudConfigMixin):
             return cache_path
         # Check if the file exists in persistent storage and, if it does, pull it into cache
         elif self._exists(obj, **kwargs):
-            if dir_only:  # Directories do not get pulled into cache
+            if dir_only:
+                download_directory(self._bucket, rel_path, cache_path)
                 return cache_path
             else:
                 if self._pull_into_cache(rel_path):
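
A hypothetical usage sketch for the new helper, built on the boto 2 calls the module already relies on; the bucket name, prefix, and local path are placeholders:

import boto

conn = boto.connect_s3()                    # credentials resolved from the environment/config
bucket = conn.get_bucket("galaxy-objects")  # hypothetical bucket name
download_directory(bucket, "e7/dataset_1_files/", "/cache/e7/dataset_1_files")
# Every key under the prefix is fetched with Key.get_contents_to_filename(),
# recreating the remote directory layout under the local cache path.
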
+5 −1
@@ -1174,8 +1174,10 @@ class ToolDataTableManager(Dictifiable):
         out_data: Dict[str, OutputDataset],
         bundle_description: DataTableBundleProcessorDescription,
         repo_info: Optional[RepoInfo],
-    ) -> None:
+    ) -> Dict[str, OutputDataset]:
+        """Write bundles and return the bundle datasets, keyed by bundle path."""
         data_manager_dict = _data_manager_dict(out_data, ensure_single_output=True)
+        bundle_datasets: Dict[str, OutputDataset] = {}
         for output_name, dataset in out_data.items():
             if dataset.ext != "data_manager_json":
                 continue
@@ -1190,6 +1192,8 @@
             bundle_path = os.path.join(extra_files_path, BUNDLE_INDEX_FILE_NAME)
             with open(bundle_path, "w") as fw:
                 json.dump(bundle.dict(), fw)
+            bundle_datasets[bundle_path] = dataset
+        return bundle_datasets
 
 
 SUPPORTED_DATA_TABLE_TYPES = TabularToolDataTable
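
Assuming the method shown is ToolDataTableManager.write_bundle (its name falls outside the hunk) and that out_data and bundle_description are in scope, a caller could consume the returned mapping like so; a sketch, not confirmed Galaxy API:

bundle_datasets = tool_data_table_manager.write_bundle(
    out_data,             # Dict[str, OutputDataset] produced by the data manager job
    bundle_description,   # DataTableBundleProcessorDescription
    repo_info=None,
)
for bundle_path, dataset in bundle_datasets.items():
    print(f"bundle index written to {bundle_path} for dataset {dataset}")
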