Unverified commit fcb5e414, authored by John Chilton and committed by GitHub
Browse files

Merge pull request #20805 from davelopez/add_huggingface_file_source

Add Hugging Face 🤗 file source and user-defined template
parents 49ef7aad 77c4bba0
Loading
Loading
Loading
Loading
+5 −1
Original line number Diff line number Diff line
import { faAws, faDropbox, faGoogleDrive } from "@fortawesome/free-brands-svg-icons";
import { faAws, faDropbox, faGoogleDrive, faHubspot } from "@fortawesome/free-brands-svg-icons";
import { faCloud, faFolderTree, faNetworkWired, type IconDefinition } from "font-awesome-6";

import type { components } from "@/api/schema";
@@ -66,6 +66,10 @@ export const templateTypes: FileSourceTypesDetail = {
        icon: faNetworkWired,
        message: "This is a repository plugin that connects with a Dataverse.org instance.",
    },
    huggingface: {
        icon: faHubspot,
        message: "This is a file repository plugin that connects with the Hugging Face Hub.",
    },
};

export const FileSourcesValidFilters = {
+4 −2
Original line number Diff line number Diff line
@@ -11184,7 +11184,8 @@ export interface components {
                | "inveniordm"
                | "zenodo"
                | "rspace"
                | "dataverse";
                | "dataverse"
                | "huggingface";
            /** Variables */
            variables?:
                | (
@@ -21585,7 +21586,8 @@ export interface components {
                | "inveniordm"
                | "zenodo"
                | "rspace"
                | "dataverse";
                | "dataverse"
                | "huggingface";
            /** Uri Root */
            uri_root: string;
            /**
+3 −0
Original line number Diff line number Diff line
@@ -333,6 +333,9 @@ class ConditionalDependencies:

        return celery_enabled and is_redis_url(celery_result_backend) or is_redis_url(celery_broker_url)

    def check_huggingface_hub(self):
        """Report whether a "huggingface" entry is present in ``self.file_sources``."""
        configured_sources = self.file_sources
        return "huggingface" in configured_sources


def optional(config_file=None):
    if not config_file:
+1 −0
Original line number Diff line number Diff line
@@ -33,6 +33,7 @@ fs.onedatarestfs==21.2.5.2 # type: onedata, depends on onedatafilerestclient
fs-basespace # type: basespace
fs-azureblob # type: azure
rspace-client>=2.6.1,<3  # type: rspace
huggingface_hub  # type: huggingface

# Vault backend
hvac
+23 −10
Original line number Diff line number Diff line
@@ -141,6 +141,7 @@ class FsspecFilesSource(BaseFilesSource[FsspecTemplateConfigType, FsspecResolved
        try:
            cache_options = self._get_cache_options(context.config)
            fs = self._open_fs(context, cache_options)
            path = self._to_filesystem_path(path)

            if recursive:
                return self._list_recursive(fs, path)
@@ -182,6 +183,7 @@ class FsspecFilesSource(BaseFilesSource[FsspecTemplateConfigType, FsspecResolved
        """Download a file from the fsspec filesystem to a local path."""
        cache_options = self._get_cache_options(context.config)
        fs = self._open_fs(context, cache_options)
        source_path = self._to_filesystem_path(source_path)
        fs.get_file(source_path, native_path)

    def _write_from(
@@ -193,6 +195,7 @@ class FsspecFilesSource(BaseFilesSource[FsspecTemplateConfigType, FsspecResolved
        """Upload a file from a local path to the fsspec filesystem."""
        cache_options = self._get_cache_options(context.config)
        fs = self._open_fs(context, cache_options)
        target_path = self._to_filesystem_path(target_path)
        fs.put_file(native_path, target_path)

    def _adapt_entry_path(self, filesystem_path: str) -> str:
@@ -203,17 +206,27 @@ class FsspecFilesSource(BaseFilesSource[FsspecTemplateConfigType, FsspecResolved
        """
        return filesystem_path

    def _to_filesystem_path(self, path: str) -> str:
        """Convert an entry path to the filesystem path format.

        Subclasses can override this to transform paths (e.g., virtual to filesystem paths).
        By default, returns the path unchanged.
        """
        return path

    def _extract_timestamp(self, info: dict) -> Optional[str]:
        """Extract and format timestamp from fsspec file info."""
        # Handle timestamp fields more robustly - check for None explicitly
        mtime = info.get("mtime")
        if mtime is None:
            mtime = info.get("modified")
        if mtime is None:
            mtime = info.get("LastModified")
        """Extract the timestamp from fsspec file info to use it in the RemoteFile entry.

        Subclasses can override this to customize timestamp extraction.
        By default, it tries to extract 'mtime', 'modified', or 'LastModified'
        """
        return info.get("mtime") or info.get("modified") or info.get("LastModified")

        ctime_result = self.to_dict_time(mtime)
        return ctime_result
    def _get_formatted_timestamp(self, info: dict) -> Optional[str]:
        """Get a formatted timestamp for the RemoteFile entry."""
        mtime = self._extract_timestamp(info)
        formatted_timestamp = self.to_dict_time(mtime)
        return formatted_timestamp

    def _info_to_entry(self, info: dict) -> AnyRemoteEntry:
        """Convert fsspec file info to Galaxy's remote entry format."""
@@ -226,7 +239,7 @@ class FsspecFilesSource(BaseFilesSource[FsspecTemplateConfigType, FsspecResolved
            return RemoteDirectory(name=name, uri=uri, path=entry_path)
        else:
            size = int(info.get("size", 0))
            ctime = self._extract_timestamp(info)
            ctime = self._get_formatted_timestamp(info)
            return RemoteFile(name=name, size=size, ctime=ctime, uri=uri, path=entry_path)

    def _list_recursive(self, fs: AbstractFileSystem, path: str) -> tuple[list[AnyRemoteEntry], int]:
Loading