Unverified commit 0aa5616b, authored by Marius van den Beek and committed by GitHub
Browse files

Merge pull request #20820 from mvdbeek/limit_sam_metadata

[25.0] Skip sam metadata if we have too many references
parents 7bcdd373 118bf715
Loading
Loading
Loading
Loading
+30 −13
Original line number Diff line number Diff line
@@ -552,22 +552,29 @@ class _BamOrSam:
    Helper class to set the metadata common to sam and bam files
    """

    max_references = 100000

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        """
        Populate the SAM/BAM header-derived metadata on ``dataset``.

        The alignment file is opened with ``check_sq=False`` and closed again
        by the context manager. When the number of references does not exceed
        ``self.max_references``, the reference names, reference lengths, the
        header and the read group IDs are stored; otherwise those fields are
        left untouched and ``metadata_incomplete`` is set. The sort order and
        format version are read from the ``HD`` header record in both cases.

        Any exception raised while reading the file is swallowed and recorded
        by setting ``metadata_incomplete``.

        :param dataset: dataset whose file is read and whose metadata is set
        :param overwrite: accepted for interface compatibility; not used here
        :param kwd: accepted for interface compatibility; not used here
        """
        try:
            with pysam.AlignmentFile(dataset.get_file_name(), mode="rb", check_sq=False) as bam_file:
                # TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
                if bam_file.nreferences <= self.max_references:
                    dataset.metadata.reference_names = list(bam_file.references)
                    dataset.metadata.reference_lengths = list(bam_file.lengths)
                    dataset.metadata.bam_header = dict(bam_file.header.items())  # type: ignore [attr-defined]
                    dataset.metadata.read_groups = [
                        read_group["ID"]
                        for read_group in dataset.metadata.bam_header.get("RG", [])
                        if "ID" in read_group
                    ]
                else:
                    # Too many references: skip the large fields and flag it.
                    dataset.metadata.metadata_incomplete = True
                # Read HD fields from the open file, not from
                # dataset.metadata.bam_header, which is not set on the
                # too-many-references path.
                dataset.metadata.sort_order = bam_file.header.get("HD", {}).get("SO", None)  # type: ignore [attr-defined]
                dataset.metadata.bam_version = bam_file.header.get("HD", {}).get("VN", None)  # type: ignore [attr-defined]
        except Exception:
            # Per Dan, don't log here because doing so will cause datasets that
            # fail metadata to end in the error state
            dataset.metadata.metadata_incomplete = True


class BamNative(CompressedArchive, _BamOrSam):
@@ -656,6 +663,16 @@ class BamNative(CompressedArchive, _BamOrSam):
        optional=True,
        no_value={},
    )
    MetadataElement(
        name="metadata_incomplete",
        default=False,
        desc="Indicates if metadata is incomplete",
        param=MetadataParameter,
        readonly=True,
        visible=False,
        optional=True,
        no_value=False,
    )

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        _BamOrSam().set_meta(dataset, overwrite=overwrite, **kwd)
@@ -1054,7 +1071,7 @@ class BamInputSorted(BamNative):
        """
        # The best way to ensure that BAM files are coordinate-sorted and indexable
        # is to actually index them.
        with pysam.AlignmentFile(filename=file_name) as f:
        with pysam.AlignmentFile(filename=file_name, check_sq=False) as f:
            # The only sure thing we know here is that the sort order can't be coordinate
            return f.header.get("HD", {}).get("SO") == "coordinate"  # type: ignore[attr-defined]

@@ -1074,7 +1091,7 @@ class BamQuerynameSorted(BamInputSorted):
        """
        # The best way to ensure that BAM files are coordinate-sorted and indexable
        # is to actually index them.
        with pysam.AlignmentFile(filename=file_name) as f:
        with pysam.AlignmentFile(filename=file_name, check_sq=False) as f:
            return f.header.get("HD", {}).get("SO") != "queryname"  # type: ignore[attr-defined]