Loading lib/galaxy/datatypes/binary.py +30 −13 Original line number Diff line number Diff line Loading @@ -552,22 +552,29 @@ class _BamOrSam: Helper class to set the metadata common to sam and bam files """ max_references = 100000 def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None: try: bam_file = pysam.AlignmentFile(dataset.get_file_name(), mode="rb") with pysam.AlignmentFile(dataset.get_file_name(), mode="rb", check_sq=False) as bam_file: # TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary if bam_file.nreferences <= self.max_references: dataset.metadata.reference_names = list(bam_file.references) dataset.metadata.reference_lengths = list(bam_file.lengths) dataset.metadata.bam_header = dict(bam_file.header.items()) # type: ignore [attr-defined] dataset.metadata.read_groups = [ read_group["ID"] for read_group in dataset.metadata.bam_header.get("RG", []) if "ID" in read_group read_group["ID"] for read_group in dataset.metadata.bam_header.get("RG", []) if "ID" in read_group ] dataset.metadata.sort_order = dataset.metadata.bam_header.get("HD", {}).get("SO", None) dataset.metadata.bam_version = dataset.metadata.bam_header.get("HD", {}).get("VN", None) else: dataset.metadata.metadata_incomplete = True dataset.metadata.sort_order = bam_file.header.get("HD", {}).get("SO", None) # type: ignore [attr-defined] dataset.metadata.bam_version = bam_file.header.get("HD", {}).get("VN", None) # type: ignore [attr-defined] except Exception: # Per Dan, don't log here because doing so will cause datasets that # fail metadata to end in the error state pass dataset.metadata.metadata_incomplete = True class BamNative(CompressedArchive, _BamOrSam): Loading Loading @@ -656,6 +663,16 @@ class BamNative(CompressedArchive, _BamOrSam): optional=True, no_value={}, ) MetadataElement( name="metadata_incomplete", default=False, desc="Indicates if metadata is incomplete", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=False, ) def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None: _BamOrSam().set_meta(dataset, overwrite=overwrite, **kwd) Loading Loading @@ -1054,7 +1071,7 @@ class BamInputSorted(BamNative): """ # The best way to ensure that BAM files are coordinate-sorted and indexable # is to actually index them. with pysam.AlignmentFile(filename=file_name) as f: with pysam.AlignmentFile(filename=file_name, check_sq=False) as f: # The only sure thing we know here is that the sort order can't be coordinate return f.header.get("HD", {}).get("SO") == "coordinate" # type: ignore[attr-defined] Loading @@ -1074,7 +1091,7 @@ class BamQuerynameSorted(BamInputSorted): """ # The best way to ensure that BAM files are coordinate-sorted and indexable # is to actually index them. with pysam.AlignmentFile(filename=file_name) as f: with pysam.AlignmentFile(filename=file_name, check_sq=False) as f: return f.header.get("HD", {}).get("SO") != "queryname" # type: ignore[attr-defined] Loading Loading
lib/galaxy/datatypes/binary.py +30 −13 Original line number Diff line number Diff line Loading @@ -552,22 +552,29 @@ class _BamOrSam: Helper class to set the metadata common to sam and bam files """ max_references = 100000 def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None: try: bam_file = pysam.AlignmentFile(dataset.get_file_name(), mode="rb") with pysam.AlignmentFile(dataset.get_file_name(), mode="rb", check_sq=False) as bam_file: # TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary if bam_file.nreferences <= self.max_references: dataset.metadata.reference_names = list(bam_file.references) dataset.metadata.reference_lengths = list(bam_file.lengths) dataset.metadata.bam_header = dict(bam_file.header.items()) # type: ignore [attr-defined] dataset.metadata.read_groups = [ read_group["ID"] for read_group in dataset.metadata.bam_header.get("RG", []) if "ID" in read_group read_group["ID"] for read_group in dataset.metadata.bam_header.get("RG", []) if "ID" in read_group ] dataset.metadata.sort_order = dataset.metadata.bam_header.get("HD", {}).get("SO", None) dataset.metadata.bam_version = dataset.metadata.bam_header.get("HD", {}).get("VN", None) else: dataset.metadata.metadata_incomplete = True dataset.metadata.sort_order = bam_file.header.get("HD", {}).get("SO", None) # type: ignore [attr-defined] dataset.metadata.bam_version = bam_file.header.get("HD", {}).get("VN", None) # type: ignore [attr-defined] except Exception: # Per Dan, don't log here because doing so will cause datasets that # fail metadata to end in the error state pass dataset.metadata.metadata_incomplete = True class BamNative(CompressedArchive, _BamOrSam): Loading Loading @@ -656,6 +663,16 @@ class BamNative(CompressedArchive, _BamOrSam): optional=True, no_value={}, ) MetadataElement( name="metadata_incomplete", default=False, desc="Indicates if metadata is incomplete", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=False, ) def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None: _BamOrSam().set_meta(dataset, overwrite=overwrite, **kwd) Loading Loading @@ -1054,7 +1071,7 @@ class BamInputSorted(BamNative): """ # The best way to ensure that BAM files are coordinate-sorted and indexable # is to actually index them. with pysam.AlignmentFile(filename=file_name) as f: with pysam.AlignmentFile(filename=file_name, check_sq=False) as f: # The only sure thing we know here is that the sort order can't be coordinate return f.header.get("HD", {}).get("SO") == "coordinate" # type: ignore[attr-defined] Loading @@ -1074,7 +1091,7 @@ class BamQuerynameSorted(BamInputSorted): """ # The best way to ensure that BAM files are coordinate-sorted and indexable # is to actually index them. with pysam.AlignmentFile(filename=file_name) as f: with pysam.AlignmentFile(filename=file_name, check_sq=False) as f: return f.header.get("HD", {}).get("SO") != "queryname" # type: ignore[attr-defined] Loading