Unverified Commit bc962ad9 authored by Marius van den Beek's avatar Marius van den Beek Committed by GitHub
Browse files

Merge pull request #15024 from lldelisle/tabix_download_22.05

Enable Tabix download + New datatype JuicerMediumTabix in 22.05
parents b1aa2d0b 1000c67c
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -476,6 +476,10 @@
    <datatype extension="protobuf3" type="galaxy.datatypes.binary:Binary" subclass="true" description="Protocol Buffers (Protobuf) is data format for serializing structured data." display_in_upload="true"/>
    <datatype extension="onnx" type="galaxy.datatypes.binary:Binary" subclass="true" description="ONNX (Open neural network exchange) is data format for storing and sharing machine learning and deep learning models." display_in_upload="true"/>
    <datatype extension="tabix" type="galaxy.datatypes.binary:Binary" subclass="true"/>
    <datatype extension="interval_tabix.gz" type="galaxy.datatypes.interval:IntervalTabix" mimetype="application/octet-stream" />
    <datatype extension="juicer_medium_tabix.gz" type="galaxy.datatypes.interval:JuicerMediumTabix" mimetype="application/octet-stream" display_in_upload="true"/>
    <datatype extension="bed_tabix.gz" type="galaxy.datatypes.interval:BedTabix" mimetype="application/octet-stream" display_in_upload="true"/>
    <datatype extension="gff_tabix.gz" type="galaxy.datatypes.interval:GffTabix" mimetype="application/octet-stream" display_in_upload="true"/>
    <datatype extension="bgzip" type="galaxy.datatypes.binary:Binary" subclass="true" display_in_upload="true"/>
    <datatype extension="vcf_bgzip" type="galaxy.datatypes.tabular:VcfGz" display_in_upload="true">
      <display file="igv/vcf.xml"/>
+102 −0
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ import sys
import tempfile
from urllib.parse import quote_plus

import pysam
from bx.intervals.io import (
    GenomicIntervalReader,
    ParseError,
@@ -1729,6 +1730,107 @@ class ScIdx(Tabular):
        return False


class IntervalTabix(Interval):
    """
    Class describing the bgzip format (http://samtools.github.io/hts-specs/SAMv1.pdf)
    As tabix is just a bgzip sorted for chr,start,end with an index
    """

    file_ext = "interval_tabix.gz"
    edam_format = "format_3616"
    compressed = True
    compressed_format = "gzip"

    # The MetadataElements are readonly so the user cannot change them (as the index is generated only once)
    MetadataElement(name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter, readonly=True)
    MetadataElement(name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter, readonly=True)
    MetadataElement(name="endCol", default=3, desc="End column", param=metadata.ColumnParameter, readonly=True)

    # Add metadata elements
    MetadataElement(
        name="tabix_index",
        desc="Tabix Index File",
        param=metadata.FileParameter,
        file_ext="tbi",
        readonly=True,
        visible=False,
        optional=True,
    )

    # We don't want to have a good sniff as the index would be created before the metadata on columns is set.
    def sniff_prefix(self, file_prefix: FilePrefix):
        return False

    # Ideally the tabix_index would be regenerated when the metadataElements are updated
    def set_meta(self, dataset, overwrite=True, first_line_is_header=False, metadata_tmp_files_dir=None, **kwd):
        # We don't use the method Interval.set_meta as we don't want to guess the columns for chr start end
        Tabular.set_meta(self, dataset, overwrite=overwrite, skip=0)
        # Try to create the index for the Tabix file.
        # These metadata values are not accessible by users, always overwrite
        index_file = dataset.metadata.tabix_index
        if not index_file:
            index_file = dataset.metadata.spec["tabix_index"].param.new_file(
                dataset=dataset, metadata_tmp_files_dir=metadata_tmp_files_dir
            )

        try:
            # tabix_index columns are 0-based while in the command line it is 1-based
            pysam.tabix_index(
                dataset.file_name,
                index=index_file.file_name,
                seq_col=dataset.metadata.chromCol - 1,
                start_col=dataset.metadata.startCol - 1,
                end_col=dataset.metadata.endCol - 1,
                keep_original=True,
                force=True,
            )
        except Exception as e:
            raise Exception(f"Error setting tabix metadata: {util.unicodify(e)}")
        else:
            dataset.metadata.tabix_index = index_file


class JuicerMediumTabix(IntervalTabix):
    """
    Class describing a tabix file built from a juicer medium format:
    https://github.com/aidenlab/juicer/wiki/Pre#medium-format
    <readname> <str1> <chr1> <pos1> <frag1> <str2> <chr2> <pos2> <frag2> <mapq1> <mapq2>

    str = strand (0 for forward, anything else for reverse)
    chr = chromosome (must be a chromosome in the genome)
    pos = position
    frag = restriction site fragment
    mapq = mapping quality score
    """

    file_ext = "juicer_medium_tabix.gz"

    # The MetadataElements are readonly so the user cannot change them (as the index is generated only once)
    MetadataElement(name="chromCol", default=3, desc="Chrom column", param=metadata.ColumnParameter, readonly=True)
    MetadataElement(name="startCol", default=4, desc="Start column", param=metadata.ColumnParameter, readonly=True)
    MetadataElement(name="endCol", default=4, desc="End column", param=metadata.ColumnParameter, readonly=True)


class BedTabix(IntervalTabix):
    """
    Class describing a tabix file built from a bed file
    """

    file_ext = "bed_tabix.gz"


class GffTabix(IntervalTabix):
    """
    Class describing a tabix file built from a bed file
    """

    file_ext = "gff_tabix.gz"

    # The MetadataElements are readonly so the user cannot change them (as the index is generated only once)
    MetadataElement(name="startCol", default=4, desc="Start column", param=metadata.ColumnParameter, readonly=True)
    MetadataElement(name="endCol", default=5, desc="End column", param=metadata.ColumnParameter, readonly=True)


if __name__ == "__main__":
    import doctest