Unverified Commit 7adb2a38 authored by Björn Grüning's avatar Björn Grüning Committed by GitHub
Browse files

Merge pull request #9592 from nsoranzo/release_19.09_add_paf_gfa_datatypes

[19.09] Add PAF, GFA1 and odgi datatypes
parents 07760e25 8aa55208
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -17,6 +17,8 @@
    <datatype extension="anvio_structure_db" type="galaxy.datatypes.anvio:AnvioStructureDB" display_in_upload="false" />
    <datatype extension="anvio_variability" type="galaxy.datatypes.tabular:TSV" display_in_upload="false" subclass="true" />
    <datatype extension="arff" type="galaxy.datatypes.text:Arff" mimetype="text/plain" display_in_upload="true"/>
    <datatype extension="paf" type="galaxy.datatypes.text:Paf" mimetype="text/plain" display_in_upload="true"/>
    <datatype extension="gfa1" type="galaxy.datatypes.text:Gfa1" mimetype="text/plain" display_in_upload="true"/>
    <datatype extension="asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" display_in_upload="true"/>
    <datatype extension="asn1-binary" type="galaxy.datatypes.binary:GenericAsn1Binary" mimetype="application/octet-stream" display_in_upload="true"/>
    <datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true" description="blastz pairwise alignment format.  Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.  Blocks are separated from one another by blank lines.  The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Axt"/>
@@ -376,6 +378,7 @@
      <display file="igb/wig.xml"/>
    </datatype>
    <datatype extension="interval_index" type="galaxy.datatypes.binary:Binary" subclass="true"/>
    <datatype extension="odgi" type="galaxy.datatypes.binary:Binary" subclass="true" description="Genomic variation graphs self index used by odgi."/>
    <datatype extension="tabix" type="galaxy.datatypes.binary:Binary" subclass="true"/>
    <datatype extension="bgzip" type="galaxy.datatypes.binary:Binary" subclass="true"/>
    <datatype extension="vcf_bgzip" type="galaxy.datatypes.tabular:VcfGz" display_in_upload="true">
@@ -876,6 +879,7 @@
    <sniffer type="galaxy.datatypes.interval:Gff"/>
    <sniffer type="galaxy.datatypes.interval:Gff3"/>
    <sniffer type="galaxy.datatypes.tabular:Pileup"/>
    <sniffer type="galaxy.datatypes.text:Paf"/>
    <sniffer type="galaxy.datatypes.interval:Interval"/>
    <sniffer type="galaxy.datatypes.tabular:Sam"/>
    <sniffer type="galaxy.datatypes.data:Newick"/>
+245 −0

File added.

Preview size limit exceeded, changes collapsed.

+137 −0
Original line number Diff line number Diff line
# Downloaded from https://github.com/sjackman/gfalint/blob/master/examples/big1.gfa
H	VN:Z:1.0
S	3	*	LN:i:5376	KC:i:485841
S	5	*	LN:i:124	KC:i:6306
S	7	*	LN:i:3602	KC:i:327981
S	8	*	LN:i:52	KC:i:2218
S	9	*	LN:i:48	KC:i:154
S	10	*	LN:i:144	KC:i:4604
S	11	*	LN:i:6491	KC:i:611916
S	13	*	LN:i:10648	KC:i:1023627
S	14	*	LN:i:48	KC:i:16
S	15	*	LN:i:5930	KC:i:599587
S	17	*	LN:i:9404	KC:i:991226
S	18	*	LN:i:2206	KC:i:240150
S	19	*	LN:i:893	KC:i:82277
S	20	*	LN:i:12201	KC:i:1351219
S	21	*	LN:i:49	KC:i:300
S	22	*	LN:i:11669	KC:i:1131776
S	23	*	LN:i:7665	KC:i:766424
S	27	*	LN:i:48	KC:i:35
S	28	*	LN:i:1597	KC:i:142681
S	30	*	LN:i:48	KC:i:7874
S	31	*	LN:i:94	KC:i:805
S	32	*	LN:i:3065	KC:i:275287
S	33	*	LN:i:70	KC:i:5123
S	34	*	LN:i:49	KC:i:23
S	36	*	LN:i:12335	KC:i:1315304
S	37	*	LN:i:130	KC:i:5258
S	38	*	LN:i:4516	KC:i:440430
S	39	*	LN:i:59	KC:i:4094
S	41	*	LN:i:215	KC:i:11423
S	42	*	LN:i:146	KC:i:3161
S	43	*	LN:i:50	KC:i:563
S	44	*	LN:i:3913	KC:i:434585
S	46	*	LN:i:2710	KC:i:268203
S	47	*	LN:i:49	KC:i:200
S	49	*	LN:i:61	KC:i:3442
S	50	*	LN:i:5010	KC:i:532820
S	51	*	LN:i:9644	KC:i:950549
S	52	*	LN:i:417	KC:i:16750
S	54	*	LN:i:50	KC:i:600
S	55	*	LN:i:69	KC:i:11532
S	56	*	LN:i:1550	KC:i:141042
S	57	*	LN:i:346	KC:i:12512
S	58	*	LN:i:3825	KC:i:349334
S	59	*	LN:i:11508	KC:i:1237494
S	60	*	LN:i:720	KC:i:72448
S	61	*	LN:i:1086	KC:i:83405
S	62	*	LN:i:4818	KC:i:464490
S	63	*	LN:i:8207	KC:i:822244
S	65	*	LN:i:67	KC:i:2088
S	66	*	LN:i:4375	KC:i:421128
S	67	*	LN:i:5063	KC:i:530425
S	68	*	LN:i:78	KC:i:10057
S	69	*	LN:i:10204	KC:i:1039066
S	71	*	LN:i:789	KC:i:65350
S	72	*	LN:i:95	KC:i:6170
S	73	*	LN:i:48	KC:i:14
S	76	*	LN:i:54	KC:i:1373
S	77	*	LN:i:4167	KC:i:460259
S	78	*	LN:i:49	KC:i:925
S	79	*	LN:i:95	KC:i:5402
S	83	*	LN:i:15015	KC:i:1487985
S	84	*	LN:i:9811	KC:i:1021233
S	85	*	LN:i:3483	KC:i:365113
S	86	*	LN:i:7953	KC:i:722908
L	3	+	65	-	47M
L	3	-	76	-	47M
L	5	+	47	-	47M
L	5	-	22	-	47M
L	5	-	32	+	47M
L	7	+	49	-	47M
L	8	+	33	-	47M
L	8	+	55	-	36M
L	8	-	78	+	47M
L	8	-	79	+	47M
L	9	+	37	-	47M
L	9	+	47	+	46M
L	9	-	67	-	47M
L	9	-	69	-	47M
L	10	+	47	+	47M
L	11	-	43	-	47M
L	14	+	30	+	47M
L	14	+	31	+	47M
L	14	+	73	-	47M
L	17	+	21	-	47M
L	18	+	78	-	47M
L	19	+	76	+	47M
L	19	-	76	-	47M
L	20	-	76	+	47M
L	21	+	51	-	47M
L	21	-	32	-	47M
L	21	-	66	+	47M
L	22	+	63	+	47M
L	23	+	68	-	47M
L	27	+	30	+	47M
L	27	+	31	+	47M
L	27	+	73	-	47M
L	27	-	31	-	47M
L	27	-	83	+	47M
L	28	+	43	+	47M
L	30	+	30	+	47M
L	30	+	31	+	47M
L	30	+	73	-	47M
L	30	-	34	-	47M
L	31	-	34	-	47M
L	32	-	63	+	47M
L	33	+	72	-	47M
L	33	-	44	-	47M
L	33	-	55	-	16M
L	34	+	73	-	47M
L	36	-	63	-	47M
L	36	-	85	+	47M
L	37	-	49	+	47M
L	39	+	56	+	47M
L	39	+	67	+	47M
L	39	-	71	-	47M
L	39	-	65	+	14M
L	41	+	43	+	47M
L	42	+	65	-	47M
L	43	+	60	+	47M
L	47	+	59	+	47M
L	49	+	71	+	47M
L	49	-	84	+	47M
L	52	-	65	+	47M
L	54	+	77	-	47M
L	54	+	54	+	46M
L	54	-	58	-	47M
L	55	+	72	-	36M
L	55	-	79	-	47M
L	55	-	78	-	44M
L	57	+	63	-	47M
L	57	+	85	+	47M
L	61	+	68	+	47M
L	68	+	68	+	26M
L	72	-	78	+	47M
L	72	-	79	+	47M
+81 −0
Original line number Diff line number Diff line
@@ -730,3 +730,84 @@ class IQTree(Text):
        False
        """
        return file_prefix.startswith("IQ-TREE")


@build_sniff_from_prefix
class Paf(Text):
    """
    PAF: a Pairwise mApping Format

    https://github.com/lh3/miniasm/blob/master/PAF.md
    """
    file_ext = "paf"

    def sniff_prefix(self, file_prefix):
        """
        Decide whether the beginning of a file looks like PAF.

        Every line yielded by ``iter_headers`` must have at least 12
        tab-separated columns, with:

        - integer values in columns 2-4 and 7-12 (1-based),
        - a strand of ``+`` or ``-`` in column 5,
        - a mapping quality (column 12) in the range 0-255,
        - any further columns shaped like SAM typed tags (``TAG:TYPE:VALUE``).

        Returns ``True`` only if at least one line was inspected and every
        inspected line passed; malformed content yields ``False`` rather than
        raising.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('A-3105.paf')
        >>> Paf().sniff(fname)
        True
        """
        found_valid_lines = False
        for line in iter_headers(file_prefix, "\t"):
            if len(line) < 12:
                return False
            try:
                for i in (1, 2, 3, 6, 7, 8, 9, 10):
                    int(line[i])
                mapping_quality = int(line[11])
            except ValueError:
                # A non-numeric value in a numeric column means "not PAF",
                # not an error condition for the caller.
                return False
            if line[4] not in ('+', '-'):
                return False
            if not (0 <= mapping_quality <= 255):
                return False
            # Check that the optional columns after the 12th contain SAM-like typed key-value pairs.
            # Split at most twice: the value part may itself contain ':'
            # (e.g. a short-form cs tag such as "cs:Z::6-ata:10+gtc").
            for i in range(12, len(line)):
                if len(line[i].split(':', 2)) != 3:
                    return False
            found_valid_lines = True
        return found_valid_lines


@build_sniff_from_prefix
class Gfa1(Text):
    """
    Graphical Fragment Assembly (GFA) 1.0

    http://gfa-spec.github.io/GFA-spec/GFA1.html
    """
    file_ext = "gfa1"

    def sniff_prefix(self, file_prefix):
        """
        Decide whether the beginning of a file looks like GFA 1.0.

        Lines whose first column starts with ``#`` are skipped. Every other
        line yielded by ``iter_headers`` is checked by its record type (first
        tab-separated column):

        - ``H``: decides the result immediately; the line must be exactly
          ``H<TAB>VN:Z:1.0``.
        - ``S``: at least 3 columns.
        - ``L``: at least 6 columns, orientations (columns 3 and 5, 1-based)
          must be ``+`` or ``-``.
        - ``C``: at least 7 columns, same orientation rule, and an integer in
          column 6.
        - ``P``: at least 4 columns.

        Any other record type rejects the file. Without a header line, the
        result is ``True`` only if at least one record was inspected and all
        inspected records passed. Malformed content yields ``False`` rather
        than raising.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('big.gfa1')
        >>> Gfa1().sniff(fname)
        True
        """
        found_valid_lines = False
        for line in iter_headers(file_prefix, "\t"):
            record_type = line[0]
            if record_type.startswith('#'):
                continue
            if record_type == 'H':
                # The header is treated as conclusive: no further lines are read.
                return len(line) == 2 and line[1] == 'VN:Z:1.0'
            elif record_type == 'S':
                if len(line) < 3:
                    return False
            elif record_type in ('L', 'C'):
                # Link and containment records share the orientation columns
                # and differ only in their minimum column count.
                min_columns = 6 if record_type == 'L' else 7
                if len(line) < min_columns:
                    return False
                if any(line[i] not in ('+', '-') for i in (2, 4)):
                    return False
                if record_type == 'C':
                    try:
                        int(line[5])
                    except ValueError:
                        # A non-numeric value here means "not GFA1",
                        # not an error for the caller.
                        return False
            elif record_type == 'P':
                if len(line) < 4:
                    return False
            else:
                return False
            found_valid_lines = True
        return found_valid_lines