Newer
Older
{
*s++ = '\n'; // replace first one with 0x0a
if (*s == '\n') g.push(s, 1);
}
else if (s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')) // comment ends here
{
*g.flush(s) = 0;
return s + (s[2] == '>' ? 3 : 2);
}
else if (*s == 0)
{
return 0;
}
else ++s;
}
}
arseny.kapoulkine@gmail.com
committed
// Normalizes line endings of a CDATA section in place and zero-terminates its text.
// A lone 0x0d or a 0x0d 0x0a pair becomes a single 0x0a.
// Returns the position one past the first ']' of the closing "]]>" sequence, or 0 if the
// buffer ends before the section is closed.
// NOTE(review): endch is not referenced directly here; presumably PUGI__ENDSWITH reads it - confirm.
PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch)
{
	gap g;

	for (;;)
	{
		PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_cdata));

		if (*s == 0)
		{
			// input ended before the terminator was found
			return 0;
		}

		if (*s == '\r')
		{
			// overwrite the 0x0d, then drop a directly following 0x0a through the gap
			*s++ = '\n';

			if (*s == '\n') g.push(s, 1);

			continue;
		}

		if (s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>'))
		{
			// CDATA ends here: close any pending gap and terminate the text
			char_t* tail = g.flush(s);
			*tail = 0;

			return s + 1;
		}

		++s;
	}
}
typedef char_t* (*strconv_pcdata_t)(char_t*);
// In-place PCDATA converter. The three option types each expose a boolean ::value:
//   opt_trim   - strip whitespace at the end of the text
//   opt_eol    - turn 0x0d and 0x0d 0x0a into 0x0a
//   opt_escape - hand '&' sequences to strconv_escape
template <typename opt_trim, typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
{
	// Converts the text starting at s and zero-terminates it.
	// Returns the position right after the '<' that ended the text, or the position of the
	// null terminator when the buffer ends first.
	static char_t* parse(char_t* s)
	{
		gap g;

		// first character of the text; trimming must not move in front of it
		char_t* begin = s;

		while (true)
		{
			PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_pcdata));

			if (*s == '<') // PCDATA ends here
			{
				char_t* end = g.flush(s);

				if (opt_trim::value)
					while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
						--end;

				*end = 0;

				return s + 1;
			}
			else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
			{
				*s++ = '\n'; // replace first one with 0x0a

				if (*s == '\n') g.push(s, 1);
			}
			else if (opt_escape::value && *s == '&')
			{
				s = strconv_escape(s, g);
			}
			else if (*s == 0)
			{
				// end of buffer: terminate like the '<' case but stay on the terminator
				char_t* end = g.flush(s);

				if (opt_trim::value)
					while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
						--end;

				*end = 0;

				return s;
			}
			else ++s;
		}
	}
};
arseny.kapoulkine@gmail.com
committed
// Selects the strconv_pcdata_impl instantiation matching the parse options in optmask.
// The static assert pins the flag values the shifts below rely on: after shifting,
// bit 0 is parse_escapes, bit 1 is parse_eol and bit 2 is parse_trim_pcdata.
PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
{
	PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800);

	switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) // get bitmask for flags (trim eol escapes)
	{
	case 0: return strconv_pcdata_impl<opt_false, opt_false, opt_false>::parse;
	case 1: return strconv_pcdata_impl<opt_false, opt_false, opt_true>::parse;
	case 2: return strconv_pcdata_impl<opt_false, opt_true, opt_false>::parse;
	case 3: return strconv_pcdata_impl<opt_false, opt_true, opt_true>::parse;
	case 4: return strconv_pcdata_impl<opt_true, opt_false, opt_false>::parse;
	case 5: return strconv_pcdata_impl<opt_true, opt_false, opt_true>::parse;
	case 6: return strconv_pcdata_impl<opt_true, opt_true, opt_false>::parse;
	case 7: return strconv_pcdata_impl<opt_true, opt_true, opt_true>::parse;
	default: assert(false); return 0; // should not get here
	}
}
typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
// In-place attribute value converters. Every variant reads up to end_quote, zero-terminates
// the value and returns the position after the quote, or returns 0 when the buffer ends
// before the quote is found. When opt_escape::value is set, '&' sequences are handed to
// strconv_escape.
template <typename opt_escape> struct strconv_attribute_impl
{
	// Whitespace normalization: leading whitespace is removed, each inner run of whitespace
	// becomes a single ' ', and whitespace in front of the closing quote is overwritten
	// with zeros.
	static char_t* parse_wnorm(char_t* s, char_t end_quote)
	{
		gap g;

		// trim leading whitespaces
		if (PUGI__IS_CHARTYPE(*s, ct_space))
		{
			char_t* str = s;

			do ++str;
			while (PUGI__IS_CHARTYPE(*str, ct_space));

			g.push(s, str - s);
		}

		while (true)
		{
			PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space));

			if (*s == end_quote)
			{
				char_t* str = g.flush(s);

				// terminate, then walk backwards zeroing trailing whitespace
				do *str-- = 0;
				while (PUGI__IS_CHARTYPE(*str, ct_space));

				return s + 1;
			}
			else if (PUGI__IS_CHARTYPE(*s, ct_space))
			{
				*s++ = ' ';

				// collapse the rest of the whitespace run
				if (PUGI__IS_CHARTYPE(*s, ct_space))
				{
					char_t* str = s + 1;
					while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str;

					g.push(s, str - s);
				}
			}
			else if (opt_escape::value && *s == '&')
			{
				s = strconv_escape(s, g);
			}
			else if (!*s)
			{
				return 0;
			}
			else ++s;
		}
	}

	// Whitespace conversion: every whitespace character becomes ' '; a 0x0d 0x0a pair
	// becomes a single ' '.
	static char_t* parse_wconv(char_t* s, char_t end_quote)
	{
		gap g;

		while (true)
		{
			PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws));

			if (*s == end_quote)
			{
				*g.flush(s) = 0;

				return s + 1;
			}
			else if (PUGI__IS_CHARTYPE(*s, ct_space))
			{
				if (*s == '\r')
				{
					*s++ = ' ';

					if (*s == '\n') g.push(s, 1);
				}
				else *s++ = ' ';
			}
			else if (opt_escape::value && *s == '&')
			{
				s = strconv_escape(s, g);
			}
			else if (!*s)
			{
				return 0;
			}
			else ++s;
		}
	}

	// End-of-line normalization: 0x0d and 0x0d 0x0a become 0x0a.
	static char_t* parse_eol(char_t* s, char_t end_quote)
	{
		gap g;

		while (true)
		{
			PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));

			if (*s == end_quote)
			{
				*g.flush(s) = 0;

				return s + 1;
			}
			else if (*s == '\r')
			{
				*s++ = '\n';

				if (*s == '\n') g.push(s, 1);
			}
			else if (opt_escape::value && *s == '&')
			{
				s = strconv_escape(s, g);
			}
			else if (!*s)
			{
				return 0;
			}
			else ++s;
		}
	}

	// No whitespace or end-of-line processing; only optional escape handling.
	static char_t* parse_simple(char_t* s, char_t end_quote)
	{
		gap g;

		while (true)
		{
			PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));

			if (*s == end_quote)
			{
				*g.flush(s) = 0;

				return s + 1;
			}
			else if (opt_escape::value && *s == '&')
			{
				s = strconv_escape(s, g);
			}
			else if (!*s)
			{
				return 0;
			}
			else ++s;
		}
	}
};
arseny.kapoulkine@gmail.com
committed
// Selects the attribute value converter matching the parse options in optmask.
// After the shift, bit 0 is parse_escapes, bit 1 is parse_eol, bit 2 is parse_wconv_attribute
// and bit 3 is parse_wnorm_attribute (pinned by the static assert). As the table shows,
// wconv takes precedence over eol, and wnorm takes precedence over both.
PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
{
	PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);

	switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
	{
	case 0: return strconv_attribute_impl<opt_false>::parse_simple;
	case 1: return strconv_attribute_impl<opt_true>::parse_simple;
	case 2: return strconv_attribute_impl<opt_false>::parse_eol;
	case 3: return strconv_attribute_impl<opt_true>::parse_eol;
	case 4: return strconv_attribute_impl<opt_false>::parse_wconv;
	case 5: return strconv_attribute_impl<opt_true>::parse_wconv;
	case 6: return strconv_attribute_impl<opt_false>::parse_wconv;
	case 7: return strconv_attribute_impl<opt_true>::parse_wconv;
	case 8: return strconv_attribute_impl<opt_false>::parse_wnorm;
	case 9: return strconv_attribute_impl<opt_true>::parse_wnorm;
	case 10: return strconv_attribute_impl<opt_false>::parse_wnorm;
	case 11: return strconv_attribute_impl<opt_true>::parse_wnorm;
	case 12: return strconv_attribute_impl<opt_false>::parse_wnorm;
	case 13: return strconv_attribute_impl<opt_true>::parse_wnorm;
	case 14: return strconv_attribute_impl<opt_false>::parse_wnorm;
	case 15: return strconv_attribute_impl<opt_true>::parse_wnorm;
	default: assert(false); return 0; // should not get here
	}
}
// Builds an xml_parse_result carrying the given status and buffer offset (0 by default).
inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
{
	xml_parse_result outcome;

	outcome.offset = offset;
	outcome.status = status;

	return outcome;
}
struct xml_parser
{
xml_allocator alloc;
char_t* error_offset;
xml_parse_status error_status;
arseny.kapoulkine@gmail.com
committed
// Copies the given allocator into the parser and starts with no error recorded
// (error_offset is 0, error_status is status_ok).
xml_parser(const xml_allocator& alloc_): alloc(alloc_), error_offset(0), error_status(status_ok)
{
}
// DOCTYPE consists of nested sections of the following possible types:
// <!-- ... -->, <? ... ?>, "...", '...'
// <![...]]>
// <!...>
// First group can not contain nested groups
// Second group can contain nested groups of the same type
// Third group can contain all other groups
arseny.kapoulkine
committed
// Skips one primitive DOCTYPE section starting at s: a quoted string, <? ... ?> or
// <!-- ... -->. Returns the position right after the section.
// Anything else, or a section that is still open at the end of the buffer, is reported
// as status_bad_doctype.
char_t* parse_doctype_primitive(char_t* s)
{
	if (*s == '"' || *s == '\'')
	{
		// quoted string
		char_t ch = *s++;
		PUGI__SCANFOR(*s == ch);
		if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);

		s++;
	}
	else if (s[0] == '<' && s[1] == '?')
	{
		// <? ... ?>
		s += 2;
		PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
		if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);

		s += 2;
	}
	else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
	{
		// <!-- ... -->
		s += 4;
		PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
		if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);

		// step over the three characters of '-->'
		s += 3;
	}
	else PUGI__THROW_ERROR(status_bad_doctype, s);

	return s;
}
arseny.kapoulkine
committed
// Skips a '<![ ... ]]>' section, including sections of the same kind nested inside it.
// s must point at the opening '<'. Returns the position right after the closing ']]>'.
// Reaching the end of the buffer first is reported as status_bad_doctype.
char_t* parse_doctype_ignore(char_t* s)
{
	assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
	s++;

	while (*s)
	{
		if (s[0] == '<' && s[1] == '!' && s[2] == '[')
		{
			// nested ignore section
			s = parse_doctype_ignore(s);
			if (!s) return s;
		}
		else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
		{
			// ignore section end
			s += 3;

			return s;
		}
		else s++;
	}

	PUGI__THROW_ERROR(status_bad_doctype, s);
}
arseny.kapoulkine
committed
// Scans a '<!...' group, descending into nested groups, ignore sections and primitive
// sections. Returns the position of the '>' that closes the group (not stepped over).
// If the buffer ends first, the position of the null terminator is returned only for the
// top-level group when endch is '>'; otherwise status_bad_doctype is reported.
char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel)
{
	assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
	s++;

	while (*s)
	{
		if (s[0] == '<' && s[1] == '!' && s[2] != '-')
		{
			if (s[2] == '[')
			{
				// ignore
				s = parse_doctype_ignore(s);
				if (!s) return s;
			}
			else
			{
				// some control group
				s = parse_doctype_group(s, endch, false);
				if (!s) return s;

				// skip >
				assert(*s == '>');
				s++;
			}
		}
		else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
		{
			// unknown tag (forbidden), or some primitive group
			s = parse_doctype_primitive(s);
			if (!s) return s;
		}
		else if (*s == '>')
		{
			return s;
		}
		else s++;
	}

	if (!toplevel || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);

	return s;
}
arseny.kapoulkine
committed
// Parses a construct that starts with '<!': a comment, a CDATA section or a DOCTYPE.
// s points at the '!'. Nodes are created only when the matching parse_comments,
// parse_cdata or parse_doctype option is set; otherwise the construct is just skipped.
// cursor is taken by value, so node pushes done here do not move the caller's cursor.
// Returns the position after the construct.
// NOTE(review): presumably PUGI__THROW_ERROR / PUGI__CHECK_ERROR record the error and
// return 0 from this function - confirm against the macro definitions.
char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch)
{
	// parse node contents, starting with exclamation mark
	++s;

	if (*s == '-') // '<!-...'
	{
		++s;

		if (*s == '-') // '<!--...'
		{
			++s;

			if (PUGI__OPTSET(parse_comments))
			{
				PUGI__PUSHNODE(node_comment); // Append a new node on the tree.
				cursor->value = s; // Save the offset.
			}

			if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments))
			{
				s = strconv_comment(s, endch);

				if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value);
			}
			else
			{
				// Scan for terminating '-->'.
				PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>'));
				PUGI__CHECK_ERROR(status_bad_comment, s);

				if (PUGI__OPTSET(parse_comments))
					*s = 0; // Zero-terminate this segment at the first terminating '-'.

				s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'.
			}
		}
		else PUGI__THROW_ERROR(status_bad_comment, s);
	}
	else if (*s == '[')
	{
		// '<![CDATA[...'
		if (*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
		{
			++s;

			if (PUGI__OPTSET(parse_cdata))
			{
				PUGI__PUSHNODE(node_cdata); // Append a new node on the tree.
				cursor->value = s; // Save the offset.

				if (PUGI__OPTSET(parse_eol))
				{
					s = strconv_cdata(s, endch);

					if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value);
				}
				else
				{
					// Scan for terminating ']]>'.
					PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>'));
					PUGI__CHECK_ERROR(status_bad_cdata, s);

					*s++ = 0; // Zero-terminate this segment.
				}
			}
			else // Flagged for discard, but we still have to scan for the terminator.
			{
				// Scan for terminating ']]>'.
				PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>'));
				PUGI__CHECK_ERROR(status_bad_cdata, s);

				++s;
			}

			s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'.
		}
		else PUGI__THROW_ERROR(status_bad_cdata, s);
	}
	else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && PUGI__ENDSWITH(s[6], 'E'))
	{
		// go back to the '<' of '<!DOCTYPE': parse_doctype_group expects to start there
		s -= 2;

		if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s);

		// first character after '<!DOCTYPE' (9 characters)
		char_t* mark = s + 9;

		s = parse_doctype_group(s, endch, true);
		if (!s) return s;

		assert((*s == 0 && endch == '>') || *s == '>');
		if (*s) *s++ = 0;

		if (PUGI__OPTSET(parse_doctype))
		{
			while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark;

			PUGI__PUSHNODE(node_doctype);

			cursor->value = mark;
		}
	}
	else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s);
	else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s);
	else PUGI__THROW_ERROR(status_unrecognized_tag, s);

	return s;
}
arseny.kapoulkine
committed
// Parses a construct that starts with '<?': a processing instruction or an XML declaration
// (target equal to "xml", compared case-insensitively). s points at the '?'.
// A node is created only when parse_declaration / parse_pi is set for the respective kind;
// otherwise the construct is skipped up to its '?>'.
// For a declaration with content, the function returns with s at the start of that content
// and the cursor left on the declaration node, so the caller can parse the attributes.
// ref_cursor is read on entry and written back on the normal exit path.
char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch)
{
	// load into registers
	xml_node_struct* cursor = ref_cursor;
	char_t ch = 0;

	// parse node contents, starting with question mark
	++s;

	// read PI target
	char_t* target = s;

	if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s);

	PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol));
	PUGI__CHECK_ERROR(status_bad_pi, s);

	// determine node type; stricmp / strcasecmp is not portable
	bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s;

	if (declaration ? PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi))
	{
		if (declaration)
		{
			// disallow non top-level declarations
			if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s);

			PUGI__PUSHNODE(node_declaration);
		}
		else
		{
			PUGI__PUSHNODE(node_pi);
		}

		cursor->name = target;

		PUGI__ENDSEG();

		// parse value/attributes
		if (ch == '?')
		{
			// empty node
			if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s);
			s += (*s == '>');

			PUGI__POPNODE();
		}
		else if (PUGI__IS_CHARTYPE(ch, ct_space))
		{
			PUGI__SKIPWS();

			// scan for tag end
			char_t* value = s;

			PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>'));
			PUGI__CHECK_ERROR(status_bad_pi, s);

			if (declaration)
			{
				// replace ending ? with / so that 'element' terminates properly
				*s = '/';

				// we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES
				s = value;
			}
			else
			{
				// store value and step over >
				cursor->value = value;

				PUGI__POPNODE();

				PUGI__ENDSEG();

				s += (*s == '>');
			}
		}
		else PUGI__THROW_ERROR(status_bad_pi, s);
	}
	else
	{
		// scan for tag end
		PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>'));
		PUGI__CHECK_ERROR(status_bad_pi, s);

		s += (s[1] == '>' ? 2 : 1);
	}

	// store from registers
	ref_cursor = cursor;

	return s;
}
Arseny Kapoulkine
committed
// Main parsing loop: walks the zero-terminated buffer at s and builds nodes below root.
// The attribute and PCDATA converters are picked once from optmsk. endch is the character
// that the null terminator replaced in the original buffer (see parse()), so constructs
// that are cut off by the terminator can still be accepted when endch completes them.
// Returns the final position on success.
// NOTE(review): presumably PUGI__THROW_ERROR records error_status / error_offset and
// returns 0 from this function - confirm against the macro definition.
char_t* parse_tree(char_t* s, xml_node_struct* root, unsigned int optmsk, char_t endch)
{
	strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
	strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk);

	char_t ch = 0;
	xml_node_struct* cursor = root;
	char_t* mark = s;

	while (*s != 0)
	{
		if (*s == '<')
		{
			++s;

		LOC_TAG:
			if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...'
			{
				PUGI__PUSHNODE(node_element); // Append a new node to the tree.

				cursor->name = s;

				PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator.
				PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.

				if (ch == '>')
				{
					// end of tag
				}
				else if (PUGI__IS_CHARTYPE(ch, ct_space))
				{
				LOC_ATTRIBUTES:
					while (true)
					{
						PUGI__SKIPWS(); // Eat any whitespace.

						if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
						{
							xml_attribute_struct* a = append_new_attribute(cursor, alloc); // Make space for this attribute.
							if (!a) PUGI__THROW_ERROR(status_out_of_memory, s);

							a->name = s; // Save the offset.

							PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator.
							PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.

							if (PUGI__IS_CHARTYPE(ch, ct_space))
							{
								PUGI__SKIPWS(); // Eat any whitespace.

								ch = *s;
								++s;
							}

							if (ch == '=') // '<... #=...'
							{
								PUGI__SKIPWS(); // Eat any whitespace.

								if (*s == '"' || *s == '\'') // '<... #="...'
								{
									ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
									++s; // Step over the quote.
									a->value = s; // Save the offset.

									s = strconv_attribute(s, ch);

									if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value);

									// After this line the loop continues from the start;
									// Whitespaces, / and > are ok, symbols and EOF are wrong,
									// everything else will be detected
									if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s);
								}
								else PUGI__THROW_ERROR(status_bad_attribute, s);
							}
							else PUGI__THROW_ERROR(status_bad_attribute, s);
						}
						else if (*s == '/')
						{
							++s;

							if (*s == '>')
							{
								PUGI__POPNODE();
								s++;
								break;
							}
							else if (*s == 0 && endch == '>')
							{
								PUGI__POPNODE();
								break;
							}
							else PUGI__THROW_ERROR(status_bad_start_element, s);
						}
						else if (*s == '>')
						{
							++s;

							break;
						}
						else if (*s == 0 && endch == '>')
						{
							break;
						}
						else PUGI__THROW_ERROR(status_bad_start_element, s);
					}

					// !!!
				}
				else if (ch == '/') // '<#.../'
				{
					if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s);

					PUGI__POPNODE(); // Pop.

					s += (*s == '>');
				}
				else if (ch == 0)
				{
					// we stepped over null terminator, backtrack & handle closing tag
					--s;

					if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s);
				}
				else PUGI__THROW_ERROR(status_bad_start_element, s);
			}
			else if (*s == '/')
			{
				++s;

				// the closing tag name must match the name of the element being closed
				char_t* name = cursor->name;
				if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s);

				while (PUGI__IS_CHARTYPE(*s, ct_symbol))
				{
					if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s);
				}

				if (*name)
				{
					if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s);
					else PUGI__THROW_ERROR(status_end_element_mismatch, s);
				}

				PUGI__POPNODE(); // Pop.

				PUGI__SKIPWS();

				if (*s == 0)
				{
					if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
				}
				else
				{
					if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
					++s;
				}
			}
			else if (*s == '?') // '<?...'
			{
				s = parse_question(s, cursor, optmsk, endch);
				if (!s) return s;

				assert(cursor);
				if (PUGI__NODETYPE(cursor) == node_declaration) goto LOC_ATTRIBUTES;
			}
			else if (*s == '!') // '<!...'
			{
				s = parse_exclamation(s, cursor, optmsk, endch);
				if (!s) return s;
			}
			else if (*s == 0 && endch == '?') PUGI__THROW_ERROR(status_bad_pi, s);
			else PUGI__THROW_ERROR(status_unrecognized_tag, s);
		}
		else
		{
			mark = s; // Save this offset while searching for a terminator.

			PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here.

			if (*s == '<' || !*s)
			{
				// We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
				assert(mark != s);

				if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single) || PUGI__OPTSET(parse_trim_pcdata))
				{
					continue;
				}
				else if (PUGI__OPTSET(parse_ws_pcdata_single))
				{
					if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue;
				}
			}

			// unless leading whitespace is trimmed, the text starts at the saved offset
			if (!PUGI__OPTSET(parse_trim_pcdata))
				s = mark;

			if (cursor->parent || PUGI__OPTSET(parse_fragment))
			{
				PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
				cursor->value = s; // Save the offset.

				s = strconv_pcdata(s);

				PUGI__POPNODE(); // Pop since this is a standalone.

				if (!*s) break;
			}
			else
			{
				PUGI__SCANFOR(*s == '<'); // '...<'
				if (!*s) break;

				++s;
			}

			// We're after '<'
			goto LOC_TAG;
		}
	}

	// check that last tag is closed
	if (cursor != root) PUGI__THROW_ERROR(status_end_element_mismatch, s);

	return s;
}
#ifdef PUGIXML_WCHAR_MODE
// Returns s advanced past a leading 0xfeff character, or s unchanged if there is none.
static char_t* parse_skip_bom(char_t* s)
{
	unsigned int bom_value = 0xfeff;

	if (s[0] == static_cast<wchar_t>(bom_value)) return s + 1;

	return s;
}
#else
// Returns s advanced past a leading 0xef 0xbb 0xbf byte sequence, or s unchanged if there is none.
static char_t* parse_skip_bom(char_t* s)
{
	const bool starts_with_bom = s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf';

	return starts_with_bom ? s + 3 : s;
}
#endif
// Returns true if node, or any node reachable from it through next_sibling, is an element.
// A null node yields false.
static bool has_element_node_siblings(xml_node_struct* node)
{
	for (xml_node_struct* it = node; it; it = it->next_sibling)
	{
		if (PUGI__NODETYPE(it) == node_element)
			return true;
	}

	return false;
}
arseny.kapoulkine@gmail.com
committed
// Parses buffer (length characters, modified in place) into nodes below root.
// The last character of the buffer is overwritten with a null terminator; its original
// value is passed to the parser as endch. The parser works on a copy of the document's
// allocator, which is written back after parsing.
// Returns the parse status together with the offset of the error inside buffer, if any.
static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk)
{
	// allocator object is a part of document object
	xml_allocator& alloc_ = *static_cast<xml_allocator*>(xmldoc);

	// early-out for empty documents
	if (length == 0)
		return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element);

	// get last child of the root before parsing
	xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0;

	// create parser on stack
	xml_parser parser(alloc_);

	// save last character and make buffer zero-terminated (speeds up parsing)
	char_t endch = buffer[length - 1];
	buffer[length - 1] = 0;

	// skip BOM to make sure it does not end up as part of parse output
	char_t* buffer_data = parse_skip_bom(buffer);

	// perform actual parsing
	parser.parse_tree(buffer_data, root, optmsk, endch);

	// update allocator state
	alloc_ = parser.alloc;

	xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
	assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);

	if (result)
	{
		// since we removed last character, we have to handle the only possible false positive (stray <)
		if (endch == '<')
			return make_parse_result(status_unrecognized_tag, length - 1);

		// check if there are any element nodes parsed
		xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child;

		if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed))
			return make_parse_result(status_no_document_element, length - 1);
	}
	else
	{
		// roll back offset if it occurs on a null terminator in the source buffer
		if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0)
			result.offset--;
	}

	return result;
}
};
// Output facilities
arseny.kapoulkine@gmail.com
committed
// Returns the encoding that matches char_t for output: the wchar_t encoding when
// PUGIXML_WCHAR_MODE is defined, UTF-8 otherwise.
PUGI__FN xml_encoding get_write_native_encoding()
{
#ifndef PUGIXML_WCHAR_MODE
	return encoding_utf8;
#else
	return get_wchar_encoding();
#endif
}
arseny.kapoulkine@gmail.com
committed
// Resolves a requested output encoding to a concrete one: the generic wchar, utf16 and
// utf32 values are replaced by their specific variants, encoding_auto becomes UTF-8, and
// every other value is returned unchanged.
PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding)
{
	switch (encoding)
	{
	case encoding_wchar:
		// replace wchar encoding with utf implementation
		return get_wchar_encoding();

	case encoding_utf16:
		// replace utf16 encoding with utf16 with specific endianness
		return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;

	case encoding_utf32:
		// replace utf32 encoding with utf32 with specific endianness
		return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;

	case encoding_auto:
		// no explicit encoding requested: assume utf8
		return encoding_utf8;

	default:
		// an explicit encoding was requested, keep it
		return encoding;
	}
}
#ifdef PUGIXML_WCHAR_MODE
arseny.kapoulkine@gmail.com
committed
// Returns how many of the first length characters of data can be used without splitting
// a surrogate pair: when wchar_t is 2 bytes wide and the last character lies in
// 0xD800..0xDBFF, it is left out. An empty range yields 0.
PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
{
	if (length < 1) return 0;

	// discard last character if it's the lead of a surrogate pair
	return (sizeof(wchar_t) == 2 && static_cast<unsigned int>(static_cast<uint16_t>(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length;
}
Arseny Kapoulkine
committed
PUGI__FN size_t convert_buffer_output(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
{
// only endian-swapping is required
if (need_endian_swap_utf(encoding, get_wchar_encoding()))
{
arseny.kapoulkine@gmail.com
committed
convert_wchar_endian_swap(r_char, data, length);
return length * sizeof(char_t);
}
// convert to utf8
if (encoding == encoding_utf8)
{
arseny.kapoulkine@gmail.com
committed
uint8_t* dest = r_u8;
arseny.kapoulkine@gmail.com
committed
uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(data, length, dest);
return static_cast<size_t>(end - dest);
}
// convert to utf16
if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
{
arseny.kapoulkine@gmail.com
committed
uint16_t* dest = r_u16;
// convert to native utf16
arseny.kapoulkine@gmail.com
committed
uint16_t* end = utf_decoder<utf16_writer>::decode_wchar_block(data, length, dest);
// swap if necessary
xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;