Newer
Older
{
// iso-8859-1 (case-insensitive)
if (enc_length == 10
&& (enc[0] | ' ') == 'i' && (enc[1] | ' ') == 's' && (enc[2] | ' ') == 'o'
&& enc[3] == '-' && enc[4] == '8' && enc[5] == '8' && enc[6] == '5' && enc[7] == '9'
&& enc[8] == '-' && enc[9] == '1')
return encoding_latin1;
// latin1 (case-insensitive)
if (enc_length == 6
&& (enc[0] | ' ') == 'l' && (enc[1] | ' ') == 'a' && (enc[2] | ' ') == 't'
&& (enc[3] | ' ') == 'i' && (enc[4] | ' ') == 'n'
&& enc[5] == '1')
return encoding_latin1;
}
arseny.kapoulkine
committed
return encoding_utf8;
}
arseny.kapoulkine@gmail.com
committed
// Resolves the encoding that should be used to decode a buffer.
//   encoding - encoding requested by the caller (may be a generic or auto value)
//   contents - buffer start; only inspected when encoding == encoding_auto
//   size     - buffer size in bytes
// Generic wchar/utf16/utf32 requests are mapped to a concrete encoding, explicit
// encodings are returned unchanged, and encoding_auto is resolved by inspecting
// the buffer contents.
PUGI__FN xml_encoding get_buffer_encoding(xml_encoding encoding, const void* contents, size_t size)
{
    // replace wchar encoding with utf implementation
    if (encoding == encoding_wchar) return get_wchar_encoding();

    // replace utf16 encoding with utf16 with specific endianness
    if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;

    // replace utf32 encoding with utf32 with specific endianness
    if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;

    // only do autodetection if no explicit encoding is requested
    if (encoding != encoding_auto) return encoding;

    // try to guess encoding (based on XML specification, Appendix F.1)
    const uint8_t* data = static_cast<const uint8_t*>(contents);

    return guess_buffer_encoding(data, size);
}
// Produces a writable char_t buffer for contents that are already in the native encoding.
//   out_buffer / out_length - receive the resulting buffer and its length in char_t units
//   contents / size         - source bytes; size is truncated to whole char_t units
//   is_mutable              - when true the caller's buffer is returned as-is (no copy)
// In the copying case the buffer is allocated through xml_memory::allocate, gets a zero
// terminator appended, and out_length includes that terminator.
// Returns false only when the allocation fails.
PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
{
    size_t length = size / sizeof(char_t);

    if (is_mutable)
    {
        // caller allows in-place modification: reuse the memory directly
        out_buffer = static_cast<char_t*>(const_cast<void*>(contents));
        out_length = length;
    }
    else
    {
        char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
        if (!buffer) return false;

        // a null source is only acceptable for an empty buffer
        if (contents)
            memcpy(buffer, contents, length * sizeof(char_t));
        else
            assert(length == 0);

        buffer[length] = 0;

        out_buffer = buffer;
        out_length = length + 1;
    }

    return true;
}
#ifdef PUGIXML_WCHAR_MODE
arseny.kapoulkine@gmail.com
committed
// Returns true when the two encodings are the same UTF width (16 or 32) but have
// opposite byte order, i.e. when one can be converted to the other purely by swapping bytes.
PUGI__FN bool need_endian_swap_utf(xml_encoding le, xml_encoding re)
{
    return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) ||
           (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be);
}
// Converts a buffer whose code units only differ from the native wchar_t encoding by byte order.
//   out_buffer / out_length - receive the converted buffer and its length in char_t units
//   contents / size         - source bytes; size is truncated to whole char_t units
//   is_mutable              - when true the swap is performed in place on the caller's memory
// In the copying case a new zero-terminated buffer is allocated and out_length includes
// the terminator. Returns false only when the allocation fails.
PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
{
    const char_t* data = static_cast<const char_t*>(contents);
    size_t length = size / sizeof(char_t);

    if (is_mutable)
    {
        // source and destination are the same memory
        char_t* buffer = const_cast<char_t*>(data);

        convert_wchar_endian_swap(buffer, data, length);

        out_buffer = buffer;
        out_length = length;
    }
    else
    {
        char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
        if (!buffer) return false;

        convert_wchar_endian_swap(buffer, data, length);
        buffer[length] = 0;

        out_buffer = buffer;
        out_length = length + 1;
    }

    return true;
}
// Decodes a buffer with decoder D into a newly allocated, zero-terminated wchar_t buffer
// (PUGIXML_WCHAR_MODE variant).
//   D - decoder type providing D::type (input code unit) and D::process(data, length, result, traits)
// The input is processed twice: once with wchar_counter to size the output, then with
// wchar_writer to fill it. out_length includes the zero terminator.
// Returns false only when the allocation fails.
template <typename D> PUGI__FN bool convert_buffer_generic(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, D)
{
    const typename D::type* data = static_cast<const typename D::type*>(contents);
    size_t data_length = size / sizeof(typename D::type);

    // first pass: get length in wchar_t units
    size_t length = D::process(data, data_length, 0, wchar_counter());

    // allocate buffer of suitable length
    char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
    if (!buffer) return false;

    // second pass: convert input to wchar_t
    wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
    wchar_writer::value_type oend = D::process(data, data_length, obegin, wchar_writer());

    // both passes must agree on the output size
    assert(oend == obegin + length);
    *oend = 0;

    out_buffer = buffer;
    out_length = length + 1;

    return true;
}
// Converts a buffer in the given (already resolved) encoding into the native wchar_t
// representation (PUGIXML_WCHAR_MODE variant).
//   out_buffer / out_length - receive the resulting buffer and its length in char_t units
//   encoding                - concrete source encoding
//   contents / size         - source bytes
//   is_mutable              - whether the source memory may be modified/reused in place
// Returns false on allocation failure, or (after asserting) for an unsupported encoding.
PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
{
    // get native encoding
    xml_encoding wchar_encoding = get_wchar_encoding();

    // fast path: no conversion required
    if (encoding == wchar_encoding)
        return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);

    // only endian-swapping is required
    if (need_endian_swap_utf(encoding, wchar_encoding))
        return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable);

    // source encoding is utf8
    if (encoding == encoding_utf8)
        return convert_buffer_generic(out_buffer, out_length, contents, size, utf8_decoder());

    // source encoding is utf16
    if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
    {
        xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;

        // decoder template argument selects byte swapping for non-native byte order
        return (native_encoding == encoding) ?
            convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder<opt_false>()) :
            convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder<opt_true>());
    }

    // source encoding is utf32
    if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
    {
        xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;

        return (native_encoding == encoding) ?
            convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder<opt_false>()) :
            convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder<opt_true>());
    }

    // source encoding is latin1
    if (encoding == encoding_latin1)
        return convert_buffer_generic(out_buffer, out_length, contents, size, latin1_decoder());

    assert(false && "Invalid encoding"); // unreachable
    return false;
}
#else
// Decodes a buffer with decoder D into a newly allocated, zero-terminated UTF-8 buffer
// (non-wchar variant).
//   D - decoder type providing D::type (input code unit) and D::process(data, length, result, traits)
// The input is processed twice: once with utf8_counter to size the output, then with
// utf8_writer to fill it. out_length includes the zero terminator.
// Returns false only when the allocation fails.
template <typename D> PUGI__FN bool convert_buffer_generic(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, D)
{
    const typename D::type* data = static_cast<const typename D::type*>(contents);
    size_t data_length = size / sizeof(typename D::type);

    // first pass: get length in utf8 units
    size_t length = D::process(data, data_length, 0, utf8_counter());

    // allocate buffer of suitable length
    char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
    if (!buffer) return false;

    // second pass: convert input to utf8
    uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
    uint8_t* oend = D::process(data, data_length, obegin, utf8_writer());

    // both passes must agree on the output size
    assert(oend == obegin + length);
    *oend = 0;

    out_buffer = buffer;
    out_length = length + 1;

    return true;
}
// Returns the number of leading bytes in [data, data + size) whose value is <= 127,
// i.e. the index of the first byte above 127, or size when there is none.
PUGI__FN size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size)
{
    for (size_t i = 0; i < size; ++i)
        if (data[i] > 127)
            return i;

    return size;
}
// Converts a latin1 buffer to UTF-8.
//   out_buffer / out_length - receive the resulting buffer and its length in char_t units
//   contents / size         - source bytes
//   is_mutable              - whether the source memory may be reused when no conversion is needed
// Bytes <= 127 are copied verbatim; if the whole buffer consists of such bytes the work is
// delegated to get_mutable_buffer. Otherwise a new zero-terminated buffer is allocated and
// out_length includes the terminator. Returns false only when an allocation fails.
PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
{
    const uint8_t* data = static_cast<const uint8_t*>(contents);
    size_t data_length = size;

    // get size of prefix that does not need utf8 conversion
    size_t prefix_length = get_latin1_7bit_prefix_length(data, data_length);
    assert(prefix_length <= data_length);

    const uint8_t* postfix = data + prefix_length;
    size_t postfix_length = data_length - prefix_length;

    // if no conversion is needed, just return the original buffer
    if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);

    // first pass: get length in utf8 units
    size_t length = prefix_length + latin1_decoder::process(postfix, postfix_length, 0, utf8_counter());

    // allocate buffer of suitable length
    char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
    if (!buffer) return false;

    // second pass: convert latin1 input to utf8 (prefix is copied as-is, the rest is re-encoded)
    memcpy(buffer, data, prefix_length);

    uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
    uint8_t* oend = latin1_decoder::process(postfix, postfix_length, obegin + prefix_length, utf8_writer());

    // both passes must agree on the output size
    assert(oend == obegin + length);
    *oend = 0;

    out_buffer = buffer;
    out_length = length + 1;

    return true;
}
// Converts a buffer in the given (already resolved) encoding into UTF-8 (non-wchar variant).
//   out_buffer / out_length - receive the resulting buffer and its length in char_t units
//   encoding                - concrete source encoding
//   contents / size         - source bytes
//   is_mutable              - whether the source memory may be reused in place
// Returns false on allocation failure, or (after asserting) for an unsupported encoding.
PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
{
    // fast path: no conversion required
    if (encoding == encoding_utf8)
        return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);

    // source encoding is utf16
    if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
    {
        xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;

        // decoder template argument selects byte swapping for non-native byte order
        return (native_encoding == encoding) ?
            convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder<opt_false>()) :
            convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder<opt_true>());
    }

    // source encoding is utf32
    if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
    {
        xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;

        return (native_encoding == encoding) ?
            convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder<opt_false>()) :
            convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder<opt_true>());
    }

    // source encoding is latin1
    if (encoding == encoding_latin1)
        return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);

    assert(false && "Invalid encoding"); // unreachable
    return false;
}
#endif
arseny.kapoulkine@gmail.com
committed
// First half of a wchar_t -> UTF-8 conversion: returns the number of UTF-8 code units
// needed to encode the first `length` wchar_t units of `str`.
PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length)
{
    // get length in utf8 characters
    return wchar_decoder::process(str, length, 0, utf8_counter());
}
// Second half of a wchar_t -> UTF-8 conversion: writes the UTF-8 encoding of `str` into `buffer`.
//   buffer / size - destination and its expected size in bytes (as computed by as_utf8_begin)
//   str / length  - source wchar_t data
// No terminator is written; the written size is only checked by an assertion.
PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length)
{
    // convert to utf8
    uint8_t* begin = reinterpret_cast<uint8_t*>(buffer);
    uint8_t* end = wchar_decoder::process(str, length, begin, utf8_writer());

    assert(begin + size == end);
    (void)!end; // keeps `end` referenced when assert compiles to nothing
}
#ifndef PUGIXML_NO_STL
// Converts the first `length` wchar_t units of `str` to a UTF-8 std::string.
PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length)
{
    // first pass: get length in utf8 characters
    size_t size = as_utf8_begin(str, length);

    // allocate resulting string
    std::string result;
    result.resize(size);

    // second pass: convert to utf8 (skipped for empty output so &result[0] is never taken on an empty string)
    if (size > 0) as_utf8_end(&result[0], size, str, length);

    return result;
}
// Converts `size` bytes of UTF-8 data at `str` to a wide string.
PUGI__FN std::basic_string<wchar_t> as_wide_impl(const char* str, size_t size)
{
    const uint8_t* data = reinterpret_cast<const uint8_t*>(str);

    // first pass: get length in wchar_t units
    size_t length = utf8_decoder::process(data, size, 0, wchar_counter());

    // allocate resulting string
    std::basic_string<wchar_t> result;
    result.resize(length);

    // second pass: convert to wchar_t
    if (length > 0)
    {
        wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]);
        wchar_writer::value_type end = utf8_decoder::process(data, size, begin, wchar_writer());

        assert(begin + length == end);
        (void)!end; // keeps `end` referenced when assert compiles to nothing
    }

    return result;
}
#endif
template <typename Header>
inline bool strcpy_insitu_allow(size_t length, const Header& header, uintptr_t header_mask, char_t* target)
arseny.kapoulkine
committed
{
if (header & xml_memory_page_contents_shared_mask) return false;
arseny.kapoulkine
committed
size_t target_length = strlength(target);
arseny.kapoulkine
committed
// always reuse document buffer memory if possible
if ((header & header_mask) == 0) return target_length >= length;
arseny.kapoulkine
committed
// reuse heap memory if waste is not too great
const size_t reuse_threshold = 32;
return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2);
}
// Assigns `source_length` characters from `source` to the string slot `dest`.
//   dest                 - string pointer to update (set to 0 for an empty source)
//   header / header_mask - owner's header; the masked bit records whether `dest` was
//                          allocated through the allocator and must be deallocated
//   source               - characters to copy; does not need to be zero-terminated
// The existing storage is overwritten when strcpy_insitu_allow permits it; otherwise
// a new string is allocated. Returns false if reserving or allocating memory fails,
// in which case `dest` and `header` are left untouched.
template <typename String, typename Header>
PUGI__FN bool strcpy_insitu(String& dest, Header& header, uintptr_t header_mask, const char_t* source, size_t source_length)
{
    if (source_length == 0)
    {
        // empty string and null pointer are equivalent, so just deallocate old memory
        xml_allocator* alloc = PUGI__GETPAGE_IMPL(header)->allocator;

        if (header & header_mask) alloc->deallocate_string(dest);

        // mark the string as not allocated
        dest = 0;
        header &= ~header_mask;

        return true;
    }
    else if (dest && strcpy_insitu_allow(source_length, header, header_mask, dest))
    {
        // we can reuse old buffer, so just copy the new data (including zero terminator)
        memcpy(dest, source, source_length * sizeof(char_t));
        dest[source_length] = 0;

        return true;
    }
    else
    {
        xml_allocator* alloc = PUGI__GETPAGE_IMPL(header)->allocator;

        if (!alloc->reserve()) return false;

        // allocate new buffer
        char_t* buf = alloc->allocate_string(source_length + 1);
        if (!buf) return false;

        // copy the string (including zero terminator)
        memcpy(buf, source, source_length * sizeof(char_t));
        buf[source_length] = 0;

        // deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures)
        if (header & header_mask) alloc->deallocate_string(dest);

        // the string is now allocated, so set the flag
        dest = buf;
        header |= header_mask;

        return true;
    }
}
// Tracks a single pending "hole" of removed characters inside a buffer that is being
// rewritten in place. `end` is the position just past the hole (0 when there is none)
// and `size` is the accumulated number of removed characters.
struct gap
{
    char_t* end;
    size_t size;

    gap(): end(0), size(0)
    {
    }

    // Registers `count` characters starting at s as removed and advances s past them.
    // Any previously pending hole is closed first by shifting the text between the
    // two holes down, so the holes end up merged into one.
    void push(char_t*& s, size_t count)
    {
        if (end)
        {
            // bytes in [old hole end, new hole start) slide down to the old hole start
            char* from = reinterpret_cast<char*>(end);
            char* upto = reinterpret_cast<char*>(s);

            memmove(end - size, end, upto - from);
        }

        // step over the newly removed characters
        s += count;

        // the merged hole now ends here and is `count` characters larger
        end = s;
        size += count;
    }

    // Closes the pending hole (if any) by shifting [hole end, s) down and returns the
    // new past-the-end position of the compacted text.
    char_t* flush(char_t* s)
    {
        if (!end) return s;

        char* from = reinterpret_cast<char*>(end);
        char* upto = reinterpret_cast<char*>(s);

        memmove(end - size, end, upto - from);

        return s - size;
    }
};
arseny.kapoulkine@gmail.com
committed
// Decodes one character/entity reference in place.
//   s - points at the '&' that starts the reference
//   g - gap tracker for the buffer; receives the characters freed by the replacement
// Handles numeric references (&#NNN; and &#xHHH;) and the five predefined entities
// (&amp; &apos; &gt; &lt; &quot;). On success the decoded character(s) are written at s,
// the rest of the reference is pushed to the gap, and the position after ';' is returned.
// If the text is not a recognised reference nothing is modified and the position at
// which recognition stopped is returned so the caller continues scanning from there.
PUGI__FN char_t* strconv_escape(char_t* s, gap& g)
{
    char_t* stre = s + 1;

    switch (*stre)
    {
        case '#': // &#...
        {
            unsigned int ucsc = 0;

            if (stre[1] == 'x') // &#x... (hex code)
            {
                stre += 2;

                char_t ch = *stre;

                if (ch == ';') return stre;

                for (;;)
                {
                    if (static_cast<unsigned int>(ch - '0') <= 9)
                        ucsc = 16 * ucsc + (ch - '0');
                    else if (static_cast<unsigned int>((ch | ' ') - 'a') <= 5)
                        ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10);
                    else if (ch == ';')
                        break;
                    else // cancel
                        return stre;

                    ch = *++stre;
                }

                ++stre;
            }
            else // &#... (dec code)
            {
                char_t ch = *++stre;

                if (ch == ';') return stre;

                for (;;)
                {
                    if (static_cast<unsigned int>(ch - '0') <= 9)
                        ucsc = 10 * ucsc + (ch - '0');
                    else if (ch == ';')
                        break;
                    else // cancel
                        return stre;

                    ch = *++stre;
                }

                ++stre;
            }

            // encode the code point over the start of the reference
        #ifdef PUGIXML_WCHAR_MODE
            s = reinterpret_cast<char_t*>(wchar_writer::any(reinterpret_cast<wchar_writer::value_type>(s), ucsc));
        #else
            s = reinterpret_cast<char_t*>(utf8_writer::any(reinterpret_cast<uint8_t*>(s), ucsc));
        #endif

            g.push(s, stre - s);
            return stre;
        }

        case 'a': // &a
        {
            ++stre;

            if (*stre == 'm') // &am
            {
                if (*++stre == 'p' && *++stre == ';') // &amp;
                {
                    *s++ = '&';
                    ++stre;

                    g.push(s, stre - s);
                    return stre;
                }
            }
            else if (*stre == 'p') // &ap
            {
                if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // &apos;
                {
                    *s++ = '\'';
                    ++stre;

                    g.push(s, stre - s);
                    return stre;
                }
            }
            break;
        }

        case 'g': // &g
        {
            if (*++stre == 't' && *++stre == ';') // &gt;
            {
                *s++ = '>';
                ++stre;

                g.push(s, stre - s);
                return stre;
            }
            break;
        }

        case 'l': // &l
        {
            if (*++stre == 't' && *++stre == ';') // &lt;
            {
                *s++ = '<';
                ++stre;

                g.push(s, stre - s);
                return stre;
            }
            break;
        }

        case 'q': // &q
        {
            if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // &quot;
            {
                *s++ = '"';
                ++stre;

                g.push(s, stre - s);
                return stre;
            }
            break;
        }

        default:
            break;
    }

    return stre;
}
// Parser utilities
// These macros are textual helpers: they rely on names from the scope in which they are
// expanded (s, endch, optmsk, cursor, alloc, ch, error_offset, error_status).

// True if c equals e, or if c is the zero terminator and the saved end character `endch` equals e.
#define PUGI__ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e)))
// Advances s while *s is classified as ct_space.
#define PUGI__SKIPWS() { while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; }
// Non-zero if the option bit(s) OPT are set in optmsk.
#define PUGI__OPTSET(OPT) ( optmsk & (OPT) )
// Appends a new node of TYPE under cursor and makes it the cursor; reports status_out_of_memory if that fails.
#define PUGI__PUSHNODE(TYPE) { cursor = append_new_node(cursor, *alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); }
// Moves cursor back to its parent node.
#define PUGI__POPNODE() { cursor = cursor->parent; }
// Advances s until X becomes true or the zero terminator is reached.
#define PUGI__SCANFOR(X) { while (*s != 0 && !(X)) ++s; }
// Advances s while X is true (X is responsible for stopping at the terminator).
#define PUGI__SCANWHILE(X) { while (X) ++s; }
// Same as PUGI__SCANWHILE but tests four characters per iteration; X must be written in terms of `ss`.
#define PUGI__SCANWHILE_UNROLL(X) { for (;;) { char_t ss = s[0]; if (PUGI__UNLIKELY(!(X))) { break; } ss = s[1]; if (PUGI__UNLIKELY(!(X))) { s += 1; break; } ss = s[2]; if (PUGI__UNLIKELY(!(X))) { s += 2; break; } ss = s[3]; if (PUGI__UNLIKELY(!(X))) { s += 3; break; } s += 4; } }
// Saves the current character into ch, replaces it with a zero terminator and steps past it.
#define PUGI__ENDSEG() { ch = *s; *s = 0; ++s; }
// Records the error position and status, then returns a null char_t* from the enclosing function.
#define PUGI__THROW_ERROR(err, m) return error_offset = m, error_status = err, static_cast<char_t*>(0)
// Reports err at position m if s is at the zero terminator.
#define PUGI__CHECK_ERROR(err, m) { if (*s == 0) PUGI__THROW_ERROR(err, m); }
arseny.kapoulkine@gmail.com
committed
// Processes comment text in place, starting at s.
//   endch - character that was replaced by the buffer's zero terminator (see PUGI__ENDSWITH)
// Normalises line endings (\r and \r\n become \n), zero-terminates the text at the
// closing "-->" and returns the position just past it. Returns 0 if the zero
// terminator is reached before the comment is closed.
PUGI__FN char_t* strconv_comment(char_t* s, char_t endch)
{
    gap g;

    while (true)
    {
        PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_comment));

        if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
        {
            *s++ = '\n'; // replace first one with 0x0a

            if (*s == '\n') g.push(s, 1);
        }
        else if (s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')) // comment ends here
        {
            *g.flush(s) = 0;

            // when '>' was replaced by the terminator only the two dashes are skipped
            return s + (s[2] == '>' ? 3 : 2);
        }
        else if (*s == 0)
        {
            return 0;
        }
        else ++s;
    }
}
// Processes CDATA text in place, starting at s.
//   endch - character that was replaced by the buffer's zero terminator (see PUGI__ENDSWITH)
// Normalises line endings (\r and \r\n become \n) and zero-terminates the text at the
// closing "]]>". On success returns s + 1, i.e. the position after the first ']' of the
// closing sequence. Returns 0 if the zero terminator is reached before the section is closed.
PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch)
{
    gap g;

    while (true)
    {
        PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_cdata));

        if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
        {
            *s++ = '\n'; // replace first one with 0x0a

            if (*s == '\n') g.push(s, 1);
        }
        else if (s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')) // CDATA ends here
        {
            *g.flush(s) = 0;

            return s + 1;
        }
        else if (*s == 0)
        {
            return 0;
        }
        else ++s;
    }
}
typedef char_t* (*strconv_pcdata_t)(char_t*);
// PCDATA in-place converter, specialised at compile time by three flag types
// (each exposing a boolean `value`):
//   opt_trim   - strip trailing whitespace from the resulting text
//   opt_eol    - normalise \r and \r\n to \n
//   opt_escape - decode character/entity references via strconv_escape
template <typename opt_trim, typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
{
    // Converts text starting at s, zero-terminating it where the PCDATA ends.
    // Returns the position after the terminating '<', or the position of the
    // buffer's zero terminator if that is reached first.
    static char_t* parse(char_t* s)
    {
        gap g;

        // start of the text; trailing-whitespace trimming must not go before it
        char_t* begin = s;

        while (true)
        {
            PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_pcdata));

            if (*s == '<') // PCDATA ends here
            {
                char_t* end = g.flush(s);

                if (opt_trim::value)
                    while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
                        --end;

                *end = 0;

                return s + 1;
            }
            else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
            {
                *s++ = '\n'; // replace first one with 0x0a

                if (*s == '\n') g.push(s, 1);
            }
            else if (opt_escape::value && *s == '&')
            {
                s = strconv_escape(s, g);
            }
            else if (*s == 0)
            {
                char_t* end = g.flush(s);

                if (opt_trim::value)
                    while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
                        --end;

                *end = 0;

                return s;
            }
            else ++s;
        }
    }
};
// Selects the strconv_pcdata_impl specialisation matching the parse options in optmask.
// Only parse_escapes, parse_eol and parse_trim_pcdata are consulted; their bit positions
// are pinned by the static assertion so the shifts below stay valid.
PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
{
    PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800);

    // index bits: 0 = escapes, 1 = eol, 2 = trim; template arguments are <trim, eol, escapes>
    switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) // get bitmask for flags (eol escapes trim)
    {
    case 0: return strconv_pcdata_impl<opt_false, opt_false, opt_false>::parse;
    case 1: return strconv_pcdata_impl<opt_false, opt_false, opt_true>::parse;
    case 2: return strconv_pcdata_impl<opt_false, opt_true, opt_false>::parse;
    case 3: return strconv_pcdata_impl<opt_false, opt_true, opt_true>::parse;
    case 4: return strconv_pcdata_impl<opt_true, opt_false, opt_false>::parse;
    case 5: return strconv_pcdata_impl<opt_true, opt_false, opt_true>::parse;
    case 6: return strconv_pcdata_impl<opt_true, opt_true, opt_false>::parse;
    case 7: return strconv_pcdata_impl<opt_true, opt_true, opt_true>::parse;
    default: assert(false); return 0; // unreachable
    }
}
typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
// Attribute value in-place converters. opt_escape (a flag type exposing a boolean `value`)
// selects whether character/entity references are decoded via strconv_escape.
// Every parse_* function takes the start of the value and the quote character that ends
// it, zero-terminates the converted value, and returns the position just past the closing
// quote, or 0 if the buffer's zero terminator is reached before the quote.
template <typename opt_escape> struct strconv_attribute_impl
{
    // Whitespace normalisation: leading/trailing whitespace is removed and every internal
    // run of whitespace is collapsed to a single space.
    static char_t* parse_wnorm(char_t* s, char_t end_quote)
    {
        gap g;

        // trim leading whitespaces
        if (PUGI__IS_CHARTYPE(*s, ct_space))
        {
            char_t* str = s;

            do ++str;
            while (PUGI__IS_CHARTYPE(*str, ct_space));

            g.push(s, str - s);
        }

        while (true)
        {
            PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space));

            if (*s == end_quote)
            {
                char_t* str = g.flush(s);

                // write the terminator, then keep overwriting trailing whitespace with zeros
                do *str-- = 0;
                while (PUGI__IS_CHARTYPE(*str, ct_space));

                return s + 1;
            }
            else if (PUGI__IS_CHARTYPE(*s, ct_space))
            {
                // first whitespace character of a run becomes a single space
                *s++ = ' ';

                // the rest of the run is removed
                if (PUGI__IS_CHARTYPE(*s, ct_space))
                {
                    char_t* str = s + 1;
                    while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str;

                    g.push(s, str - s);
                }
            }
            else if (opt_escape::value && *s == '&')
            {
                s = strconv_escape(s, g);
            }
            else if (!*s)
            {
                return 0;
            }
            else ++s;
        }
    }

    // Whitespace conversion: every whitespace character becomes a space; a \r\n pair
    // becomes a single space.
    static char_t* parse_wconv(char_t* s, char_t end_quote)
    {
        gap g;

        while (true)
        {
            PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws));

            if (*s == end_quote)
            {
                *g.flush(s) = 0;

                return s + 1;
            }
            else if (PUGI__IS_CHARTYPE(*s, ct_space))
            {
                if (*s == '\r')
                {
                    *s++ = ' ';

                    if (*s == '\n') g.push(s, 1);
                }
                else *s++ = ' ';
            }
            else if (opt_escape::value && *s == '&')
            {
                s = strconv_escape(s, g);
            }
            else if (!*s)
            {
                return 0;
            }
            else ++s;
        }
    }

    // End-of-line normalisation only: \r and \r\n become \n.
    static char_t* parse_eol(char_t* s, char_t end_quote)
    {
        gap g;

        while (true)
        {
            PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));

            if (*s == end_quote)
            {
                *g.flush(s) = 0;

                return s + 1;
            }
            else if (*s == '\r')
            {
                *s++ = '\n';

                if (*s == '\n') g.push(s, 1);
            }
            else if (opt_escape::value && *s == '&')
            {
                s = strconv_escape(s, g);
            }
            else if (!*s)
            {
                return 0;
            }
            else ++s;
        }
    }

    // No whitespace or end-of-line processing; only (optional) escape decoding.
    static char_t* parse_simple(char_t* s, char_t end_quote)
    {
        gap g;

        while (true)
        {
            PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));

            if (*s == end_quote)
            {
                *g.flush(s) = 0;

                return s + 1;
            }
            else if (opt_escape::value && *s == '&')
            {
                s = strconv_escape(s, g);
            }
            else if (!*s)
            {
                return 0;
            }
            else ++s;
        }
    }
};
// Selects the attribute converter matching the parse options in optmask.
// Only parse_escapes, parse_eol, parse_wconv_attribute and parse_wnorm_attribute are
// consulted; their bit positions are pinned by the static assertion. As the table shows,
// wnorm takes precedence over wconv and eol, and wconv takes precedence over eol.
PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
{
    PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);

    // index bits: 0 = escapes, 1 = eol, 2 = wconv, 3 = wnorm
    switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
    {
    case 0:  return strconv_attribute_impl<opt_false>::parse_simple;
    case 1:  return strconv_attribute_impl<opt_true>::parse_simple;
    case 2:  return strconv_attribute_impl<opt_false>::parse_eol;
    case 3:  return strconv_attribute_impl<opt_true>::parse_eol;
    case 4:  return strconv_attribute_impl<opt_false>::parse_wconv;
    case 5:  return strconv_attribute_impl<opt_true>::parse_wconv;
    case 6:  return strconv_attribute_impl<opt_false>::parse_wconv;
    case 7:  return strconv_attribute_impl<opt_true>::parse_wconv;
    case 8:  return strconv_attribute_impl<opt_false>::parse_wnorm;
    case 9:  return strconv_attribute_impl<opt_true>::parse_wnorm;
    case 10: return strconv_attribute_impl<opt_false>::parse_wnorm;
    case 11: return strconv_attribute_impl<opt_true>::parse_wnorm;
    case 12: return strconv_attribute_impl<opt_false>::parse_wnorm;
    case 13: return strconv_attribute_impl<opt_true>::parse_wnorm;
    case 14: return strconv_attribute_impl<opt_false>::parse_wnorm;
    case 15: return strconv_attribute_impl<opt_true>::parse_wnorm;
    default: assert(false); return 0; // unreachable
    }
}
// Builds an xml_parse_result carrying the given status and buffer offset
// (offset defaults to 0); other fields keep their default-constructed values.
inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
{
    xml_parse_result outcome;

    outcome.offset = offset;
    outcome.status = status;

    return outcome;
}
struct xml_parser
{
char_t* error_offset;
xml_parse_status error_status;
xml_parser(xml_allocator* alloc_): alloc(alloc_), error_offset(0), error_status(status_ok)
{
}
// DOCTYPE consists of nested sections of the following possible types:
// <!-- ... -->, <? ... ?>, "...", '...'
// <![...]]>
// <!...>
// First group can not contain nested groups
// Second group can contain nested groups of the same type
// Third group can contain all other groups
arseny.kapoulkine
committed
// Skips one non-nesting DOCTYPE construct starting at s: a quoted string ("..." or '...'),
// a processing instruction (<? ... ?>) or a comment (<!-- ... -->).
// Returns the position just past the construct. Reports status_bad_doctype (returning 0)
// if the construct is unterminated or s does not start any of these constructs.
char_t* parse_doctype_primitive(char_t* s)
{
    if (*s == '"' || *s == '\'')
    {
        // quoted string
        char_t ch = *s++;
        PUGI__SCANFOR(*s == ch);
        if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);

        s++;
    }
    else if (s[0] == '<' && s[1] == '?')
    {
        // <? ... ?>
        s += 2;
        PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
        if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);

        s += 2;
    }
    else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
    {
        // <!-- ... -->
        s += 4;
        PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
        if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);

        // step over the closing "-->"
        s += 3;
    }
    else PUGI__THROW_ERROR(status_bad_doctype, s);

    return s;
}
// Skips a DOCTYPE "<![ ... ]]>" section, including any sections of the same kind nested
// inside it. s must point at the opening "<![".
// Returns the position just past the matching "]]>". Reports status_bad_doctype
// (returning 0) if the buffer ends before the section is closed.
char_t* parse_doctype_ignore(char_t* s)
{
    // number of nested sections currently open (the outermost one is not counted)
    size_t depth = 0;

    assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
    s += 3;

    while (*s)
    {
        if (s[0] == '<' && s[1] == '!' && s[2] == '[')
        {
            // nested ignore section
            s += 3;
            depth++;
        }
        else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
        {
            // ignore section end
            s += 3;

            if (depth == 0)
                return s;

            depth--;
        }
        else s++;
    }

    PUGI__THROW_ERROR(status_bad_doctype, s);
}
char_t* parse_doctype_group(char_t* s, char_t endch)
assert((s[0] == '<' || s[0] == 0) && s[1] == '!');