Newer
Older
{
if (s[0] == '<' && s[1] == '!' && s[2] != '-')
{
if (s[2] == '[')
{
// ignore
arseny.kapoulkine
committed
s = parse_doctype_ignore(s);
if (!s) return s;
}
else
{
// some control group
}
}
else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
{
// unknown tag (forbidden), or some primitive group
arseny.kapoulkine
committed
s = parse_doctype_primitive(s);
if (!s) return s;
}
else if (*s == '>')
{
if (depth == 0)
return s;
depth--;
s++;
}
else s++;
}
if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
arseny.kapoulkine
committed
return s;
arseny.kapoulkine
committed
// Parses a construct that begins with '<!': a comment ('<!--'), a CDATA section ('<![CDATA[')
// or a DOCTYPE declaration.
//   s      - points at the '!' on entry (it is stepped over first)
//   cursor - current node; PUGI__PUSHNODE appends new nodes relative to it
//   optmsk - parse option bits, tested through PUGI__OPTSET
//   endch  - forwarded to PUGI__ENDSWITH / strconv_* / parse_doctype_group so that a construct
//            may end on the buffer's null terminator when endch is the expected closing character
// Returns the position after the construct; parse_doctype_group failures are propagated by
// returning its null result, other errors are reported through PUGI__THROW_ERROR / PUGI__CHECK_ERROR.
// NOTE(review): this listing has unbalanced braces and interleaved author/"committed" lines that are
// not C++; it appears to have lost lines during extraction and must be restored before compiling.
char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch)
{
// parse node contents, starting with exclamation mark
++s;
if (*s == '-') // '<!-...'
{
++s;
if (*s == '-') // '<!--...'
{
++s;
arseny.kapoulkine@gmail.com
committed
if (PUGI__OPTSET(parse_comments))
arseny.kapoulkine@gmail.com
committed
PUGI__PUSHNODE(node_comment); // Append a new node on the tree.
cursor->value = s; // Save the offset.
arseny.kapoulkine@gmail.com
committed
// End-of-line normalization is only needed when the comment text is actually kept
if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments))
{
s = strconv_comment(s, endch);
if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value);
}
else
{
// Scan for terminating '-->'.
PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>'));
arseny.kapoulkine@gmail.com
committed
PUGI__CHECK_ERROR(status_bad_comment, s);
arseny.kapoulkine@gmail.com
committed
if (PUGI__OPTSET(parse_comments))
*s = 0; // Zero-terminate this segment at the first terminating '-'.
s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'.
}
}
arseny.kapoulkine@gmail.com
committed
else PUGI__THROW_ERROR(status_bad_comment, s);
}
else if (*s == '[')
{
// '<![CDATA[...'
if (*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
{
++s;
arseny.kapoulkine@gmail.com
committed
if (PUGI__OPTSET(parse_cdata))
arseny.kapoulkine@gmail.com
committed
PUGI__PUSHNODE(node_cdata); // Append a new node on the tree.
cursor->value = s; // Save the offset.
arseny.kapoulkine@gmail.com
committed
if (PUGI__OPTSET(parse_eol))
{
s = strconv_cdata(s, endch);
if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value);
}
else
{
// Scan for terminating ']]>'.
PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>'));
arseny.kapoulkine@gmail.com
committed
PUGI__CHECK_ERROR(status_bad_cdata, s);
*s++ = 0; // Zero-terminate this segment.
}
}
else // Flagged for discard, but we still have to scan for the terminator.
{
// Scan for terminating ']]>'.
PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>'));
arseny.kapoulkine@gmail.com
committed
PUGI__CHECK_ERROR(status_bad_cdata, s);
++s;
}
s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'.
}
arseny.kapoulkine@gmail.com
committed
else PUGI__THROW_ERROR(status_bad_cdata, s);
// '<!DOCTYPE...'
else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && PUGI__ENDSWITH(s[6], 'E'))
// A DOCTYPE is rejected when the current node has a parent, i.e. it is only accepted at the top level
if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s);
arseny.kapoulkine
committed
char_t* mark = s + 9;
arseny.kapoulkine
committed
s = parse_doctype_group(s, endch);
if (!s) return s;
arseny.kapoulkine
committed
// The group parser must have stopped on the closing '>' or on the terminator standing in for it
assert((*s == 0 && endch == '>') || *s == '>');
if (*s) *s++ = 0;
if (PUGI__OPTSET(parse_doctype))
{
// Skip leading whitespace of the doctype text starting at mark
while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark;
arseny.kapoulkine
committed
PUGI__PUSHNODE(node_doctype);
arseny.kapoulkine
committed
arseny.kapoulkine@gmail.com
committed
// The buffer ended right after '<!': report the error matching the construct endch would have started
else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s);
else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s);
else PUGI__THROW_ERROR(status_unrecognized_tag, s);
arseny.kapoulkine
committed
return s;
arseny.kapoulkine
committed
// Parses a construct that begins with '<?': either the XML declaration or a processing instruction.
//   s          - points at the '?' on entry (it is stepped over first)
//   ref_cursor - in/out current node; copied into a local on entry and written back before returning
//   optmsk     - parse option bits, tested through PUGI__OPTSET
//   endch      - forwarded to PUGI__ENDSWITH so the construct may end on the buffer's null terminator
// Returns the position at which the caller should continue; errors are reported through
// PUGI__THROW_ERROR / PUGI__CHECK_ERROR with status_bad_pi.
// NOTE(review): this listing has unbalanced braces and interleaved author/"committed" lines that are
// not C++; it appears to have lost lines during extraction and must be restored before compiling.
char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch)
{
// load into registers
xml_node_struct* cursor = ref_cursor;
char_t ch = 0;
// parse node contents, starting with question mark
++s;
// read PI target
char_t* target = s;
arseny.kapoulkine@gmail.com
committed
if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s);
arseny.kapoulkine@gmail.com
committed
PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol));
PUGI__CHECK_ERROR(status_bad_pi, s);
// determine node type; stricmp / strcasecmp is not portable
// (or-ing with ' ' sets bit 0x20, which folds ASCII upper-case letters to lower-case;
// target + 3 == s requires the target to be exactly three characters long)
bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s;
arseny.kapoulkine@gmail.com
committed
// Build a node only when the option matching this construct's kind is enabled
if (declaration ? PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi))
{
if (declaration)
{
// disallow non top-level declarations
arseny.kapoulkine@gmail.com
committed
if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s);
arseny.kapoulkine@gmail.com
committed
PUGI__PUSHNODE(node_declaration);
arseny.kapoulkine@gmail.com
committed
PUGI__PUSHNODE(node_pi);
arseny.kapoulkine@gmail.com
committed
PUGI__ENDSEG();
// parse value/attributes
if (ch == '?')
{
// empty node
if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s);
s += (*s == '>');
arseny.kapoulkine@gmail.com
committed
PUGI__POPNODE();
arseny.kapoulkine@gmail.com
committed
else if (PUGI__IS_CHARTYPE(ch, ct_space))
arseny.kapoulkine@gmail.com
committed
PUGI__SKIPWS();
// scan for tag end
char_t* value = s;
PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>'));
arseny.kapoulkine@gmail.com
committed
PUGI__CHECK_ERROR(status_bad_pi, s);
if (declaration)
{
// replace ending ? with / so that 'element' terminates properly
*s = '/';
// we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES
s = value;
}
else
{
// store value and step over >
arseny.kapoulkine@gmail.com
committed
PUGI__POPNODE();
arseny.kapoulkine@gmail.com
committed
PUGI__ENDSEG();
s += (*s == '>');
}
}
arseny.kapoulkine@gmail.com
committed
else PUGI__THROW_ERROR(status_bad_pi, s);
}
else
{
// Node is not wanted: only locate the end of the construct and step over it
// scan for tag end
PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>'));
arseny.kapoulkine@gmail.com
committed
PUGI__CHECK_ERROR(status_bad_pi, s);
s += (s[1] == '>' ? 2 : 1);
}
// store from registers
ref_cursor = cursor;
arseny.kapoulkine
committed
return s;
Arseny Kapoulkine
committed
// Main parsing loop: walks the null-terminated buffer starting at s and builds nodes under root.
//   s      - start of the mutable, null-terminated character buffer
//   root   - node the parsed content is attached to; the cursor starts here and must end here
//   optmsk - parse option bits, tested through PUGI__OPTSET and used to select the strconv_* routines
//   endch  - the character that the null terminator may stand in for at the very end of the buffer
// Each '<' dispatches on the next character: a name start begins an element (followed by its
// attributes), '/' closes the current element, '?' and '!' are delegated to parse_question and
// parse_exclamation; anything else is handled as PCDATA.
// Returns the final scan position; a null result from parse_question / parse_exclamation is returned
// as-is, other errors are reported through PUGI__THROW_ERROR.
// NOTE(review): this listing has unbalanced braces and interleaved author/"committed" lines that are
// not C++; it appears to have lost lines during extraction and must be restored before compiling.
char_t* parse_tree(char_t* s, xml_node_struct* root, unsigned int optmsk, char_t endch)
{
// Select the attribute / PCDATA conversion routines once, based on the option mask
strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk);
arseny.kapoulkine@gmail.com
committed
xml_node_struct* cursor = root;
char_t* mark = s;
while (*s != 0)
{
if (*s == '<')
{
++s;
// Re-entry point used by the PCDATA branch once it has stepped over the next '<'
LOC_TAG:
arseny.kapoulkine@gmail.com
committed
if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...'
arseny.kapoulkine@gmail.com
committed
PUGI__PUSHNODE(node_element); // Append a new node to the tree.
PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator.
arseny.kapoulkine@gmail.com
committed
PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
if (ch == '>')
{
// end of tag
}
arseny.kapoulkine@gmail.com
committed
else if (PUGI__IS_CHARTYPE(ch, ct_space))
{
// Also the jump target used after parse_question leaves the cursor on a declaration node
LOC_ATTRIBUTES:
while (true)
{
arseny.kapoulkine@gmail.com
committed
PUGI__SKIPWS(); // Eat any whitespace.
arseny.kapoulkine@gmail.com
committed
if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
xml_attribute_struct* a = append_new_attribute(cursor, *alloc); // Make space for this attribute.
arseny.kapoulkine@gmail.com
committed
if (!a) PUGI__THROW_ERROR(status_out_of_memory, s);
a->name = s; // Save the offset.
PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator.
arseny.kapoulkine@gmail.com
committed
PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
arseny.kapoulkine@gmail.com
committed
if (PUGI__IS_CHARTYPE(ch, ct_space))
arseny.kapoulkine@gmail.com
committed
PUGI__SKIPWS(); // Eat any whitespace.
ch = *s;
++s;
}
if (ch == '=') // '<... #=...'
{
arseny.kapoulkine@gmail.com
committed
PUGI__SKIPWS(); // Eat any whitespace.
if (*s == '"' || *s == '\'') // '<... #="...'
{
ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
++s; // Step over the quote.
a->value = s; // Save the offset.
s = strconv_attribute(s, ch);
arseny.kapoulkine@gmail.com
committed
if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value);
// After this line the loop continues from the start;
// Whitespaces, / and > are ok, symbols and EOF are wrong,
// everything else will be detected
arseny.kapoulkine@gmail.com
committed
if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s);
arseny.kapoulkine@gmail.com
committed
else PUGI__THROW_ERROR(status_bad_attribute, s);
arseny.kapoulkine@gmail.com
committed
else PUGI__THROW_ERROR(status_bad_attribute, s);
}
else if (*s == '/')
{
++s;
if (*s == '>')
{
arseny.kapoulkine@gmail.com
committed
PUGI__POPNODE();
s++;
break;
}
else if (*s == 0 && endch == '>')
{
arseny.kapoulkine@gmail.com
committed
PUGI__POPNODE();
arseny.kapoulkine@gmail.com
committed
else PUGI__THROW_ERROR(status_bad_start_element, s);
}
else if (*s == '>')
{
++s;
break;
}
else if (*s == 0 && endch == '>')
{
break;
}
arseny.kapoulkine@gmail.com
committed
else PUGI__THROW_ERROR(status_bad_start_element, s);
}
// !!!
}
else if (ch == '/') // '<#.../'
{
if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s);
arseny.kapoulkine@gmail.com
committed
PUGI__POPNODE(); // Pop.
s += (*s == '>');
}
else if (ch == 0)
{
// we stepped over null terminator, backtrack & handle closing tag
--s;
arseny.kapoulkine@gmail.com
committed
if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s);
arseny.kapoulkine@gmail.com
committed
else PUGI__THROW_ERROR(status_bad_start_element, s);
}
else if (*s == '/')
{
// Closing tag: its name must match the name of the element the cursor is on
++s;
mark = s;
char_t* name = cursor->name;
if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, mark);
arseny.kapoulkine@gmail.com
committed
while (PUGI__IS_CHARTYPE(*s, ct_symbol))
if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, mark);
}
// Leftover characters in name mean the closing tag was shorter than the element name
if (*name)
{
arseny.kapoulkine@gmail.com
committed
if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s);
else PUGI__THROW_ERROR(status_end_element_mismatch, mark);
arseny.kapoulkine@gmail.com
committed
PUGI__POPNODE(); // Pop.
arseny.kapoulkine@gmail.com
committed
PUGI__SKIPWS();
if (*s == 0)
{
arseny.kapoulkine@gmail.com
committed
if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
arseny.kapoulkine@gmail.com
committed
if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
++s;
}
}
else if (*s == '?') // '<?...'
{
arseny.kapoulkine
committed
s = parse_question(s, cursor, optmsk, endch);
if (!s) return s;
assert(cursor);
// A declaration node left on the cursor means its attributes still have to be parsed
if (PUGI__NODETYPE(cursor) == node_declaration) goto LOC_ATTRIBUTES;
}
else if (*s == '!') // '<!...'
{
arseny.kapoulkine
committed
s = parse_exclamation(s, cursor, optmsk, endch);
if (!s) return s;
arseny.kapoulkine@gmail.com
committed
else if (*s == 0 && endch == '?') PUGI__THROW_ERROR(status_bad_pi, s);
else PUGI__THROW_ERROR(status_unrecognized_tag, s);
}
else
{
mark = s; // Save this offset while searching for a terminator.
arseny.kapoulkine@gmail.com
committed
PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here.
{
// We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
assert(mark != s);
if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single) || PUGI__OPTSET(parse_trim_pcdata))
{
continue;
}
else if (PUGI__OPTSET(parse_ws_pcdata_single))
{
if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue;
}
}
// Unless trimming is requested, rewind so the skipped leading whitespace is kept in the PCDATA
if (!PUGI__OPTSET(parse_trim_pcdata))
s = mark;
if (cursor->parent || PUGI__OPTSET(parse_fragment))
if (PUGI__OPTSET(parse_embed_pcdata) && cursor->parent && !cursor->first_child && !cursor->value)
{
cursor->value = s; // Save the offset.
}
else
{
PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
cursor->value = s; // Save the offset.
PUGI__POPNODE(); // Pop since this is a standalone.
}
s = strconv_pcdata(s);
if (!*s) break;
}
else
{
arseny.kapoulkine@gmail.com
committed
PUGI__SCANFOR(*s == '<'); // '...<'
++s;
}
// We're after '<'
goto LOC_TAG;
}
}
// check that last tag is closed
arseny.kapoulkine@gmail.com
committed
if (cursor != root) PUGI__THROW_ERROR(status_end_element_mismatch, s);
arseny.kapoulkine@gmail.com
committed
return s;
#ifdef PUGIXML_WCHAR_MODE
// Returns the position just past a leading byte order mark (a single U+FEFF unit in wide-character
// mode), or s itself when the buffer does not start with one.
static char_t* parse_skip_bom(char_t* s)
{
	unsigned int bom = 0xfeff;

	if (s[0] == static_cast<wchar_t>(bom))
		return s + 1;

	return s;
}
#else
// Returns the position just past a leading UTF-8 byte order mark (EF BB BF), or s itself when the
// buffer does not start with one.
static char_t* parse_skip_bom(char_t* s)
{
	if (s[0] != '\xef') return s;
	if (s[1] != '\xbb') return s;
	if (s[2] != '\xbf') return s;

	return s + 3;
}
#endif
// Reports whether node or any node reachable from it through next_sibling is an element node.
// A null node yields false.
static bool has_element_node_siblings(xml_node_struct* node)
{
	for (xml_node_struct* it = node; it; it = it->next_sibling)
	{
		if (PUGI__NODETYPE(it) == node_element)
			return true;
	}

	return false;
}
arseny.kapoulkine@gmail.com
committed
// Parses the mutable character buffer [buffer, buffer + length) into nodes attached to root.
//   buffer, length - characters to parse; the last character is overwritten with a null terminator
//                    and handed to the parser separately as endch
//   xmldoc         - document structure, used as the allocator the parser is constructed with
//   root           - node that receives the parsed content (it may already have children)
//   optmsk         - parse option bits, tested through PUGI__OPTSET
// Returns the parse result built from the parser's status and error offset. A successful parse is
// still turned into an error when the removed last character was a stray '<', or when no element
// node was produced and parse_fragment is not set.
static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk)
{
	// early-out for empty documents
	if (length == 0)
		return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element);

	// get last child of the root before parsing
	xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c + 0 : 0;

	// create parser on stack
	xml_parser parser(static_cast<xml_allocator*>(xmldoc));

	// save last character and make buffer zero-terminated (speeds up parsing)
	char_t endch = buffer[length - 1];
	buffer[length - 1] = 0;

	// skip BOM to make sure it does not end up as part of parse output
	char_t* buffer_data = parse_skip_bom(buffer);

	// perform actual parsing
	parser.parse_tree(buffer_data, root, optmsk, endch);

	xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
	assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);

	if (result)
	{
		// since we removed last character, we have to handle the only possible false positive (stray <)
		if (endch == '<')
			return make_parse_result(status_unrecognized_tag, length - 1);

		// check if there are any element nodes parsed
		// (only nodes added by this call are inspected: those after the previously last child)
		xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling + 0 : root->first_child+ 0;

		if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed))
			return make_parse_result(status_no_document_element, length - 1);
	}
	else
	{
		// roll back offset if it occurs on a null terminator in the source buffer
		if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0)
			result.offset--;
	}

	return result;
}
};
// Output facilities
arseny.kapoulkine@gmail.com
committed
// Returns the encoding in which char_t data is held: the wchar_t encoding in wide-character builds,
// UTF-8 otherwise.
PUGI__FN xml_encoding get_write_native_encoding()
{
#ifndef PUGIXML_WCHAR_MODE
	return encoding_utf8;
#else
	return get_wchar_encoding();
#endif
}
arseny.kapoulkine@gmail.com
committed
// Resolves a user-requested output encoding to a concrete one: the generic wchar / utf16 / utf32
// values become their platform-specific variants, encoding_auto becomes UTF-8, and every other
// value is returned unchanged.
PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding)
{
	switch (encoding)
	{
	case encoding_wchar:
		// replace wchar encoding with utf implementation
		return get_wchar_encoding();

	case encoding_utf16:
		// pick the utf16 flavour matching this machine's endianness
		return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;

	case encoding_utf32:
		// pick the utf32 flavour matching this machine's endianness
		return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;

	case encoding_auto:
		// nothing explicit was requested: assume utf8
		return encoding_utf8;

	default:
		// an explicit, already concrete encoding
		return encoding;
	}
}
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
// Converts length char_t units from data through decoder D and writer T into dest.
// Returns the number of bytes written to dest.
template <typename D, typename T> PUGI__FN size_t convert_buffer_output_generic(typename T::value_type dest, const char_t* data, size_t length, D, T)
{
	PUGI__STATIC_ASSERT(sizeof(char_t) == sizeof(typename D::type));

	typedef typename D::type input_unit;
	typedef typename T::value_type output_ptr;

	const input_unit* input = reinterpret_cast<const input_unit*>(data);
	output_ptr finish = D::process(input, length, dest, T());

	// element count times element size gives the byte count
	return static_cast<size_t>(finish - dest) * sizeof(*dest);
}
// Converts length char_t units from data through decoder D and writer T into dest, then
// byte-swaps every written element when opt_swap is set.
// Returns the number of bytes written to dest.
template <typename D, typename T> PUGI__FN size_t convert_buffer_output_generic(typename T::value_type dest, const char_t* data, size_t length, D, T, bool opt_swap)
{
	PUGI__STATIC_ASSERT(sizeof(char_t) == sizeof(typename D::type));

	typedef typename D::type input_unit;
	typedef typename T::value_type output_ptr;

	const input_unit* input = reinterpret_cast<const input_unit*>(data);
	output_ptr finish = D::process(input, length, dest, T());

	if (opt_swap)
		for (output_ptr cur = dest; cur != finish; ++cur)
			*cur = endian_swap(*cur);

	// element count times element size gives the byte count
	return static_cast<size_t>(finish - dest) * sizeof(*dest);
}
#ifdef PUGIXML_WCHAR_MODE
arseny.kapoulkine@gmail.com
committed
// Wide-character variant: returns how many of the first length units of data can be processed
// as one chunk without cutting a UTF-16 surrogate pair in half. Returns 0 for an empty range.
PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
{
	if (length < 1) return 0;

	// discard last character if it's the lead of a surrogate pair
	// (only relevant when wchar_t is 2 bytes wide; lead surrogates occupy 0xD800..0xDBFF)
	return (sizeof(wchar_t) == 2 && static_cast<unsigned int>(static_cast<uint16_t>(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length;
}
Arseny Kapoulkine
committed
// Wide-character variant: converts length char_t units from data into the requested encoding.
// The output goes to whichever of r_char / r_u8 / r_u16 / r_u32 matches the target encoding.
// Returns the number of bytes written; an unsupported encoding trips the assert and returns 0.
PUGI__FN size_t convert_buffer_output(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
{
	// only endian-swapping is required
	if (need_endian_swap_utf(encoding, get_wchar_encoding()))
	{
		convert_wchar_endian_swap(r_char, data, length);

		return length * sizeof(char_t);
	}

	// convert to utf8
	if (encoding == encoding_utf8)
		return convert_buffer_output_generic(r_u8, data, length, wchar_decoder(), utf8_writer());

	// convert to utf16
	if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
	{
		xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;

		// swap bytes afterwards when the requested endianness differs from the machine's
		return convert_buffer_output_generic(r_u16, data, length, wchar_decoder(), utf16_writer(), native_encoding != encoding);
	}

	// convert to utf32
	if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
	{
		xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;

		return convert_buffer_output_generic(r_u32, data, length, wchar_decoder(), utf32_writer(), native_encoding != encoding);
	}

	// convert to latin1
	if (encoding == encoding_latin1)
		return convert_buffer_output_generic(r_u8, data, length, wchar_decoder(), latin1_writer());

	assert(false && "Invalid encoding"); // unreachable
	return 0;
}
#else
arseny.kapoulkine@gmail.com
committed
// UTF-8 variant: returns how many of the first length bytes of data can be processed as one chunk
// without cutting a multi-byte sequence in half. Ranges shorter than 5 bytes yield 0.
PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
{
	if (length < 5) return 0;

	// look at the last four bytes, from the end backwards
	for (size_t i = 1; i <= 4; ++i)
	{
		uint8_t ch = static_cast<uint8_t>(data[length - i]);

		// either a standalone character or a leading one
		// (continuation bytes have the form 10xxxxxx); the chunk is cut right before it
		if ((ch & 0xc0) != 0x80) return length - i;
	}

	// there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk
	return length;
}
Arseny Kapoulkine
committed
// UTF-8 variant: converts length char_t units from data into the requested encoding.
// The output goes to whichever of r_u8 / r_u16 / r_u32 matches the target encoding; r_char is unused.
// Returns the number of bytes written; an unsupported encoding trips the assert and returns 0.
PUGI__FN size_t convert_buffer_output(char_t* /* r_char */, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
{
	// convert to utf16
	if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
	{
		xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;

		// swap bytes afterwards when the requested endianness differs from the machine's
		return convert_buffer_output_generic(r_u16, data, length, utf8_decoder(), utf16_writer(), native_encoding != encoding);
	}

	// convert to utf32
	if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
	{
		xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;

		return convert_buffer_output_generic(r_u32, data, length, utf8_decoder(), utf32_writer(), native_encoding != encoding);
	}

	// convert to latin1
	if (encoding == encoding_latin1)
		return convert_buffer_output_generic(r_u8, data, length, utf8_decoder(), latin1_writer());

	assert(false && "Invalid encoding"); // unreachable
	return 0;
}
#endif
class xml_buffered_writer
{
xml_buffered_writer(const xml_buffered_writer&);
xml_buffered_writer& operator=(const xml_buffered_writer&);
public:
xml_buffered_writer(xml_writer& writer_, xml_encoding user_encoding): writer(writer_), bufsize(0), encoding(get_write_encoding(user_encoding))
PUGI__STATIC_ASSERT(bufcapacity >= 8);
{
flush(buffer, bufsize);
bufsize = 0;
}
void flush(const char_t* data, size_t size)
{
if (size == 0) return;
// fast path, just write data
if (encoding == get_write_native_encoding())
writer.write(data, size * sizeof(char_t));
else
{
// convert chunk
Arseny Kapoulkine
committed
size_t result = convert_buffer_output(scratch.data_char, scratch.data_u8, scratch.data_u16, scratch.data_u32, data, size, encoding);
assert(result <= sizeof(scratch));
// write data
arseny.kapoulkine@gmail.com
committed
writer.write(scratch.data_u8, result);
void write_direct(const char_t* data, size_t length)
// flush the remaining buffer contents
flush();
// handle large chunks
if (length > bufcapacity)
{
if (encoding == get_write_native_encoding())
// fast path, can just write data chunk
writer.write(data, length * sizeof(char_t));
return;
}
// need to convert in suitable chunks
while (length > bufcapacity)
{
// get chunk size by selecting such number of characters that are guaranteed to fit into scratch buffer
// and form a complete codepoint sequence (i.e. discard start of last codepoint if necessary)
size_t chunk_size = get_valid_length(data, bufcapacity);
assert(chunk_size);
// convert chunk and write
flush(data, chunk_size);
// iterate
data += chunk_size;
length -= chunk_size;
// small tail is copied below
bufsize = 0;
}
memcpy(buffer + bufsize, data, length * sizeof(char_t));
bufsize += length;
}
void write_buffer(const char_t* data, size_t length)
{
size_t offset = bufsize;
if (offset + length <= bufcapacity)
{
memcpy(buffer + offset, data, length * sizeof(char_t));
bufsize = offset + length;
}
else
{
write_direct(data, length);
}
}
void write_string(const char_t* data)
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
// write the part of the string that fits in the buffer
size_t offset = bufsize;
while (*data && offset < bufcapacity)
buffer[offset++] = *data++;
// write the rest
if (offset < bufcapacity)
{
bufsize = offset;
}
else
{
// backtrack a bit if we have split the codepoint
size_t length = offset - bufsize;
size_t extra = length - get_valid_length(data - length, length);
bufsize = offset - extra;
write_direct(data - extra, strlength(data) + extra);
}
}
void write(char_t d0)
{
size_t offset = bufsize;
if (offset > bufcapacity - 1) offset = flush();
buffer[offset + 0] = d0;
bufsize = offset + 1;
}
void write(char_t d0, char_t d1)
{
size_t offset = bufsize;
if (offset > bufcapacity - 2) offset = flush();
buffer[offset + 0] = d0;
buffer[offset + 1] = d1;
bufsize = offset + 2;
}
void write(char_t d0, char_t d1, char_t d2)
{
size_t offset = bufsize;
if (offset > bufcapacity - 3) offset = flush();
buffer[offset + 0] = d0;
buffer[offset + 1] = d1;
buffer[offset + 2] = d2;
bufsize = offset + 3;
}
void write(char_t d0, char_t d1, char_t d2, char_t d3)
{
size_t offset = bufsize;
if (offset > bufcapacity - 4) offset = flush();
buffer[offset + 0] = d0;
buffer[offset + 1] = d1;
buffer[offset + 2] = d2;
buffer[offset + 3] = d3;
bufsize = offset + 4;
}
void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4)
{
size_t offset = bufsize;
if (offset > bufcapacity - 5) offset = flush();
buffer[offset + 0] = d0;
buffer[offset + 1] = d1;
buffer[offset + 2] = d2;
buffer[offset + 3] = d3;
buffer[offset + 4] = d4;
bufsize = offset + 5;
}
void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5)
{
size_t offset = bufsize;
if (offset > bufcapacity - 6) offset = flush();
buffer[offset + 0] = d0;
buffer[offset + 1] = d1;
buffer[offset + 2] = d2;
buffer[offset + 3] = d3;
buffer[offset + 4] = d4;
buffer[offset + 5] = d5;
bufsize = offset + 6;
}
// utf8 maximum expansion: x4 (-> utf32)
// utf16 maximum expansion: x2 (-> utf32)
// utf32 maximum expansion: x1
arseny.kapoulkine@gmail.com
committed
enum
{
bufcapacitybytes =
#ifdef PUGIXML_MEMORY_OUTPUT_STACK
PUGIXML_MEMORY_OUTPUT_STACK
#else
10240
#endif
,
bufcapacity = bufcapacitybytes / (sizeof(char_t) + 4)
};
char_t buffer[bufcapacity];
arseny.kapoulkine@gmail.com
committed
union
{
uint8_t data_u8[4 * bufcapacity];
uint16_t data_u16[2 * bufcapacity];
uint32_t data_u32[bufcapacity];
char_t data_char[bufcapacity];
} scratch;
xml_writer& writer;
size_t bufsize;
xml_encoding encoding;
};
arseny.kapoulkine@gmail.com
committed
// Writes the null-terminated string s to writer, replacing every character that the chartypex
// table marks with type by an escape: '&', '<', '>' and '"' become named entities, any other
// marked character (asserted to be below 32) becomes a two-digit decimal character reference.
PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type)
{
	while (*s)
	{
		const char_t* prev = s;

		// While *s is a usual symbol
		PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type));

		// emit the run of ordinary characters in one go
		writer.write_buffer(prev, static_cast<size_t>(s - prev));

		switch (*s)
		{
			case 0: break;
			case '&':
				writer.write('&', 'a', 'm', 'p', ';');
				++s;
				break;
			case '<':
				writer.write('&', 'l', 't', ';');
				++s;
				break;
			case '>':
				writer.write('&', 'g', 't', ';');
				++s;
				break;
			case '"':
				writer.write('&', 'q', 'u', 'o', 't', ';');
				++s;
				break;
			default: // s is not a usual symbol
			{
				unsigned int ch = static_cast<unsigned int>(*s++);
				assert(ch < 32);

				// two decimal digits are enough because ch < 32
				writer.write('&', '#', static_cast<char_t>((ch / 10) + '0'), static_cast<char_t>((ch % 10) + '0'), ';');
			}
		}
	}
}
arseny.kapoulkine@gmail.com
committed
// Writes the string s to writer: verbatim when flags contains format_no_escapes, otherwise
// through text_output_escaped using the given character class.
PUGI__FN void text_output(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags)
{
	const bool raw = (flags & format_no_escapes) != 0;

	if (!raw)
	{
		text_output_escaped(writer, s, type);
		return;
	}

	writer.write_string(s);
}
arseny.kapoulkine@gmail.com
committed
// Writes s as one or more CDATA sections. Because "]]>" would end a section early, the text is
// split after the "]]" of each such sequence and the '>' starts the next section.
// An empty string still produces a single empty section.
PUGI__FN void text_output_cdata(xml_buffered_writer& writer, const char_t* s)
{
	for (;;)
	{
		// opening "<![CDATA["
		writer.write('<', '!', '[', 'C', 'D');
		writer.write('A', 'T', 'A', '[');

		const char_t* chunk = s;

		// advance to the end of the string or to the next "]]>"
		while (*s)
		{
			if (s[0] == ']' && s[1] == ']' && s[2] == '>') break;
			++s;
		}

		// stopped on "]]>": keep "]]" in this section and leave '>' for the next one
		if (*s) s += 2;

		writer.write_buffer(chunk, static_cast<size_t>(s - chunk));

		// closing "]]>"
		writer.write(']', ']', '>');

		if (!*s) break;
	}
}
PUGI__FN void text_output_indent(xml_buffered_writer& writer, const char_t* indent, size_t indent_length, unsigned int depth)
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
switch (indent_length)
{
case 1:
{
for (unsigned int i = 0; i < depth; ++i)
writer.write(indent[0]);
break;
}
case 2:
{
for (unsigned int i = 0; i < depth; ++i)
writer.write(indent[0], indent[1]);
break;
}
case 3:
{
for (unsigned int i = 0; i < depth; ++i)
writer.write(indent[0], indent[1], indent[2]);
break;
}