Newer
Older
}
else
{
compact_set_value<header_offset>(this, value);
_data = 255;
if (_data < 255)
{
xml_memory_page* page = compact_get_page(this, header_offset);
// round-trip through void* to silence 'cast increases required alignment of target type' warnings
const uint16_t* base = reinterpret_cast<const uint16_t*>(static_cast<const void*>(reinterpret_cast<const char*>(this) - base_offset));
assert(*base);
ptrdiff_t offset = ((*base - 1) << 7) + (_data - 1);
return page->compact_string_base + offset;
}
else
{
return compact_get_value<header_offset, char_t>(this);
}
}
else
return 0;
}
private:
};
PUGI__NS_END
#endif
#ifdef PUGIXML_COMPACT
namespace pugi
{
struct xml_attribute_struct
{
xml_attribute_struct(impl::xml_memory_page* page): header(page, 0), namevalue_base(0)
PUGI__STATIC_ASSERT(sizeof(xml_attribute_struct) == 8);
}
impl::compact_header header;
uint16_t namevalue_base;
impl::compact_string<4, 2> name;
impl::compact_string<5, 3> value;
impl::compact_pointer<xml_attribute_struct, 6> prev_attribute_c;
impl::compact_pointer<xml_attribute_struct, 7, 0> next_attribute;
};
struct xml_node_struct
{
xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(page, type), namevalue_base(0)
PUGI__STATIC_ASSERT(sizeof(xml_node_struct) == 12);
}
impl::compact_header header;
uint16_t namevalue_base;
impl::compact_string<4, 2> name;
impl::compact_string<5, 3> value;
impl::compact_pointer_parent<xml_node_struct, 6> parent;
impl::compact_pointer<xml_node_struct, 8, 0> first_child;
impl::compact_pointer<xml_node_struct, 9> prev_sibling_c;
impl::compact_pointer<xml_node_struct, 10, 0> next_sibling;
impl::compact_pointer<xml_attribute_struct, 11, 0> first_attribute;
namespace pugi
{
struct xml_attribute_struct
{
xml_attribute_struct(impl::xml_memory_page* page): name(0), value(0), prev_attribute_c(0), next_attribute(0)
header = PUGI__GETHEADER_IMPL(this, page, 0);
}
uintptr_t header;
char_t* name;
char_t* value;
xml_attribute_struct* prev_attribute_c;
xml_attribute_struct* next_attribute;
};
struct xml_node_struct
{
xml_node_struct(impl::xml_memory_page* page, xml_node_type type): name(0), value(0), parent(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0)
header = PUGI__GETHEADER_IMPL(this, page, type);
}
uintptr_t header;
char_t* name;
char_t* value;
xml_node_struct* parent;
xml_node_struct* first_child;
xml_node_struct* prev_sibling_c;
xml_node_struct* next_sibling;
xml_attribute_struct* first_attribute;
arseny.kapoulkine@gmail.com
committed
PUGI__NS_BEGIN
arseny.kapoulkine@gmail.com
committed
// Entry of a singly linked list of character buffers (linked through 'next')
struct xml_extra_buffer
{
// buffer referenced by this entry
char_t* buffer;
// following entry in the list
xml_extra_buffer* next;
};
struct xml_document_struct: public xml_node_struct, public xml_allocator
xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0), extra_buffers(0)
{
}
const char_t* buffer;
arseny.kapoulkine@gmail.com
committed
xml_extra_buffer* extra_buffers;
#ifdef PUGIXML_COMPACT
compact_hash_table hash;
#endif
// Returns the allocator recorded in the memory page that holds 'object'.
// 'object' must not be null.
template <typename Object> inline xml_allocator& get_allocator(const Object* object)
{
    assert(object);

    return *PUGI__GETPAGE(object)->allocator;
}
// Returns the document that holds 'object': the allocator stored in the object's memory page
// is downcast to xml_document_struct (which derives from xml_allocator).
// 'object' must not be null.
template <typename Object> inline xml_document_struct& get_document(const Object* object)
{
    assert(object);

    return *static_cast<xml_document_struct*>(PUGI__GETPAGE(object)->allocator);
}
arseny.kapoulkine@gmail.com
committed
PUGI__NS_END
// Low-level DOM operations
arseny.kapoulkine@gmail.com
committed
PUGI__NS_BEGIN
// Allocates storage for one attribute from 'alloc' and constructs it in place.
// Returns null when the allocator cannot provide memory; callers such as
// append_new_attribute already test the result for null.
inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc)
{
    xml_memory_page* page;
    void* memory = alloc.allocate_object(sizeof(xml_attribute_struct), page);

    // never run placement new on a null pointer: that is undefined behavior
    if (!memory) return 0;

    return new (memory) xml_attribute_struct(page);
}
// Allocates storage for one node of the given type from 'alloc' and constructs it in place.
// Returns null when the allocator cannot provide memory; callers such as
// append_new_node already test the result for null.
inline xml_node_struct* allocate_node(xml_allocator& alloc, xml_node_type type)
{
    // receives the page that owns the allocation (same pattern as allocate_attribute)
    xml_memory_page* page;
    void* memory = alloc.allocate_object(sizeof(xml_node_struct), page);

    // never run placement new on a null pointer: that is undefined behavior
    if (!memory) return 0;

    return new (memory) xml_node_struct(page, type);
}
// Releases an attribute: first the name and value strings whose "allocated" header flags are set,
// then the attribute object itself.
inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc)
{
    // name string, only when flagged as allocated
    if ((a->header & impl::xml_memory_page_name_allocated_mask) != 0)
    {
        alloc.deallocate_string(a->name);
    }

    // value string, only when flagged as allocated
    if ((a->header & impl::xml_memory_page_value_allocated_mask) != 0)
    {
        alloc.deallocate_string(a->value);
    }

    // finally hand the object memory back to its page
    alloc.deallocate_memory(a, sizeof(xml_attribute_struct), PUGI__GETPAGE(a));
}
// Releases a node together with everything hanging off it: flagged name/value strings,
// every attribute, every child subtree (recursively), and finally the node object itself.
inline void destroy_node(xml_node_struct* n, xml_allocator& alloc)
{
    // strings are released only when the header marks them as allocated
    if (n->header & impl::xml_memory_page_name_allocated_mask)
        alloc.deallocate_string(n->name);

    if (n->header & impl::xml_memory_page_value_allocated_mask)
        alloc.deallocate_string(n->value);

    // the successor is read before each element is destroyed
    for (xml_attribute_struct* attr = n->first_attribute; attr; )
    {
        xml_attribute_struct* next = attr->next_attribute;

        destroy_attribute(attr, alloc);

        attr = next;
    }

    for (xml_node_struct* child = n->first_child; child; )
    {
        xml_node_struct* next = child->next_sibling;

        destroy_node(child, alloc);

        child = next;
    }

    alloc.deallocate_memory(n, sizeof(xml_node_struct), PUGI__GETPAGE(n));
}
// Links 'child' as the last child of 'node'.
// The first child's prev_sibling_c is used as the pointer to the last child,
// so appending does not need to walk the sibling list.
inline void append_node(xml_node_struct* child, xml_node_struct* node)
{
    child->parent = node;

    xml_node_struct* head = node->first_child;

    if (head)
    {
        xml_node_struct* tail = head->prev_sibling_c;

        tail->next_sibling = child;
        child->prev_sibling_c = tail;
        head->prev_sibling_c = child;
    }
    else
    {
        // only child: it is its own "last child"
        node->first_child = child;
        child->prev_sibling_c = child;
    }
}
// Links 'child' as the first child of 'node'.
inline void prepend_node(xml_node_struct* child, xml_node_struct* node)
{
    child->parent = node;

    xml_node_struct* first = node->first_child;

    if (!first)
    {
        // list was empty: the new node refers to itself as the last child
        child->prev_sibling_c = child;
    }
    else
    {
        // the new first child takes over the previous first child's back pointer
        child->prev_sibling_c = first->prev_sibling_c;
        first->prev_sibling_c = child;
    }

    child->next_sibling = first;
    node->first_child = child;
}
// Links 'child' into the sibling list immediately after 'node' (same parent as 'node').
inline void insert_node_after(xml_node_struct* child, xml_node_struct* node)
{
    xml_node_struct* parent = node->parent;

    child->parent = parent;

    if (node->next_sibling)
        node->next_sibling->prev_sibling_c = child;
    else
        // 'node' was the last child: update the last-child pointer kept on the first child
        parent->first_child->prev_sibling_c = child;

    child->next_sibling = node->next_sibling;
    child->prev_sibling_c = node;

    // complete the forward link; without it 'child' is unreachable when walking next_sibling
    node->next_sibling = child;
}
// Links 'child' into the sibling list immediately before 'node' (same parent as 'node').
inline void insert_node_before(xml_node_struct* child, xml_node_struct* node)
{
    xml_node_struct* parent = node->parent;

    child->parent = parent;

    // a predecessor whose next_sibling is set is a real previous sibling; otherwise 'node'
    // is the first child and its prev_sibling_c refers to the last child
    if (node->prev_sibling_c->next_sibling)
        node->prev_sibling_c->next_sibling = child;
    else
        parent->first_child = child;

    child->prev_sibling_c = node->prev_sibling_c;
    child->next_sibling = node;

    // complete the backward link from 'node' to the inserted node
    node->prev_sibling_c = child;
}
// Unlinks 'node' from its parent's child list and clears its parent/sibling links.
// The node itself is not destroyed.
inline void remove_node(xml_node_struct* node)
{
    xml_node_struct* parent = node->parent;

    if (node->next_sibling)
        node->next_sibling->prev_sibling_c = node->prev_sibling_c;
    else
        // 'node' was the last child: only then does the last-child pointer on the first child move
        parent->first_child->prev_sibling_c = node->prev_sibling_c;

    if (node->prev_sibling_c->next_sibling)
        node->prev_sibling_c->next_sibling = node->next_sibling;
    else
        // 'node' was the first child
        parent->first_child = node->next_sibling;

    node->parent = 0;
    node->prev_sibling_c = 0;
    node->next_sibling = 0;
}
// Links 'attr' as the last attribute of 'node'.
// The first attribute's prev_attribute_c is used as the pointer to the last attribute.
inline void append_attribute(xml_attribute_struct* attr, xml_node_struct* node)
{
    xml_attribute_struct* first = node->first_attribute;

    if (!first)
    {
        // empty list: the attribute becomes the head and refers to itself as the last one
        node->first_attribute = attr;
        attr->prev_attribute_c = attr;
        return;
    }

    xml_attribute_struct* last = first->prev_attribute_c;

    last->next_attribute = attr;
    attr->prev_attribute_c = last;
    first->prev_attribute_c = attr;
}
// Links 'attr' as the first attribute of 'node'.
inline void prepend_attribute(xml_attribute_struct* attr, xml_node_struct* node)
{
    xml_attribute_struct* first = node->first_attribute;

    if (!first)
    {
        // list was empty: the new attribute refers to itself as the last one
        attr->prev_attribute_c = attr;
    }
    else
    {
        // the new head takes over the previous head's back pointer
        attr->prev_attribute_c = first->prev_attribute_c;
        first->prev_attribute_c = attr;
    }

    attr->next_attribute = first;
    node->first_attribute = attr;
}
// Links 'attr' into the attribute list of 'node' immediately after 'place'.
inline void insert_attribute_after(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node)
{
    xml_attribute_struct* following = place->next_attribute;

    if (following)
    {
        following->prev_attribute_c = attr;
    }
    else
    {
        // 'place' was the last attribute: move the last-attribute pointer kept on the head
        node->first_attribute->prev_attribute_c = attr;
    }

    attr->next_attribute = following;
    attr->prev_attribute_c = place;
    place->next_attribute = attr;
}
// Links 'attr' into the attribute list of 'node' immediately before 'place'.
inline void insert_attribute_before(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node)
{
    xml_attribute_struct* before = place->prev_attribute_c;

    if (before->next_attribute)
    {
        // 'before' is a real predecessor
        before->next_attribute = attr;
    }
    else
    {
        // 'place' was the head, so the new attribute becomes the head
        node->first_attribute = attr;
    }

    attr->prev_attribute_c = before;
    attr->next_attribute = place;
    place->prev_attribute_c = attr;
}
// Unlinks 'attr' from the attribute list of 'node' and clears its links.
// The attribute itself is not destroyed.
inline void remove_attribute(xml_attribute_struct* attr, xml_node_struct* node)
{
    if (attr->next_attribute)
        attr->next_attribute->prev_attribute_c = attr->prev_attribute_c;
    else
        // 'attr' was the last attribute: only then does the last-attribute pointer on the head move
        node->first_attribute->prev_attribute_c = attr->prev_attribute_c;

    if (attr->prev_attribute_c->next_attribute)
        attr->prev_attribute_c->next_attribute = attr->next_attribute;
    else
        // 'attr' was the head
        node->first_attribute = attr->next_attribute;

    attr->prev_attribute_c = 0;
    attr->next_attribute = 0;
}
// Creates a node of the given type and appends it as the last child of 'node'.
// Returns the new node, or null when alloc.reserve() fails or the allocation fails.
PUGI__FN_NO_INLINE xml_node_struct* append_new_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element)
{
    if (alloc.reserve())
    {
        if (xml_node_struct* created = allocate_node(alloc, type))
        {
            append_node(created, node);

            return created;
        }
    }

    return 0;
}
// Creates an attribute and appends it as the last attribute of 'node'.
// Returns the new attribute, or null when alloc.reserve() fails or the allocation fails.
PUGI__FN_NO_INLINE xml_attribute_struct* append_new_attribute(xml_node_struct* node, xml_allocator& alloc)
{
    if (!alloc.reserve()) return 0;

    xml_attribute_struct* attr = allocate_attribute(alloc);
    if (!attr) return 0;

    append_attribute(attr, node);

    return attr;
}
arseny.kapoulkine@gmail.com
committed
PUGI__NS_END
// Helper classes for code generation
arseny.kapoulkine@gmail.com
committed
PUGI__NS_BEGIN
// Compile-time "false" tag: exposes the constant 'value' equal to 0
struct opt_false { enum { value = 0 }; };
// Compile-time "true" tag: exposes the constant 'value' equal to 1
struct opt_true { enum { value = 1 }; };
arseny.kapoulkine@gmail.com
committed
PUGI__NS_END
// Unicode utilities
arseny.kapoulkine@gmail.com
committed
PUGI__NS_BEGIN
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
// Exchanges the two bytes of a 16-bit value
inline uint16_t endian_swap(uint16_t value)
{
    const unsigned int low_byte = value & 0xff;
    const unsigned int high_byte = value >> 8;

    return static_cast<uint16_t>((low_byte << 8) | high_byte);
}
// Reverses the order of the four bytes of a 32-bit value
inline uint32_t endian_swap(uint32_t value)
{
    const uint32_t byte0 = value & 0xff;
    const uint32_t byte1 = value & 0xff00;
    const uint32_t byte2 = value & 0xff0000;
    const uint32_t byte3 = value >> 24;

    return (byte0 << 24) | (byte1 << 8) | (byte2 >> 8) | byte3;
}
// Output traits that count how many UTF-8 bytes the given code points need
struct utf8_counter
{
    typedef size_t value_type;

    // code points below U+10000: 1, 2 or 3 bytes
    static value_type low(value_type result, uint32_t ch)
    {
        // U+0800..U+FFFF
        size_t length = 3;

        // U+0000..U+007F
        if (ch < 0x80) length = 1;
        // U+0080..U+07FF
        else if (ch < 0x800) length = 2;

        return result + length;
    }

    // U+10000..U+10FFFF: always 4 bytes
    static value_type high(value_type result, uint32_t)
    {
        return result + 4;
    }
};
// Output traits that encode code points as UTF-8 into a byte buffer;
// every function returns the position just past the bytes it wrote
struct utf8_writer
{
    typedef uint8_t* value_type;

    // code points below U+10000
    static value_type low(value_type result, uint32_t ch)
    {
        // U+0800..U+FFFF: three bytes
        if (ch >= 0x800)
        {
            *result++ = static_cast<uint8_t>(0xE0 | (ch >> 12));
            *result++ = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
            *result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
            return result;
        }

        // U+0080..U+07FF: two bytes
        if (ch >= 0x80)
        {
            *result++ = static_cast<uint8_t>(0xC0 | (ch >> 6));
            *result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
            return result;
        }

        // U+0000..U+007F: one byte
        *result++ = static_cast<uint8_t>(ch);
        return result;
    }

    // U+10000..U+10FFFF: four bytes
    static value_type high(value_type result, uint32_t ch)
    {
        *result++ = static_cast<uint8_t>(0xF0 | (ch >> 18));
        *result++ = static_cast<uint8_t>(0x80 | ((ch >> 12) & 0x3F));
        *result++ = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
        *result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
        return result;
    }

    // picks low() or high() from the code point value
    static value_type any(value_type result, uint32_t ch)
    {
        if (ch < 0x10000) return low(result, ch);

        return high(result, ch);
    }
};
// Output traits that count how many UTF-16 code units the given code points need
struct utf16_counter
{
    typedef size_t value_type;

    // code points below U+10000 take one unit
    static value_type low(value_type result, uint32_t)
    {
        const size_t units = 1;
        return result + units;
    }

    // code points from U+10000 take two units
    static value_type high(value_type result, uint32_t)
    {
        const size_t units = 2;
        return result + units;
    }
};
// Output traits that encode code points as UTF-16 code units;
// every function returns the position just past the units it wrote
struct utf16_writer
{
    typedef uint16_t* value_type;

    // code points below U+10000: a single code unit
    static value_type low(value_type result, uint32_t ch)
    {
        *result = static_cast<uint16_t>(ch);

        return result + 1;
    }

    // code points from U+10000: a surrogate pair
    static value_type high(value_type result, uint32_t ch)
    {
        // upper and lower 10 bits of the offset from U+10000
        uint32_t msh = static_cast<uint32_t>(ch - 0x10000) >> 10;
        uint32_t lsh = static_cast<uint32_t>(ch - 0x10000) & 0x3ff;

        result[0] = static_cast<uint16_t>(0xD800 + msh);
        result[1] = static_cast<uint16_t>(0xDC00 + lsh);

        return result + 2;
    }

    // picks low() or high() from the code point value
    static value_type any(value_type result, uint32_t ch)
    {
        return (ch < 0x10000) ? low(result, ch) : high(result, ch);
    }
};
// Output traits that count UTF-32 code units: one per code point, whatever its value
struct utf32_counter
{
    typedef size_t value_type;

    static value_type low(value_type result, uint32_t)
    {
        const size_t units = 1;
        return result + units;
    }

    static value_type high(value_type result, uint32_t)
    {
        const size_t units = 1;
        return result + units;
    }
};
// Output traits that store code points unchanged as UTF-32 code units;
// every function returns the position just past the unit it wrote
struct utf32_writer
{
    typedef uint32_t* value_type;

    static value_type low(value_type result, uint32_t ch)
    {
        result[0] = ch;
        return &result[1];
    }

    static value_type high(value_type result, uint32_t ch)
    {
        result[0] = ch;
        return &result[1];
    }

    static value_type any(value_type result, uint32_t ch)
    {
        result[0] = ch;
        return &result[1];
    }
};
arseny.kapoulkine
committed
// Output traits that write one byte per code point; code points above 255
// cannot be represented and are replaced with '?'
struct latin1_writer
{
    typedef uint8_t* value_type;

    // code points below U+10000: stored as-is when they fit into a byte
    static value_type low(value_type result, uint32_t ch)
    {
        *result = static_cast<uint8_t>(ch > 255 ? '?' : ch);

        return result + 1;
    }

    // code points from U+10000 never fit into a byte
    static value_type high(value_type result, uint32_t ch)
    {
        (void)ch;

        *result = '?';

        return result + 1;
    }
};
// Decodes 'size' bytes of UTF-8 starting at 'data' and feeds every decoded code point to Traits:
// Traits::low for code points from 1..3 byte sequences, Traits::high for 4 byte sequences.
// 'result' is threaded through the Traits calls and the final value is returned.
// Bytes that do not start a complete, well-formed sequence are skipped one at a time.
template <typename Traits> static inline typename Traits::value_type process(const uint8_t* data, size_t size, typename Traits::value_type result, Traits)
{
    const uint8_t utf8_byte_mask = 0x3f;

    while (size)
    {
        uint8_t lead = *data;

        // 0xxxxxxx -> U+0000..U+007F
        if (lead < 0x80)
        {
            result = Traits::low(result, lead);
            data += 1;
            size -= 1;

            // process aligned single-byte (ascii) blocks
            if ((reinterpret_cast<uintptr_t>(data) & 3) == 0)
            {
                // round-trip through void* to silence 'cast increases required alignment of target type' warnings
                while (size >= 4 && (*static_cast<const uint32_t*>(static_cast<const void*>(data)) & 0x80808080) == 0)
                {
                    result = Traits::low(result, data[0]);
                    result = Traits::low(result, data[1]);
                    result = Traits::low(result, data[2]);
                    result = Traits::low(result, data[3]);
                    data += 4;
                    size -= 4;
                }
            }
        }
        // 110xxxxx -> U+0080..U+07FF
        else if (static_cast<unsigned int>(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80)
        {
            result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask));
            data += 2;
            size -= 2;
        }
        // 1110xxxx -> U+0800-U+FFFF
        else if (static_cast<unsigned int>(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
        {
            result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask));
            data += 3;
            size -= 3;
        }
        // 11110xxx -> U+10000..U+10FFFF
        else if (static_cast<unsigned int>(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
        {
            result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask));
            data += 4;
            size -= 4;
        }
        // 10xxxxxx or 11111xxx -> invalid
        else
        {
            data += 1;
            size -= 1;
        }
    }

    return result;
}
};
template <typename opt_swap> struct utf16_decoder
{
typedef uint16_t type;
template <typename Traits> static inline typename Traits::value_type process(const uint16_t* data, size_t size, typename Traits::value_type result, Traits)
uint16_t lead = opt_swap::value ? endian_swap(*data) : *data;
// U+0000..U+D7FF
if (lead < 0xD800)
{
result = Traits::low(result, lead);
data += 1;
}
// U+E000..U+FFFF
else if (static_cast<unsigned int>(lead - 0xE000) < 0x2000)
{
result = Traits::low(result, lead);
data += 1;
}
// surrogate pair lead
else if (static_cast<unsigned int>(lead - 0xD800) < 0x400 && size >= 2)
{
uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1];
if (static_cast<unsigned int>(next - 0xDC00) < 0x400)
{
result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff));
data += 2;
}
else
{
data += 1;
}
}
else
{
data += 1;
}
}
return result;
}
};
// UTF-32 decoder; when opt_swap::value is non-zero every code unit is byte-swapped before use
template <typename opt_swap> struct utf32_decoder
{
    typedef uint32_t type;

    // Feeds each of the 'size' code units starting at 'data' to Traits:
    // Traits::low for values below 0x10000, Traits::high for everything else.
    template <typename Traits> static inline typename Traits::value_type process(const uint32_t* data, size_t size, typename Traits::value_type result, Traits)
    {
        while (size)
        {
            uint32_t lead = opt_swap::value ? endian_swap(*data) : *data;

            // U+0000..U+FFFF
            if (lead < 0x10000)
            {
                result = Traits::low(result, lead);
                data += 1;
                size -= 1;
            }
            // U+10000..U+10FFFF
            else
            {
                result = Traits::high(result, lead);
                data += 1;
                size -= 1;
            }
        }

        return result;
    }
};
// Decoder for one-byte-per-character input: every byte is passed to Traits::low as a code point
struct latin1_decoder
{
    typedef uint8_t type;

    // Feeds each of the 'size' bytes starting at 'data' to Traits::low and returns the final result.
    // Nothing is read when 'size' is zero.
    template <typename Traits> static inline typename Traits::value_type process(const uint8_t* data, size_t size, typename Traits::value_type result, Traits)
    {
        while (size)
        {
            result = Traits::low(result, *data);
            data += 1;
            size -= 1;
        }

        return result;
    }
};
arseny.kapoulkine@gmail.com
committed
template <size_t size> struct wchar_selector;
arseny.kapoulkine@gmail.com
committed
// Selection used when sizeof(wchar_t) == 2: wide characters are treated as UTF-16 code units
template <> struct wchar_selector<2>
{
    // code unit type
    typedef uint16_t type;

    // decoding side (no byte swapping)
    typedef utf16_decoder<opt_false> decoder;

    // encoding side
    typedef utf16_writer writer;
    typedef utf16_counter counter;
};
// Selection used when sizeof(wchar_t) == 4: wide characters are treated as UTF-32 code units
template <> struct wchar_selector<4>
{
    // code unit type
    typedef uint32_t type;

    // decoding side (no byte swapping)
    typedef utf32_decoder<opt_false> decoder;

    // encoding side
    typedef utf32_writer writer;
    typedef utf32_counter counter;
};
// Counter and writer traits for wide characters, selected by the size of wchar_t
typedef wchar_selector<sizeof(wchar_t)>::counter wchar_counter;
typedef wchar_selector<sizeof(wchar_t)>::writer wchar_writer;
arseny.kapoulkine@gmail.com
committed
struct wchar_decoder
{
typedef wchar_t type;
template <typename Traits> static inline typename Traits::value_type process(const wchar_t* data, size_t size, typename Traits::value_type result, Traits traits)
typedef wchar_selector<sizeof(wchar_t)>::decoder decoder;
return decoder::process(reinterpret_cast<const typename decoder::type*>(data), size, result, traits);
#ifdef PUGIXML_WCHAR_MODE
// Writes the byte-swapped form of each of the 'length' wide characters in 'data' to 'result'.
// Element i of the output depends only on element i of the input.
PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length)
{
    for (size_t i = 0; i < length; ++i)
        result[i] = static_cast<wchar_t>(endian_swap(static_cast<wchar_selector<sizeof(wchar_t)>::type>(data[i])));
}
#endif
arseny.kapoulkine@gmail.com
committed
PUGI__NS_END
arseny.kapoulkine@gmail.com
committed
PUGI__NS_BEGIN
// Character class bits stored in chartype_table; each class occupies one bit so they can be combined
enum chartype_t
{
    ct_parse_pcdata  = 1 << 0, // \0, &, \r, <
    ct_parse_attr    = 1 << 1, // \0, &, \r, ', "
    ct_parse_attr_ws = 1 << 2, // \0, &, \r, ', ", \n, tab
    ct_space         = 1 << 3, // \r, \n, space, tab
    ct_parse_cdata   = 1 << 4, // \0, ], >, \r
    ct_parse_comment = 1 << 5, // \0, -, >, \r
    ct_symbol        = 1 << 6, // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
    ct_start_symbol  = 1 << 7  // Any symbol > 127, a-z, A-Z, _, :
};
arseny.kapoulkine@gmail.com
committed
// Character class lookup table indexed by byte value; each entry is a combination of chartype_t bits
// (for example entry 0 is 55 = pcdata | attr | attr_ws | cdata | comment, entry 32 (space) is ct_space)
static const unsigned char chartype_table[256] =
{
55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31
8, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0, // 48-63
0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192, // 80-95
0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 96-111
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, // 112-127
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 128+
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
// Character class bits stored in chartypex_table; each class occupies one bit so they can be combined
enum chartypex_t
{
    ctx_special_pcdata = 1,   // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
    ctx_special_attr = 2,     // Any symbol >= 0 and < 32 (except \t), &, <, >, "
    ctx_start_symbol = 4,     // Any symbol > 127, a-z, A-Z, _
    ctx_digit = 8,            // 0-9
    ctx_symbol = 16           // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
};
arseny.kapoulkine@gmail.com
committed
// Character class lookup table indexed by byte value; each entry is a combination of the ctx_* bits
// (for example digits are 24 = ctx_digit | ctx_symbol, letters are 20 = ctx_start_symbol | ctx_symbol)
static const unsigned char chartypex_table[256] =
{
    3,  3,  3,  3,  3,  3,  3,  3,     3,  0,  2,  3,  3,  2,  3,  3,     // 0-15
    3,  3,  3,  3,  3,  3,  3,  3,     3,  3,  3,  3,  3,  3,  3,  3,     // 16-31
    0,  0,  2,  0,  0,  0,  3,  0,     0,  0,  0,  0,  0, 16, 16,  0,     // 32-47
    24, 24, 24, 24, 24, 24, 24, 24,    24, 24, 0,  0,  3,  0,  3,  0,     // 48-63

    0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 64-79
    20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  20,    // 80-95
    0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 96-111
    20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  0,     // 112-127

    20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 128+
    20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20
};
// Character class test: non-zero when character 'c' has any of the class bits 'ct' set in 'table'
#ifdef PUGIXML_WCHAR_MODE
    // wide characters: values of 128 and above all share table entry 128
    #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast<unsigned int>(c) < 128 ? table[static_cast<unsigned int>(c)] : table[128]) & (ct))
#else
    // narrow characters: the byte value indexes the table directly
    #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast<unsigned char>(c)] & (ct))
#endif

#define PUGI__IS_CHARTYPE(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartype_table)
#define PUGI__IS_CHARTYPEX(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartypex_table)
arseny.kapoulkine
committed
arseny.kapoulkine@gmail.com
committed
// Reports whether the lowest-addressed byte of an unsigned int holding 1 is itself 1
PUGI__FN bool is_little_endian()
{
    unsigned int probe = 1;
    unsigned char* first_byte = reinterpret_cast<unsigned char*>(&probe);

    return first_byte[0] == 1;
}
arseny.kapoulkine@gmail.com
committed
// Returns the encoding that corresponds to wchar_t on this platform:
// UTF-16 for 2-byte wchar_t, UTF-32 for 4-byte wchar_t, in the byte order reported by is_little_endian()
PUGI__FN xml_encoding get_wchar_encoding()
{
    // no other wchar_t sizes are supported
    PUGI__STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);

    if (sizeof(wchar_t) == 2)
        return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
    else
        return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
}
// Looks for encoding="..." (or encoding='...') inside an XML declaration at the start of 'data'.
// On success returns true and stores the position and length of the encoding name (without the quotes)
// in 'out_encoding' / 'out_length'; on failure returns false and leaves both outputs untouched.
PUGI__FN bool parse_declaration_encoding(const uint8_t* data, size_t size, const uint8_t*& out_encoding, size_t& out_length)
{
#define PUGI__SCANCHAR(ch) { if (offset >= size || data[offset] != ch) return false; offset++; }
#define PUGI__SCANCHARTYPE(ct) { while (offset < size && PUGI__IS_CHARTYPE(data[offset], ct)) offset++; }

    // check if we have a non-empty XML declaration
    if (size < 6 || !((data[0] == '<') & (data[1] == '?') & (data[2] == 'x') & (data[3] == 'm') & (data[4] == 'l') && PUGI__IS_CHARTYPE(data[5], ct_space)))
        return false;

    // scan XML declaration until the encoding field
    for (size_t i = 6; i + 1 < size; ++i)
    {
        // declaration can not contain ? in quoted values
        if (data[i] == '?')
            return false;

        if (data[i] == 'e' && data[i + 1] == 'n')
        {
            size_t offset = i;

            // encoding follows the version field which can't contain 'en' so this has to be the encoding if XML is well formed
            PUGI__SCANCHAR('e'); PUGI__SCANCHAR('n'); PUGI__SCANCHAR('c'); PUGI__SCANCHAR('o');
            PUGI__SCANCHAR('d'); PUGI__SCANCHAR('i'); PUGI__SCANCHAR('n'); PUGI__SCANCHAR('g');

            // S? = S?
            PUGI__SCANCHARTYPE(ct_space);
            PUGI__SCANCHAR('=');
            PUGI__SCANCHARTYPE(ct_space);

            // the only two valid delimiters are ' and "
            uint8_t delimiter = (offset < size && data[offset] == '"') ? '"' : '\'';

            PUGI__SCANCHAR(delimiter);

            // the encoding name is the run of symbol characters between the delimiters
            size_t start = offset;

            PUGI__SCANCHARTYPE(ct_symbol);

            size_t length = offset - start;

            PUGI__SCANCHAR(delimiter);

            // report the name only once the closing delimiter has been seen
            out_encoding = data + start;
            out_length = length;

            return true;
        }
    }

    return false;

#undef PUGI__SCANCHAR
#undef PUGI__SCANCHARTYPE
}
PUGI__FN xml_encoding guess_buffer_encoding(const uint8_t* data, size_t size)
{
// skip encoding autodetection if input buffer is too small
if (size < 4) return encoding_utf8;
uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3];
arseny.kapoulkine
committed
// look for BOM in first few bytes
if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be;
if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le;
if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be;
if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le;
if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8;
// look for <, <? or <?xm in various encodings
if (d0 == 0 && d1 == 0 && d2 == 0 && d3 == 0x3c) return encoding_utf32_be;
if (d0 == 0x3c && d1 == 0 && d2 == 0 && d3 == 0) return encoding_utf32_le;
if (d0 == 0 && d1 == 0x3c && d2 == 0 && d3 == 0x3f) return encoding_utf16_be;
if (d0 == 0x3c && d1 == 0 && d2 == 0x3f && d3 == 0) return encoding_utf16_le;
// look for utf16 < followed by node name (this may fail, but is better than utf8 since it's zero terminated so early)
if (d0 == 0 && d1 == 0x3c) return encoding_utf16_be;
if (d0 == 0x3c && d1 == 0) return encoding_utf16_le;
// no known BOM detected; parse declaration
const uint8_t* enc = 0;
size_t enc_length = 0;
if (d0 == 0x3c && d1 == 0x3f && d2 == 0x78 && d3 == 0x6d && parse_declaration_encoding(data, size, enc, enc_length))