Geant4 10.7.0
Toolkit for the simulation of the passage of particles through matter
Loading...
Searching...
No Matches
xmltok.cc
Go to the documentation of this file.
1/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2 See the file COPYING for copying permission.
3*/
4
5#if defined(__clang__) || defined(__GNUC__)
6#pragma GCC diagnostic ignored "-Wunused-parameter"
7#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
8#endif
9
10#include <stddef.h>
11
12#ifdef COMPILED_FROM_DSP
13#include "winconfig.h"
14#elif defined(MACOS_CLASSIC)
15#include "macconfig.h"
16#elif defined(__amigaos__)
17#include "amigaconfig.h"
18#elif defined(__WATCOMC__)
19#include "watcomconfig.h"
20#else
21#ifdef HAVE_EXPAT_CONFIG_H
22#include <expat_config.h>
23#endif
24#endif /* ndef COMPILED_FROM_DSP */
25
26#include "expat_external.h"
27#include "internal.h"
28#include "xmltok.h"
29#include "nametab.h"
30
31#ifdef XML_DTD
32#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
33#else
34#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
35#endif
36
/* Positional initializer fragment made of PREFIX()-named entry points:
   first a braced group of tokenizers (prolog, content, CDATA section and
   the optional ignore-section entry), then a braced group of literal
   tokenizers (attribute value, entity value), then the individual
   helpers.  PREFIX must be defined before this macro is expanded. */
#define VTABLE1 \
  { \
    PREFIX(prologTok), \
    PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE \
  }, \
  { \
    PREFIX(attributeValueTok), \
    PREFIX(entityValueTok) \
  }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-named UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
52
/* Test the naming bit of the character whose high byte is `hi` and low
   byte is `lo`.  pages[hi] selects a group of eight words in
   namingBitmap, the top 3 bits of `lo` select the word inside the group
   and the low 5 bits select the bit.  Evaluates to non-zero when the bit
   is set.  The mask is built from 1u so that selecting bit 31 never
   shifts into the sign bit of an int. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
55
/* A 2-byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 bits of the first byte and the bottom 6 bits of the
   second.  We need 8 bits to index into pages, 3 bits to add to that
   index and 5 bits to generate the mask.  The mask is built from 1u so
   that selecting bit 31 never shifts into the sign bit of an int.
   `byte` must point at unsigned char data.
*/
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) \
                + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))
65
/* A 3-byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the three bytes.  We need 8 bits to
   index into pages, 3 bits to add to that index and 5 bits to generate
   the mask.  The mask is built from 1u so that selecting bit 31 never
   shifts into the sign bit of an int.  `byte` must point at unsigned
   char data.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))
78
/* Naming-bit lookup for an n-byte UTF-8 sequence starting at p.  Only
   2- and 3-byte sequences are looked up; any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 3 \
   ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
   : ((n) == 2 \
      ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
      : 0))
85
86/* Detection of invalid UTF-8 sequences is based on Table 3.1B
87 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
88 with the additional restriction of not allowing the Unicode
89 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
90 Implementation details:
91 (A & 0x80) == 0 means A < 0x80
92 and
93 (A & 0xC0) == 0xC0 means A > 0xBF
94*/
95
/* Non-zero when the 2-byte sequence at p is not well-formed UTF-8: the
   lead byte is below 0xC2 or the second byte is not in 0x80..0xBF.
   p must point at unsigned char data; it may be any pointer expression
   (the argument is fully parenthesized). */
#define UTF8_INVALID2(p) \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
98
/* Non-zero when the 3-byte sequence at p is rejected:
     - third byte outside 0x80..0xBF, or above 0xBD after EF BF
       (which excludes EF BF BE and EF BF BF);
     - second byte below 0xA0 after lead E0, above 0x9F after lead ED,
       or outside 0x80..0xBF otherwise.
   p must point at unsigned char data; it may be any pointer expression
   (the argument is fully parenthesized). */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF \
       ? (p)[2] > 0xBD \
       : ((p)[2] & 0xC0) == 0xC0) \
   || ((p)[0] == 0xE0 \
       ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
       : ((p)[1] & 0x80) == 0 \
         || ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
115
/* Non-zero when the 4-byte sequence at p is rejected:
     - third or fourth byte outside 0x80..0xBF;
     - second byte below 0x90 after lead F0, above 0x8F after lead F4,
       or outside 0x80..0xBF otherwise.
   p must point at unsigned char data; it may be any pointer expression
   (the argument is fully parenthesized). */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
   || ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
   || ((p)[0] == 0xF0 \
       ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
       : ((p)[1] & 0x80) == 0 \
         || ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
128
129static int PTRFASTCALL
130isNever(const ENCODING *enc, const char *p)
131{
132 return 0;
133}
134
135static int PTRFASTCALL
136utf8_isName2(const ENCODING *enc, const char *p)
137{
138 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
139}
140
141static int PTRFASTCALL
142utf8_isName3(const ENCODING *enc, const char *p)
143{
144 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
145}
146
147#define utf8_isName4 isNever
148
149static int PTRFASTCALL
150utf8_isNmstrt2(const ENCODING *enc, const char *p)
151{
152 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
153}
154
155static int PTRFASTCALL
156utf8_isNmstrt3(const ENCODING *enc, const char *p)
157{
158 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
159}
160
161#define utf8_isNmstrt4 isNever
162
163static int PTRFASTCALL
164utf8_isInvalid2(const ENCODING *enc, const char *p)
165{
166 return UTF8_INVALID2((const unsigned char *)p);
167}
168
169static int PTRFASTCALL
170utf8_isInvalid3(const ENCODING *enc, const char *p)
171{
172 return UTF8_INVALID3((const unsigned char *)p);
173}
174
175static int PTRFASTCALL
176utf8_isInvalid4(const ENCODING *enc, const char *p)
177{
178 return UTF8_INVALID4((const unsigned char *)p);
179}
180
183 unsigned char type[256];
184#ifdef XML_MIN_SIZE
185 int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
186 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
187 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
188 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
189 int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
190#endif /* XML_MIN_SIZE */
191 int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
192 int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
193 int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
194 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
195 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
196 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
197 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
198 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
199 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
200};
201
202#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
203
204#ifdef XML_MIN_SIZE
205
206#define STANDARD_VTABLE(E) \
207 E ## byteType, \
208 E ## isNameMin, \
209 E ## isNmstrtMin, \
210 E ## byteToAscii, \
211 E ## charMatches,
212
213#else
214
215#define STANDARD_VTABLE(E) /* as nothing */
216
217#endif
218
219#define NORMAL_VTABLE(E) \
220 E ## isName2, \
221 E ## isName3, \
222 E ## isName4, \
223 E ## isNmstrt2, \
224 E ## isNmstrt3, \
225 E ## isNmstrt4, \
226 E ## isInvalid2, \
227 E ## isInvalid3, \
228 E ## isInvalid4
229
230static int FASTCALL checkCharRefNumber(int);
231
232#include "xmltok_impl.h"
233#include "ascii.h"
234
/* Settings for the single-byte ("normal_") instantiation of the
   tokenizer.  With XML_MIN_SIZE the sb_ min-size name predicates are
   aliases of isNever and the character width is read from the encoding
   at run time; otherwise the width is the constant 1. */
#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
246
/* Byte-type class of the single byte at p, read from the type[] table of
   the normal_encoding that enc points to. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

/* BYTE_TYPE is the direct table lookup in a normal build; with
   XML_MIN_SIZE it goes through the encoding's byteType function pointer,
   for which sb_byteType is the single-byte implementation. */
#ifndef XML_MIN_SIZE
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#else
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#endif
261
/* BYTE_TO_ASCII yields the byte at p itself in a normal build; with
   XML_MIN_SIZE it goes through the encoding's byteToAscii function
   pointer, for which sb_byteToAscii is the single-byte implementation. */
#ifndef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) (*(p))
#else
#define BYTE_TO_ASCII(enc, p) \
  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#endif
273
/* Multi-byte character predicates: n (2, 3 or 4) is pasted onto the
   member name, so the call goes through the matching isNameN /
   isNmstrtN / isInvalidN function pointer of the normal_encoding. */
#define IS_NAME_CHAR(enc, p, n) \
  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))

/* Predicates for a minimum-width non-ASCII character: constant 0 in a
   normal build, dispatched through the encoding with XML_MIN_SIZE. */
#ifndef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#else
#define IS_NAME_CHAR_MINBPC(enc, p) \
  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#endif
290
/* CHAR_MATCHES(enc, p, c): non-zero when the character at p equals the
   ASCII character c.  With XML_MIN_SIZE the test is dispatched through
   the encoding's charMatches function pointer (sb_charMatches is the
   single-byte implementation); otherwise it is a direct byte compare.
   The argument c is parenthesized so that an expression argument (for
   example a conditional) cannot change the meaning of the comparison. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
303
304#define PREFIX(ident) normal_ ## ident
305#define XML_TOK_IMPL_C
306#include "xmltok_impl.cc"
307#undef XML_TOK_IMPL_C
308
309#undef MINBPC
310#undef BYTE_TYPE
311#undef BYTE_TO_ASCII
312#undef CHAR_MATCHES
313#undef IS_NAME_CHAR
314#undef IS_NAME_CHAR_MINBPC
315#undef IS_NMSTRT_CHAR
316#undef IS_NMSTRT_CHAR_MINBPC
317#undef IS_INVALID_CHAR
318
319enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
323 UTF8_cval4 = 0xf0
325
326static void PTRCALL
327utf8_toUtf8(const ENCODING *enc,
328 const char **fromP, const char *fromLim,
329 char **toP, const char *toLim)
330{
331 char *to;
332 const char *from;
333 if (fromLim - *fromP > toLim - *toP) {
334 /* Avoid copying partial characters. */
335 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
336 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
337 break;
338 }
339 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
340 *to = *from;
341 *fromP = from;
342 *toP = to;
343}
344
345static void PTRCALL
346utf8_toUtf16(const ENCODING *enc,
347 const char **fromP, const char *fromLim,
348 unsigned short **toP, const unsigned short *toLim)
349{
350 unsigned short *to = *toP;
351 const char *from = *fromP;
352 while (from != fromLim && to != toLim) {
353 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
354 case BT_LEAD2:
355 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
356 from += 2;
357 break;
358 case BT_LEAD3:
359 *to++ = (unsigned short)(((from[0] & 0xf) << 12)
360 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
361 from += 3;
362 break;
363 case BT_LEAD4:
364 {
365 unsigned long n;
366 if (to + 1 == toLim)
367 goto after;
368 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
369 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
370 n -= 0x10000;
371 to[0] = (unsigned short)((n >> 10) | 0xD800);
372 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
373 to += 2;
374 from += 4;
375 }
376 break;
377 default:
378 *to++ = *from++;
379 break;
380 }
381 }
382after:
383 *fromP = from;
384 *toP = to;
385}
386
387#ifdef XML_NS
388static const struct normal_encoding utf8_encoding_ns = {
389 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
390 {
391#include "asciitab.h"
392#include "utf8tab.h"
393 },
395};
396#endif
397
398static const struct normal_encoding utf8_encoding = {
399 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
400 {
401#define BT_COLON BT_NMSTRT
402#include "asciitab.h"
403#undef BT_COLON
404#include "utf8tab.h"
405 },
407};
408
409#ifdef XML_NS
410
411static const struct normal_encoding internal_utf8_encoding_ns = {
412 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
413 {
414#include "iasciitab.h"
415#include "utf8tab.h"
416 },
418};
419
420#endif
421
422static const struct normal_encoding internal_utf8_encoding = {
423 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
424 {
425#define BT_COLON BT_NMSTRT
426#include "iasciitab.h"
427#undef BT_COLON
428#include "utf8tab.h"
429 },
431};
432
433static void PTRCALL
434latin1_toUtf8(const ENCODING *enc,
435 const char **fromP, const char *fromLim,
436 char **toP, const char *toLim)
437{
438 for (;;) {
439 unsigned char c;
440 if (*fromP == fromLim)
441 break;
442 c = (unsigned char)**fromP;
443 if (c & 0x80) {
444 if (toLim - *toP < 2)
445 break;
446 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
447 *(*toP)++ = (char)((c & 0x3f) | 0x80);
448 (*fromP)++;
449 }
450 else {
451 if (*toP == toLim)
452 break;
453 *(*toP)++ = *(*fromP)++;
454 }
455 }
456}
457
458static void PTRCALL
459latin1_toUtf16(const ENCODING *enc,
460 const char **fromP, const char *fromLim,
461 unsigned short **toP, const unsigned short *toLim)
462{
463 while (*fromP != fromLim && *toP != toLim)
464 *(*toP)++ = (unsigned char)*(*fromP)++;
465}
466
467#ifdef XML_NS
468
469static const struct normal_encoding latin1_encoding_ns = {
470 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
471 {
472#include "asciitab.h"
473#include "latin1tab.h"
474 },
475 STANDARD_VTABLE(sb_)
476};
477
478#endif
479
480static const struct normal_encoding latin1_encoding = {
481 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
482 {
483#define BT_COLON BT_NMSTRT
484#include "asciitab.h"
485#undef BT_COLON
486#include "latin1tab.h"
487 },
488 STANDARD_VTABLE(sb_)
489};
490
491static void PTRCALL
492ascii_toUtf8(const ENCODING *enc,
493 const char **fromP, const char *fromLim,
494 char **toP, const char *toLim)
495{
496 while (*fromP != fromLim && *toP != toLim)
497 *(*toP)++ = *(*fromP)++;
498}
499
500#ifdef XML_NS
501
502static const struct normal_encoding ascii_encoding_ns = {
503 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
504 {
505#include "asciitab.h"
506/* BT_NONXML == 0 */
507 },
508 STANDARD_VTABLE(sb_)
509};
510
511#endif
512
513static const struct normal_encoding ascii_encoding = {
514 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
515 {
516#define BT_COLON BT_NMSTRT
517#include "asciitab.h"
518#undef BT_COLON
519/* BT_NONXML == 0 */
520 },
521 STANDARD_VTABLE(sb_)
522};
523
524static int PTRFASTCALL
525unicode_byte_type(char hi, char lo)
526{
527 switch ((unsigned char)hi) {
528 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
529 return BT_LEAD4;
530 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
531 return BT_TRAIL;
532 case 0xFF:
533 switch ((unsigned char)lo) {
534 case 0xFF:
535 case 0xFE:
536 return BT_NONXML;
537 }
538 break;
539 }
540 return BT_NONASCII;
541}
542
/* Generate E##toUtf8, a converter from 2-byte units to UTF-8.  GET_LO and
   GET_HI must be defined at the point of expansion and select the low
   and high byte of the unit at a pointer.
     hi == 0 and lo < 0x80  -> 1 output byte
     hi below 0x08          -> 2 output bytes
     hi in 0xD8..0xDB       -> 4 output bytes; the following unit is
                               consumed as well, without comparing
                               against fromLim
     anything else          -> 3 output bytes
   When the next character does not fit in the output, *fromP is set to
   that character and the function returns. */
#define DEFINE_UTF16_TO_UTF8(E) \
static void PTRCALL \
E ## toUtf8(const ENCODING *enc, \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    if (hi == 0 && lo < 0x80) { \
      if (*toP == toLim) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = lo; \
    } \
    else if (hi < 0x8) { \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
    } \
    else if (hi >= 0xD8 && hi <= 0xDB) { \
      if (toLim - *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
    } \
    else { \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
    } \
  } \
  *fromP = from; \
}
605
/* Generate E##toUtf16, which assembles each 2-byte input unit from
   GET_HI/GET_LO and stores it as one unsigned short, until the input or
   the output runs out.  When the output is the shorter side and the
   last input unit has a high byte in 0xD8..0xDF, that unit is excluded
   from the copy. */
#define DEFINE_UTF16_TO_UTF16(E) \
static void PTRCALL \
E ## toUtf16(const ENCODING *enc, \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  while (*fromP != fromLim && *toP != toLim) { \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
    *fromP += 2; \
  } \
}
619
620#define SET2(ptr, ch) \
621 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
622#define GET_LO(ptr) ((unsigned char)(ptr)[0])
623#define GET_HI(ptr) ((unsigned char)(ptr)[1])
624
625DEFINE_UTF16_TO_UTF8(little2_)
626DEFINE_UTF16_TO_UTF16(little2_)
627
628#undef SET2
629#undef GET_LO
630#undef GET_HI
631
632#define SET2(ptr, ch) \
633 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
634#define GET_LO(ptr) ((unsigned char)(ptr)[1])
635#define GET_HI(ptr) ((unsigned char)(ptr)[0])
636
639
640#undef SET2
641#undef GET_LO
642#undef GET_HI
643
/* Helpers for 2-byte characters stored low byte first: (p)[0] is the low
   byte and (p)[1] the high byte.  All macro arguments are parenthesized
   so that pointer expressions and conditional expressions can be passed
   safely. */

/* Byte-type class: table lookup when the high byte is 0, otherwise
   classified by unicode_byte_type(high, low). */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is 0, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the character at p equals the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming-table lookups keyed by (high byte, low byte). */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
654
655#ifdef XML_MIN_SIZE
656
657static int PTRFASTCALL
658little2_byteType(const ENCODING *enc, const char *p)
659{
660 return LITTLE2_BYTE_TYPE(enc, p);
661}
662
663static int PTRFASTCALL
664little2_byteToAscii(const ENCODING *enc, const char *p)
665{
666 return LITTLE2_BYTE_TO_ASCII(enc, p);
667}
668
669static int PTRCALL
670little2_charMatches(const ENCODING *enc, const char *p, int c)
671{
672 return LITTLE2_CHAR_MATCHES(enc, p, c);
673}
674
675static int PTRFASTCALL
676little2_isNameMin(const ENCODING *enc, const char *p)
677{
679}
680
681static int PTRFASTCALL
682little2_isNmstrtMin(const ENCODING *enc, const char *p)
683{
685}
686
687#undef VTABLE
688#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
689
690#else /* not XML_MIN_SIZE */
691
692#undef PREFIX
693#define PREFIX(ident) little2_ ## ident
694#define MINBPC(enc) 2
695/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
696#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
697#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
698#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
699#define IS_NAME_CHAR(enc, p, n) 0
700#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
701#define IS_NMSTRT_CHAR(enc, p, n) (0)
702#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
703
704#define XML_TOK_IMPL_C
705#include "xmltok_impl.cc"
706#undef XML_TOK_IMPL_C
707
708#undef MINBPC
709#undef BYTE_TYPE
710#undef BYTE_TO_ASCII
711#undef CHAR_MATCHES
712#undef IS_NAME_CHAR
713#undef IS_NAME_CHAR_MINBPC
714#undef IS_NMSTRT_CHAR
715#undef IS_NMSTRT_CHAR_MINBPC
716#undef IS_INVALID_CHAR
717
718#endif /* not XML_MIN_SIZE */
719
720#ifdef XML_NS
721
722static const struct normal_encoding little2_encoding_ns = {
723 { VTABLE, 2, 0,
724#if BYTEORDER == 1234
725 1
726#else
727 0
728#endif
729 },
730 {
731#include "asciitab.h"
732#include "latin1tab.h"
733 },
734 STANDARD_VTABLE(little2_)
735};
736
737#endif
738
739static const struct normal_encoding little2_encoding = {
740 { VTABLE, 2, 0,
741#if BYTEORDER == 1234
742 1
743#else
744 0
745#endif
746 },
747 {
748#define BT_COLON BT_NMSTRT
749#include "asciitab.h"
750#undef BT_COLON
751#include "latin1tab.h"
752 },
753 STANDARD_VTABLE(little2_)
754};
755
756#if BYTEORDER != 4321
757
758#ifdef XML_NS
759
760static const struct normal_encoding internal_little2_encoding_ns = {
761 { VTABLE, 2, 0, 1 },
762 {
763#include "iasciitab.h"
764#include "latin1tab.h"
765 },
766 STANDARD_VTABLE(little2_)
767};
768
769#endif
770
771static const struct normal_encoding internal_little2_encoding = {
772 { VTABLE, 2, 0, 1 },
773 {
774#define BT_COLON BT_NMSTRT
775#include "iasciitab.h"
776#undef BT_COLON
777#include "latin1tab.h"
778 },
779 STANDARD_VTABLE(little2_)
780};
781
782#endif
783
784
/* Helpers for 2-byte characters stored high byte first: (p)[0] is the
   high byte and (p)[1] the low byte.  All macro arguments are
   parenthesized so that pointer expressions and conditional expressions
   can be passed safely. */

/* Byte-type class: table lookup when the high byte is 0, otherwise
   classified by unicode_byte_type(high, low). */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is 0, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the character at p equals the ASCII character c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Naming-table lookups keyed by (high byte, low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
795
796#ifdef XML_MIN_SIZE
797
798static int PTRFASTCALL
799big2_byteType(const ENCODING *enc, const char *p)
800{
801 return BIG2_BYTE_TYPE(enc, p);
802}
803
804static int PTRFASTCALL
805big2_byteToAscii(const ENCODING *enc, const char *p)
806{
807 return BIG2_BYTE_TO_ASCII(enc, p);
808}
809
810static int PTRCALL
811big2_charMatches(const ENCODING *enc, const char *p, int c)
812{
813 return BIG2_CHAR_MATCHES(enc, p, c);
814}
815
816static int PTRFASTCALL
817big2_isNameMin(const ENCODING *enc, const char *p)
818{
819 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
820}
821
822static int PTRFASTCALL
823big2_isNmstrtMin(const ENCODING *enc, const char *p)
824{
826}
827
828#undef VTABLE
829#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
830
831#else /* not XML_MIN_SIZE */
832
833#undef PREFIX
834#define PREFIX(ident) big2_ ## ident
835#define MINBPC(enc) 2
836/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
837#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
838#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
839#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
840#define IS_NAME_CHAR(enc, p, n) 0
841#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
842#define IS_NMSTRT_CHAR(enc, p, n) (0)
843#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
844
845#define XML_TOK_IMPL_C
846#include "xmltok_impl.cc"
847#undef XML_TOK_IMPL_C
848
849#undef MINBPC
850#undef BYTE_TYPE
851#undef BYTE_TO_ASCII
852#undef CHAR_MATCHES
853#undef IS_NAME_CHAR
854#undef IS_NAME_CHAR_MINBPC
855#undef IS_NMSTRT_CHAR
856#undef IS_NMSTRT_CHAR_MINBPC
857#undef IS_INVALID_CHAR
858
859#endif /* not XML_MIN_SIZE */
860
861#ifdef XML_NS
862
863static const struct normal_encoding big2_encoding_ns = {
864 { VTABLE, 2, 0,
865#if BYTEORDER == 4321
866 1
867#else
868 0
869#endif
870 },
871 {
872#include "asciitab.h"
873#include "latin1tab.h"
874 },
875 STANDARD_VTABLE(big2_)
876};
877
878#endif
879
880static const struct normal_encoding big2_encoding = {
881 { VTABLE, 2, 0,
882#if BYTEORDER == 4321
883 1
884#else
885 0
886#endif
887 },
888 {
889#define BT_COLON BT_NMSTRT
890#include "asciitab.h"
891#undef BT_COLON
892#include "latin1tab.h"
893 },
894 STANDARD_VTABLE(big2_)
895};
896
897#if BYTEORDER != 1234
898
899#ifdef XML_NS
900
901static const struct normal_encoding internal_big2_encoding_ns = {
902 { VTABLE, 2, 0, 1 },
903 {
904#include "iasciitab.h"
905#include "latin1tab.h"
906 },
907 STANDARD_VTABLE(big2_)
908};
909
910#endif
911
912static const struct normal_encoding internal_big2_encoding = {
913 { VTABLE, 2, 0, 1 },
914 {
915#define BT_COLON BT_NMSTRT
916#include "iasciitab.h"
917#undef BT_COLON
918#include "latin1tab.h"
919 },
920 STANDARD_VTABLE(big2_)
921};
922
923#endif
924
925#undef PREFIX
926
927static int FASTCALL
928streqci(const char *s1, const char *s2)
929{
930 for (;;) {
931 char c1 = *s1++;
932 char c2 = *s2++;
933 if (ASCII_a <= c1 && c1 <= ASCII_z)
934 c1 += ASCII_A - ASCII_a;
935 if (ASCII_a <= c2 && c2 <= ASCII_z)
936 c2 += ASCII_A - ASCII_a;
937 if (c1 != c2)
938 return 0;
939 if (!c1)
940 break;
941 }
942 return 1;
943}
944
945static void PTRCALL
946initUpdatePosition(const ENCODING *enc, const char *ptr,
947 const char *end, POSITION *pos)
948{
949 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
950}
951
952static int
953toAscii(const ENCODING *enc, const char *ptr, const char *end)
954{
955 char buf[1];
956 char *p = buf;
957 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
958 if (p == buf)
959 return -1;
960 else
961 return buf[0];
962}
963
964static int FASTCALL
965isSpace(int c)
966{
967 switch (c) {
968 case 0x20:
969 case 0xD:
970 case 0xA:
971 case 0x9:
972 return 1;
973 }
974 return 0;
975}
976
977/* Return 1 if there's just optional white space or there's an S
978 followed by name=val.
979*/
980static int
981parsePseudoAttribute(const ENCODING *enc,
982 const char *ptr,
983 const char *end,
984 const char **namePtr,
985 const char **nameEndPtr,
986 const char **valPtr,
987 const char **nextTokPtr)
988{
989 int c;
990 char open;
991 if (ptr == end) {
992 *namePtr = NULL;
993 return 1;
994 }
995 if (!isSpace(toAscii(enc, ptr, end))) {
996 *nextTokPtr = ptr;
997 return 0;
998 }
999 do {
1000 ptr += enc->minBytesPerChar;
1001 } while (isSpace(toAscii(enc, ptr, end)));
1002 if (ptr == end) {
1003 *namePtr = NULL;
1004 return 1;
1005 }
1006 *namePtr = ptr;
1007 for (;;) {
1008 c = toAscii(enc, ptr, end);
1009 if (c == -1) {
1010 *nextTokPtr = ptr;
1011 return 0;
1012 }
1013 if (c == ASCII_EQUALS) {
1014 *nameEndPtr = ptr;
1015 break;
1016 }
1017 if (isSpace(c)) {
1018 *nameEndPtr = ptr;
1019 do {
1020 ptr += enc->minBytesPerChar;
1021 } while (isSpace(c = toAscii(enc, ptr, end)));
1022 if (c != ASCII_EQUALS) {
1023 *nextTokPtr = ptr;
1024 return 0;
1025 }
1026 break;
1027 }
1028 ptr += enc->minBytesPerChar;
1029 }
1030 if (ptr == *namePtr) {
1031 *nextTokPtr = ptr;
1032 return 0;
1033 }
1034 ptr += enc->minBytesPerChar;
1035 c = toAscii(enc, ptr, end);
1036 while (isSpace(c)) {
1037 ptr += enc->minBytesPerChar;
1038 c = toAscii(enc, ptr, end);
1039 }
1040 if (c != ASCII_QUOT && c != ASCII_APOS) {
1041 *nextTokPtr = ptr;
1042 return 0;
1043 }
1044 open = (char)c;
1045 ptr += enc->minBytesPerChar;
1046 *valPtr = ptr;
1047 for (;; ptr += enc->minBytesPerChar) {
1048 c = toAscii(enc, ptr, end);
1049 if (c == open)
1050 break;
1051 if (!(ASCII_a <= c && c <= ASCII_z)
1052 && !(ASCII_A <= c && c <= ASCII_Z)
1053 && !(ASCII_0 <= c && c <= ASCII_9)
1054 && c != ASCII_PERIOD
1055 && c != ASCII_MINUS
1056 && c != ASCII_UNDERSCORE) {
1057 *nextTokPtr = ptr;
1058 return 0;
1059 }
1060 }
1061 *nextTokPtr = ptr + enc->minBytesPerChar;
1062 return 1;
1063}
1064
1065static const char KW_version[] = {
1067};
1068
1069static const char KW_encoding[] = {
1071};
1072
1073static const char KW_standalone[] = {
1075 ASCII_n, ASCII_e, '\0'
1076};
1077
1078static const char KW_yes[] = {
1079 ASCII_y, ASCII_e, ASCII_s, '\0'
1080};
1081
1082static const char KW_no[] = {
1083 ASCII_n, ASCII_o, '\0'
1084};
1085
1086static int
1087doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1088 const char *,
1089 const char *),
1090 int isGeneralTextEntity,
1091 const ENCODING *enc,
1092 const char *ptr,
1093 const char *end,
1094 const char **badPtr,
1095 const char **versionPtr,
1096 const char **versionEndPtr,
1097 const char **encodingName,
1098 const ENCODING **encoding,
1099 int *standalone)
1100{
1101 const char *val = NULL;
1102 const char *name = NULL;
1103 const char *nameEnd = NULL;
1104 ptr += 5 * enc->minBytesPerChar;
1105 end -= 2 * enc->minBytesPerChar;
1106 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1107 || !name) {
1108 *badPtr = ptr;
1109 return 0;
1110 }
1111 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1112 if (!isGeneralTextEntity) {
1113 *badPtr = name;
1114 return 0;
1115 }
1116 }
1117 else {
1118 if (versionPtr)
1119 *versionPtr = val;
1120 if (versionEndPtr)
1121 *versionEndPtr = ptr;
1122 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1123 *badPtr = ptr;
1124 return 0;
1125 }
1126 if (!name) {
1127 if (isGeneralTextEntity) {
1128 /* a TextDecl must have an EncodingDecl */
1129 *badPtr = ptr;
1130 return 0;
1131 }
1132 return 1;
1133 }
1134 }
1135 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1136 int c = toAscii(enc, val, end);
1137 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1138 *badPtr = val;
1139 return 0;
1140 }
1141 if (encodingName)
1142 *encodingName = val;
1143 if (encoding)
1144 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1145 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1146 *badPtr = ptr;
1147 return 0;
1148 }
1149 if (!name)
1150 return 1;
1151 }
1152 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1153 || isGeneralTextEntity) {
1154 *badPtr = name;
1155 return 0;
1156 }
1157 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1158 if (standalone)
1159 *standalone = 1;
1160 }
1161 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1162 if (standalone)
1163 *standalone = 0;
1164 }
1165 else {
1166 *badPtr = val;
1167 return 0;
1168 }
1169 while (isSpace(toAscii(enc, ptr, end)))
1170 ptr += enc->minBytesPerChar;
1171 if (ptr != end) {
1172 *badPtr = ptr;
1173 return 0;
1174 }
1175 return 1;
1176}
1177
1178static int FASTCALL
1179checkCharRefNumber(int result)
1180{
1181 switch (result >> 8) {
1182 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1183 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1184 return -1;
1185 case 0:
1186 if (latin1_encoding.type[result] == BT_NONXML)
1187 return -1;
1188 break;
1189 case 0xFF:
1190 if (result == 0xFFFE || result == 0xFFFF)
1191 return -1;
1192 break;
1193 }
1194 return result;
1195}
1196
1197int FASTCALL
1198XmlUtf8Encode(int c, char *buf)
1199{
1200 enum {
1201 /* minN is minimum legal resulting value for N byte sequence */
1202 min2 = 0x80,
1203 min3 = 0x800,
1204 min4 = 0x10000
1205 };
1206
1207 if (c < 0)
1208 return 0;
1209 if (c < min2) {
1210 buf[0] = (char)(c | UTF8_cval1);
1211 return 1;
1212 }
1213 if (c < min3) {
1214 buf[0] = (char)((c >> 6) | UTF8_cval2);
1215 buf[1] = (char)((c & 0x3f) | 0x80);
1216 return 2;
1217 }
1218 if (c < min4) {
1219 buf[0] = (char)((c >> 12) | UTF8_cval3);
1220 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1221 buf[2] = (char)((c & 0x3f) | 0x80);
1222 return 3;
1223 }
1224 if (c < 0x110000) {
1225 buf[0] = (char)((c >> 18) | UTF8_cval4);
1226 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1227 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1228 buf[3] = (char)((c & 0x3f) | 0x80);
1229 return 4;
1230 }
1231 return 0;
1232}
1233
1234int FASTCALL
1235XmlUtf16Encode(int charNum, unsigned short *buf)
1236{
1237 if (charNum < 0)
1238 return 0;
1239 if (charNum < 0x10000) {
1240 buf[0] = (unsigned short)charNum;
1241 return 1;
1242 }
1243 if (charNum < 0x110000) {
1244 charNum -= 0x10000;
1245 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1246 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1247 return 2;
1248 }
1249 return 0;
1250}
1251
1256 unsigned short utf16[256];
1257 char utf8[256][4];
1258};
1259
1260#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
1261
1262int
1264{
1265 return sizeof(struct unknown_encoding);
1266}
1267
1268static int PTRFASTCALL
1269unknown_isName(const ENCODING *enc, const char *p)
1270{
1271 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1272 int c = uenc->convert(uenc->userData, p);
1273 if (c & ~0xFFFF)
1274 return 0;
1275 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1276}
1277
1278static int PTRFASTCALL
1279unknown_isNmstrt(const ENCODING *enc, const char *p)
1280{
1281 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1282 int c = uenc->convert(uenc->userData, p);
1283 if (c & ~0xFFFF)
1284 return 0;
1285 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1286}
1287
1288static int PTRFASTCALL
1289unknown_isInvalid(const ENCODING *enc, const char *p)
1290{
1291 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1292 int c = uenc->convert(uenc->userData, p);
1293 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1294}
1295
1296static void PTRCALL
1297unknown_toUtf8(const ENCODING *enc,
1298 const char **fromP, const char *fromLim,
1299 char **toP, const char *toLim)
1300{
1301 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1302 char buf[XML_UTF8_ENCODE_MAX];
1303 for (;;) {
1304 const char *utf8;
1305 int n;
1306 if (*fromP == fromLim)
1307 break;
1308 utf8 = uenc->utf8[(unsigned char)**fromP];
1309 n = *utf8++;
1310 if (n == 0) {
1311 int c = uenc->convert(uenc->userData, *fromP);
1312 n = XmlUtf8Encode(c, buf);
1313 if (n > toLim - *toP)
1314 break;
1315 utf8 = buf;
1316 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1317 - (BT_LEAD2 - 2));
1318 }
1319 else {
1320 if (n > toLim - *toP)
1321 break;
1322 (*fromP)++;
1323 }
1324 do {
1325 *(*toP)++ = *utf8++;
1326 } while (--n != 0);
1327 }
1328}
1329
1330static void PTRCALL
1331unknown_toUtf16(const ENCODING *enc,
1332 const char **fromP, const char *fromLim,
1333 unsigned short **toP, const unsigned short *toLim)
1334{
1335 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336 while (*fromP != fromLim && *toP != toLim) {
1337 unsigned short c = uenc->utf16[(unsigned char)**fromP];
1338 if (c == 0) {
1339 c = (unsigned short)
1340 uenc->convert(uenc->userData, *fromP);
1341 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1342 - (BT_LEAD2 - 2));
1343 }
1344 else
1345 (*fromP)++;
1346 *(*toP)++ = c;
1347 }
1348}
1349
1350ENCODING *
1352 int *table,
1354 void *userData)
1355{
1356 int i;
1357 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1358 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1359 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1360 for (i = 0; i < 128; i++)
1361 if (latin1_encoding.type[i] != BT_OTHER
1362 && latin1_encoding.type[i] != BT_NONXML
1363 && table[i] != i)
1364 return 0;
1365 for (i = 0; i < 256; i++) {
1366 int c = table[i];
1367 if (c == -1) {
1368 e->normal.type[i] = BT_MALFORM;
1369 /* This shouldn't really get used. */
1370 e->utf16[i] = 0xFFFF;
1371 e->utf8[i][0] = 1;
1372 e->utf8[i][1] = 0;
1373 }
1374 else if (c < 0) {
1375 if (c < -4)
1376 return 0;
1377 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1378 e->utf8[i][0] = 0;
1379 e->utf16[i] = 0;
1380 }
1381 else if (c < 0x80) {
1382 if (latin1_encoding.type[c] != BT_OTHER
1383 && latin1_encoding.type[c] != BT_NONXML
1384 && c != i)
1385 return 0;
1386 e->normal.type[i] = latin1_encoding.type[c];
1387 e->utf8[i][0] = 1;
1388 e->utf8[i][1] = (char)c;
1389 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1390 }
1391 else if (checkCharRefNumber(c) < 0) {
1392 e->normal.type[i] = BT_NONXML;
1393 /* This shouldn't really get used. */
1394 e->utf16[i] = 0xFFFF;
1395 e->utf8[i][0] = 1;
1396 e->utf8[i][1] = 0;
1397 }
1398 else {
1399 if (c > 0xFFFF)
1400 return 0;
1401 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1402 e->normal.type[i] = BT_NMSTRT;
1403 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1404 e->normal.type[i] = BT_NAME;
1405 else
1406 e->normal.type[i] = BT_OTHER;
1407 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1408 e->utf16[i] = (unsigned short)c;
1409 }
1410 }
1411 e->userData = userData;
1412 e->convert = convert;
1413 if (convert) {
1414 e->normal.isName2 = unknown_isName;
1415 e->normal.isName3 = unknown_isName;
1416 e->normal.isName4 = unknown_isName;
1417 e->normal.isNmstrt2 = unknown_isNmstrt;
1418 e->normal.isNmstrt3 = unknown_isNmstrt;
1419 e->normal.isNmstrt4 = unknown_isNmstrt;
1420 e->normal.isInvalid2 = unknown_isInvalid;
1421 e->normal.isInvalid3 = unknown_isInvalid;
1422 e->normal.isInvalid4 = unknown_isInvalid;
1423 }
1424 e->normal.enc.utf8Convert = unknown_toUtf8;
1425 e->normal.enc.utf16Convert = unknown_toUtf16;
1426 return &(e->normal.enc);
1427}
1428
/* Indices of the built-in encodings.

   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed.  The values from ISO_8859_1_ENC onwards are
   used as indices into the encodingNames array in getEncodingIndex,
   so they must start at 0 and follow that array's order.
*/
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
1442
1443static const char KW_ISO_8859_1[] = {
1445 ASCII_MINUS, ASCII_1, '\0'
1446};
1447static const char KW_US_ASCII[] = {
1449 '\0'
1450};
1451static const char KW_UTF_8[] = {
1453};
1454static const char KW_UTF_16[] = {
1456};
1457static const char KW_UTF_16BE[] = {
1459 '\0'
1460};
1461static const char KW_UTF_16LE[] = {
1463 '\0'
1464};
1465
1466static int FASTCALL
1467getEncodingIndex(const char *name)
1468{
1469 static const char * const encodingNames[] = {
1470 KW_ISO_8859_1,
1471 KW_US_ASCII,
1472 KW_UTF_8,
1473 KW_UTF_16,
1474 KW_UTF_16BE,
1475 KW_UTF_16LE,
1476 };
1477 int i;
1478 if (name == NULL)
1479 return NO_ENC;
1480 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1481 if (streqci(name, encodingNames[i]))
1482 return i;
1483 return UNKNOWN_ENC;
1484}
1485
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
   stores i, narrowed to char.  The argument is parenthesized so that
   the cast applies to the whole expression passed in.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1492
1493/* This is what detects the encoding. encodingTable maps from
1494 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1495 the external (protocol) specified encoding; state is
1496 XML_CONTENT_STATE if we're parsing an external text entity, and
1497 XML_PROLOG_STATE otherwise.
1498*/
1499
1500
1501static int
1502initScan(const ENCODING * const *encodingTable,
1503 const INIT_ENCODING *enc,
1504 int state,
1505 const char *ptr,
1506 const char *end,
1507 const char **nextTokPtr)
1508{
1509 const ENCODING **encPtr;
1510
1511 if (ptr == end)
1512 return XML_TOK_NONE;
1513 encPtr = enc->encPtr;
1514 if (ptr + 1 == end) {
1515 /* only a single byte available for auto-detection */
1516#ifndef XML_DTD /* FIXME */
1517 /* a well-formed document entity must have more than one byte */
1518 if (state != XML_CONTENT_STATE)
1519 return XML_TOK_PARTIAL;
1520#endif
1521 /* so we're parsing an external text entity... */
1522 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1523 switch (INIT_ENC_INDEX(enc)) {
1524 case UTF_16_ENC:
1525 case UTF_16LE_ENC:
1526 case UTF_16BE_ENC:
1527 return XML_TOK_PARTIAL;
1528 }
1529 switch ((unsigned char)*ptr) {
1530 case 0xFE:
1531 case 0xFF:
1532 case 0xEF: /* possibly first byte of UTF-8 BOM */
1533 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1534 && state == XML_CONTENT_STATE)
1535 break;
1536 /* fall through */
1537 case 0x00:
1538 case 0x3C:
1539 return XML_TOK_PARTIAL;
1540 }
1541 }
1542 else {
1543 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1544 case 0xFEFF:
1545 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1546 && state == XML_CONTENT_STATE)
1547 break;
1548 *nextTokPtr = ptr + 2;
1549 *encPtr = encodingTable[UTF_16BE_ENC];
1550 return XML_TOK_BOM;
1551 /* 00 3C is handled in the default case */
1552 case 0x3C00:
1553 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1554 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1555 && state == XML_CONTENT_STATE)
1556 break;
1557 *encPtr = encodingTable[UTF_16LE_ENC];
1558 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1559 case 0xFFFE:
1560 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1561 && state == XML_CONTENT_STATE)
1562 break;
1563 *nextTokPtr = ptr + 2;
1564 *encPtr = encodingTable[UTF_16LE_ENC];
1565 return XML_TOK_BOM;
1566 case 0xEFBB:
1567 /* Maybe a UTF-8 BOM (EF BB BF) */
1568 /* If there's an explicitly specified (external) encoding
1569 of ISO-8859-1 or some flavour of UTF-16
1570 and this is an external text entity,
1571 don't look for the BOM,
1572 because it might be a legal data.
1573 */
1574 if (state == XML_CONTENT_STATE) {
1575 int e = INIT_ENC_INDEX(enc);
1576 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1577 || e == UTF_16LE_ENC || e == UTF_16_ENC)
1578 break;
1579 }
1580 if (ptr + 2 == end)
1581 return XML_TOK_PARTIAL;
1582 if ((unsigned char)ptr[2] == 0xBF) {
1583 *nextTokPtr = ptr + 3;
1584 *encPtr = encodingTable[UTF_8_ENC];
1585 return XML_TOK_BOM;
1586 }
1587 break;
1588 default:
1589 if (ptr[0] == '\0') {
1590 /* 0 isn't a legal data character. Furthermore a document
1591 entity can only start with ASCII characters. So the only
1592 way this can fail to be big-endian UTF-16 if it it's an
1593 external parsed general entity that's labelled as
1594 UTF-16LE.
1595 */
1596 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1597 break;
1598 *encPtr = encodingTable[UTF_16BE_ENC];
1599 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1600 }
1601 else if (ptr[1] == '\0') {
1602 /* We could recover here in the case:
1603 - parsing an external entity
1604 - second byte is 0
1605 - no externally specified encoding
1606 - no encoding declaration
1607 by assuming UTF-16LE. But we don't, because this would mean when
1608 presented just with a single byte, we couldn't reliably determine
1609 whether we needed further bytes.
1610 */
1611 if (state == XML_CONTENT_STATE)
1612 break;
1613 *encPtr = encodingTable[UTF_16LE_ENC];
1614 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1615 }
1616 break;
1617 }
1618 }
1619 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1620 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1621}
1622
1623
1624#define NS(x) x
1625#define ns(x) x
1626#define XML_TOK_NS_C
1627#include "xmltok_ns.cc"
1628#undef XML_TOK_NS_C
1629#undef NS
1630#undef ns
1631
#ifdef XML_NS

/* Include xmltok_ns.cc a second time; here NS() appends "NS" and ns()
   appends "_ns" to the names built through them.
*/
#define NS(x) x ## NS
#define ns(x) x ## _ns

#define XML_TOK_NS_C
#include "xmltok_ns.cc"
#undef XML_TOK_NS_C

#undef NS
#undef ns

/* Same as XmlInitUnknownEncoding (arguments are forwarded unchanged),
   except that on success the byte type of the ASCII colon is set to
   BT_COLON.  Returns 0 when XmlInitUnknownEncoding does.
*/
ENCODING *
XmlInitUnknownEncodingNS(void *mem,
                         int *table,
                         CONVERTER convert,
                         void *userData)
{
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  if (enc)
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  return enc;
}

#endif /* XML_NS */
#define ASCII_l
Definition: ascii.h:43
#define ASCII_i
Definition: ascii.h:40
#define ASCII_F
Definition: ascii.h:10
#define ASCII_o
Definition: ascii.h:46
#define ASCII_E
Definition: ascii.h:9
#define ASCII_C
Definition: ascii.h:7
#define ASCII_O
Definition: ascii.h:19
#define ASCII_Z
Definition: ascii.h:30
#define ASCII_n
Definition: ascii.h:45
#define ASCII_s
Definition: ascii.h:50
#define ASCII_UNDERSCORE
Definition: ascii.h:85
#define ASCII_t
Definition: ascii.h:51
#define ASCII_APOS
Definition: ascii.h:75
#define ASCII_c
Definition: ascii.h:34
#define ASCII_PERIOD
Definition: ascii.h:77
#define ASCII_5
Definition: ascii.h:64
#define ASCII_I
Definition: ascii.h:13
#define ASCII_A
Definition: ascii.h:5
#define ASCII_z
Definition: ascii.h:57
#define ASCII_U
Definition: ascii.h:25
#define ASCII_9
Definition: ascii.h:68
#define ASCII_e
Definition: ascii.h:36
#define ASCII_d
Definition: ascii.h:35
#define ASCII_8
Definition: ascii.h:67
#define ASCII_r
Definition: ascii.h:49
#define ASCII_y
Definition: ascii.h:56
#define ASCII_COLON
Definition: ascii.h:78
#define ASCII_0
Definition: ascii.h:59
#define ASCII_QUOT
Definition: ascii.h:73
#define ASCII_L
Definition: ascii.h:16
#define ASCII_1
Definition: ascii.h:60
#define ASCII_a
Definition: ascii.h:32
#define ASCII_6
Definition: ascii.h:65
#define ASCII_B
Definition: ascii.h:6
#define ASCII_S
Definition: ascii.h:23
#define ASCII_g
Definition: ascii.h:38
#define ASCII_EQUALS
Definition: ascii.h:81
#define ASCII_T
Definition: ascii.h:24
#define ASCII_MINUS
Definition: ascii.h:76
#define ASCII_v
Definition: ascii.h:53
BT_OTHER
Definition: asciitab.h:14
BT_NMSTRT
Definition: asciitab.h:22
BT_NAME
Definition: asciitab.h:16
BT_NONXML
Definition: asciitab.h:5
#define PTRFASTCALL
Definition: internal.h:56
#define FASTCALL
Definition: internal.h:48
#define PTRCALL
Definition: internal.h:52
const char * name(G4int ptype)
const ENCODING ** encPtr
Definition: xmltok.h:261
int minBytesPerChar
Definition: xmltok.h:169
const char const char const char * int(PTRFASTCALL *isNmstrt2)(const ENCODING *
const char const char const char const char * int(PTRFASTCALL *isNmstrt3)(const ENCODING *
int(PTRFASTCALL *isName2)(const ENCODING *
const char * int(PTRFASTCALL *isName3)(const ENCODING *
unsigned char type[256]
Definition: xmltok.cc:183
const char const char const char const char const char const char const char * int(PTRFASTCALL *isInvalid3)(const ENCODING *
const char const char const char const char const char * int(PTRFASTCALL *isNmstrt4)(const ENCODING *
const char const char const char const char const char const char const char const char * int(PTRFASTCALL *isInvalid4)(const ENCODING *
const char const char const char const char const char const char * int(PTRFASTCALL *isInvalid2)(const ENCODING *
const char const char * int(PTRFASTCALL *isName4)(const ENCODING *
ENCODING enc
Definition: xmltok.cc:182
unsigned short utf16[256]
Definition: xmltok.cc:1256
CONVERTER convert
Definition: xmltok.cc:1254
void * userData
Definition: xmltok.cc:1255
char utf8[256][4]
Definition: xmltok.cc:1257
struct normal_encoding normal
Definition: xmltok.cc:1253
BT_LEAD3
Definition: utf8tab.h:30
BT_LEAD4
Definition: utf8tab.h:34
BT_TRAIL
Definition: utf8tab.h:6
BT_MALFORM
Definition: utf8tab.h:37
BT_LEAD2
Definition: utf8tab.h:22
#define userData
Definition: xmlparse.cc:572
#define XmlInitUnknownEncodingNS
Definition: xmlparse.cc:57
#define BIG2_BYTE_TO_ASCII(enc, p)
Definition: xmltok.cc:789
#define STANDARD_VTABLE(E)
Definition: xmltok.cc:215
#define VTABLE1
Definition: xmltok.cc:37
#define UTF8_GET_NAMING3(pages, byte)
Definition: xmltok.cc:71
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:791
#define DEFINE_UTF16_TO_UTF8(E)
Definition: xmltok.cc:543
#define BIG2_BYTE_TYPE(enc, p)
Definition: xmltok.cc:785
#define NORMAL_VTABLE(E)
Definition: xmltok.cc:219
#define INIT_ENC_INDEX(enc)
Definition: xmltok.cc:1490
ENCODING * XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, void *userData)
Definition: xmltok.cc:1351
int XmlSizeOfUnknownEncoding(void)
Definition: xmltok.cc:1263
#define UTF8_INVALID2(p)
Definition: xmltok.cc:96
#define SB_BYTE_TYPE(enc, p)
Definition: xmltok.cc:247
#define VTABLE
Definition: xmltok.cc:51
#define LITTLE2_CHAR_MATCHES(enc, p, c)
Definition: xmltok.cc:649
@ NO_ENC
Definition: xmltok.cc:1440
@ US_ASCII_ENC
Definition: xmltok.cc:1434
@ ISO_8859_1_ENC
Definition: xmltok.cc:1433
@ UTF_8_ENC
Definition: xmltok.cc:1435
@ UTF_16_ENC
Definition: xmltok.cc:1436
@ UNKNOWN_ENC
Definition: xmltok.cc:1432
@ UTF_16BE_ENC
Definition: xmltok.cc:1437
@ UTF_16LE_ENC
Definition: xmltok.cc:1438
@ UTF8_cval4
Definition: xmltok.cc:323
@ UTF8_cval1
Definition: xmltok.cc:320
@ UTF8_cval2
Definition: xmltok.cc:321
@ UTF8_cval3
Definition: xmltok.cc:322
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:793
#define LITTLE2_BYTE_TYPE(enc, p)
Definition: xmltok.cc:644
#define BT_COLON
#define UTF8_INVALID4(p)
Definition: xmltok.cc:116
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:650
#define AS_UNKNOWN_ENCODING(enc)
Definition: xmltok.cc:1260
#define UCS2_GET_NAMING(pages, hi, lo)
Definition: xmltok.cc:53
#define UTF8_GET_NAMING2(pages, byte)
Definition: xmltok.cc:60
int FASTCALL XmlUtf16Encode(int charNum, unsigned short *buf)
Definition: xmltok.cc:1235
#define LITTLE2_BYTE_TO_ASCII(enc, p)
Definition: xmltok.cc:648
#define UTF8_INVALID3(p)
Definition: xmltok.cc:99
#define DEFINE_UTF16_TO_UTF16(E)
Definition: xmltok.cc:606
#define BIG2_CHAR_MATCHES(enc, p, c)
Definition: xmltok.cc:790
#define AS_NORMAL_ENCODING(enc)
Definition: xmltok.cc:202
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:652
int FASTCALL XmlUtf8Encode(int c, char *buf)
Definition: xmltok.cc:1198
#define XML_CONTENT_STATE
Definition: xmltok.h:95
#define XmlUtf8Convert(enc, fromP, fromLim, toP, toLim)
Definition: xmltok.h:253
#define XML_UTF8_ENCODE_MAX
Definition: xmltok.h:106
#define XML_TOK_PARTIAL
Definition: xmltok.h:20
#define XmlNameMatchesAscii(enc, ptr1, end1, ptr2)
Definition: xmltok.h:229
int(XMLCALL * CONVERTER)(void *userData, const char *p)
Definition: xmltok.h:283
#define XML_TOK_NONE
Definition: xmltok.h:17
#define XmlTok(enc, state, ptr, end, nextTokPtr)
Definition: xmltok.h:196
#define XML_TOK_BOM
Definition: xmltok.h:43
@ BT_NONASCII
Definition: xmltok_impl.h:36