domfs

Document Object Model as a filesystem for plan9 os
git clone git://nsmpr.xyz/domfs.git
Log | Files | Refs | README

tok.c (31763B)


      1 #include <u.h>
      2 #include <libc.h>
      3 #include <String.h>
      4 #include <thread.h>
      5 
      6 #include "html5dom.h"
      7 #include "ncref.h"
      8 
      9 #define ALPHA(x) ((x >=0x41) && (x <= 0x7a))
     10 #define DIGIT(x) ((x >=0x30) && (x <= 0x39))
     11 
     12 Channel *outchannel;
     13 
     14 int gc(void);
     15 
     16 
     17 
     18 Token*
     19 eoftok(void)
     20 {
     21 	Token *t;
     22 	t = mallocz(sizeof(Token), 1);
     23 	t->type = TEOF;
     24 	return t;
     25 }
     26 
     27 Token*
     28 chartok(Rune c)
     29 {
     30 	Token *t;
     31 	t = mallocz(sizeof(Token), 1);
     32 	t->c = c;
     33 	t->type = TCHAR;
     34 	return t;
     35 }
     36 
     37 Token*
     38 newtok(int type)
     39 {
     40 	Token *nt;
     41 	nt = mallocz(sizeof(Token), 1);
     42 	nt->type = type;
     43 	nt->name = s_new();
     44 	nt->attr = nil;
     45 	return nt;
     46 }
     47 
     48 void
     49 t_free(Token *t)
     50 {
     51 	s_free(t->name);
     52 	free(t);
     53 }
     54 
     55 
     56 Attr*
     57 tnewattr(Token *t)
     58 {
     59 	int n;
     60 	if (t->attr == nil) t->attr = mallocz(sizeof(Attr*), 1);
     61 	for (n=0; (t->attr)[n] != nil; n++);
     62 	t->attr = realloc(t->attr, (n + 1) * sizeof(Attr*));
     63 	t->attr[n-1] = mallocz(sizeof(Attr), 1);
     64 	t->attr[n-1]->name = s_new();
     65 	t->attr[n-1]->value = s_new();
     66 	t->attr[n] = nil;
     67 	return t->attr[n-1];
     68 }
     69 
     70 void
     71 attr_free(Attr *attr)
     72 {
     73 	s_free(attr->name);
     74 	s_free(attr->value);
     75 	free(attr);
     76 }
     77 
     78 u32int insertion_mode = IMinitial;
     79 
     80 /* Tokenizer vars and funcs */
     81 
     82 Rune tc;
     83 int treconsume = 0;
     84 int teof;
     85 
     86 Token *ctoken;
     87 Attr *cattr;
     88 String *ctempbuf;
     89 String *clookaheadbuf;
     90 
     91 void tconsume(void);
     92 void temit(Token*);
     93 void temitbuf(String*);
     94 int talpha(int);
     95 
     96 void tsdata(void);
     97 void tsrcdt(void);
     98 void tsrawt(void);
     99 void tsscript(void);
    100 void tsptxt(void);
    101 void tstagopen(void);
    102 void tsetagopen(void);
    103 void tstagname(void);
    104 void tsrcdtless(void);
    105 void tsrcdtendopen(void);
    106 void tsrcdtendname(void);
    107 void tsrawtless(void);
    108 void tsrawtendopen(void);
    109 void tsrawtendname(void);
    110 void tsscriptless(void);
    111 void tsscriptendopen(void);
    112 void tsscriptendname(void);
    113 
    114 void tsscriptescstart(void);
    115 void tsscriptescstartdash(void);
    116 void tsscriptesc(void);
    117 void tsscriptescdash(void);
    118 void tsscriptescddash(void);
    119 void tsscriptescless(void);
    120 void tsscriptescendopen(void);
    121 void tsscriptescendname(void);
    122 void tsscriptdescstart(void);
    123 void tsscriptdesc(void);
    124 void tsscriptdescdash(void);
    125 void tsscriptdescddash(void);
    126 void tsscriptdescless(void);
    127 void tsscriptdescend(void);
    128 
    129 void tsanamebefore(void);
    130 void tsaname(void);
    131 void tsanameafter(void);
    132 void tsavalbefore(void);
    133 void tsavaldq(void);
    134 void tsavalsq(void);
    135 void tsavaluq(void);
    136 void tsavalafter(void);
    137 void tsscstag(void);
    138 void tsboguscomment(void);
    139 void tsmkupopen(void);
    140 void tscommentstart(void);
    141 void tscommentstartdash(void);
    142 void tscomment(void);
    143 void tscommentless(void);
    144 void tscommentlessbang(void);
    145 void tscommentlessbangdash(void);
    146 void tscommentlessbangddash(void);
    147 void tscommentenddash(void);
    148 void tscommentend(void);
    149 void tscommentendbang(void);
    150 void tsdoct(void);
    151 void tsdoctbefore(void);
    152 void tsdoctname(void);
    153 void tsdoctnameafter(void);
    154 void tsdoctpubkafter(void);
    155 void tsdoctpubidbefore(void);
    156 void tsdoctpubiddq(void);
    157 void tsdoctpubidsq(void);
    158 void tsdoctpubidafter(void);
    159 void tsdoctbetween(void);
    160 void tsdoctsyskafter(void);
    161 void tsdoctsysidbefore(void);
    162 void tsdoctsysiddq(void);
    163 void tsdoctsysidsq(void);
    164 void tsdoctsysidafter(void);
    165 void tsdoctbogus(void);
    166 void tscdat(void);
    167 void tscdatbrk(void);
    168 void tscdatend(void);
    169 void tscref(void);
    170 void tsncref(void);
    171 void tsamam(void);
    172 void tsnumref(void);
    173 void tshexrefstart(void);
    174 void tsdecrefstart(void);
    175 void tshexref(void);
    176 void tsdecref(void);
    177 void tsnumrefend(void);
    178 
    179 
    180 #define REPCHAR Runeerror  /* replacement character */
    181 
    182 enum {
    183 	TSDATA,                  /* data                                      */
    184 	TSRCDT,                  /* RCDATA                                    */
    185 	TSRAWT,                  /* RAWTEXT                                   */
    186 	TSSCRIPT,                /* script data                               */
    187 	TSPTXT,                  /* PLAINTEXT                                 */
    188 	TSTAG_OPEN,              /* tag open                                  */
    189 	TSETAG_OPEN,             /* end tag open                              */
    190 	TSTAG_NAME,              /* tag name                                  */
    191 	TSRCDT_LESS,             /* RCDATA less-than sign                     */
    192 	TSRCDT_END_OPEN,         /* RCDATA end tag open                       */
    193 	TSRCDT_END_NAME,         /* RCDATA end tag name                       */
    194 	TSRAWT_LESS,             /* RAWTEXT less-than sign                    */
    195 	TSRAWT_END_OPEN,         /* RAWTEXT end tag open                      */
    196 	TSRAWT_END_NAME,         /* RAWTEXT end tag name                      */
    197 	TSSCRIPT_LESS,           /* script data less-than sign                */
    198 	TSSCRIPT_END_OPEN,       /* script data end tag open                  */
    199 	TSSCIRPT_END_NAME,       /* script data end tag name                  */
    200 	TSSCRIPT_ESC_START,      /* scirpt data escape start                  */
    201 	TSSCRIPT_ESC_START_DASH, /* scirpt data escape start dash             */
    202 	TSSCRIPT_ESC,            /* scirpt data escaped                       */
    203 	TSSCRIPT_ESC_DASH,       /* scirpt data escaped dash                  */
    204 
    205 	TSSCRIPT_ESC_DDASH,      /* scirpt data escaped dash dash             */
    206 	TSSCRIPT_ESC_LESS,       /* scirpt data escaped less-than sign        */
    207 	TSSCRIPT_ESC_END_OPEN,   /* scirpt data escaped end tag open          */
    208 	TSSCRIPT_ESC_END_NAME,   /* scirpt data escaped end tag name          */
    209 	TSSCRIPT_DESC_START,     /* scirpt data double escape start           */
    210 	TSSCRIPT_DESC,           /* scirpt data double escaped                */
    211 	TSSCRIPT_DESC_DASH,      /* scirpt data double escaped dash           */
    212 	TSSCRIPT_DESC_DDASH,     /* scirpt data double escaped dash dash      */
    213 	TSSCRIPT_DESC_LESS,      /* scirpt data double escaped less-than sign */
    214 	TSSCRIPT_DESC_END,       /* scirpt data double escape end             */
    215 
    216 	TSANAME_BEFORE,          /* Before attribute name           */
    217 	TSANAME,                 /* Attribute name                  */
    218 	TSANAME_AFTER,           /* After attribute name            */
    219 	TSAVAL_BEFORE,          /* Before attribute value          */
    220 	TSAVAL_DQ,               /* Attribute value (double-quoted) */
    221 	TSAVAL_SQ,               /* Attribute value (single-quoted) */
    222 	TSAVAL_UQ,               /* Attribute value (unquoted)      */
    223 	TSAVAL_AFTER,            /* After attribute value (quoted)  */
    224 	
    225 	TSSCSTAG,        /* Self-closing start tag  */
    226 	TSBOGUS_COMMENT, /* Bogus comment           */
    227 	TSMKUP_OPEN,     /* Markup declaration open */
    228 	
    229 	TSCOMMENT_START,           /* Comment start                         */
    230 	TSCOMMENT_START_DASH,      /* Comment start dash                    */
    231 	TSCOMMENT,                 /* Comment                               */
    232 	TSCOMMENT_LESS,            /* Comment less-than sign                */
    233 	TSCOMMENT_LESS_BANG,       /* Comment less-than sign bang           */
    234 	TSCOMMENT_LESS_BANG_DASH,  /* Comment less-than sign bang dash      */
    235 	TSCOMMENT_LESS_BANG_DDASH, /* Comment less-than sign bang dash dash */
    236 	TSCOMMENT_END_DASH,        /* Comment end dash                      */
    237 	TSCOMMENT_END,             /* Comment end                           */
    238 	TSCOMMENT_END_BANG,        /* Comment end bang                      */
    239 	
    240 	TSDOCT,              /* DOCTYPE                                       */
    241 	TSDOCT_BEFORE,       /* Before DOCTYPE name                           */
    242 	TSDOCT_NAME,         /* DOCTYPE name                                  */
    243 	TSDOCT_NAME_AFTER,   /* After DOCTYPE name                            */
    244 	TSDOCT_PUBK_AFTER,   /* After DOCTYPE public keyword                  */
    245 	TSDOCT_PUBID_BEFORE, /* Before DOCTYPE public identifier              */
    246 	TSDOCT_PUBID_DQ,     /* DOCTYPE public identifier (double-quoted)     */
    247 	TSDOCT_PUBID_SQ,     /* DOCTYPE public identifier (single-quoted)     */
    248 	TSDOCT_PUBID_AFTER,  /* After DOCTYPE public identifier               */
    249 	TSDOCT_BETWEEN,      /* Between DOCTYPE public and system identifiers */
    250 	TSDOCT_SYSK_AFTER,   /* After DOCTYPE system keyword                  */
    251 	TSDOCT_SYSID_BEFORE, /* Before DOCTYPE system identifier              */
    252 	TSDOCT_SYSID_DQ,     /* DOCTYPE system identifier (double-quoted)     */
    253 	TSDOCT_SYSID_SQ,     /* DOCTYPE system identifier (single-quoted)     */
    254 	TSDOCT_SYSID_AFTER,  /* After DOCTYPE system identifier               */
    255 	TSDOCT_BOGUS,        /* Bogus DOCTYPE                                 */
    256 	
    257 	TSCDAT,     /* CDATA section         */
    258 	TSCDAT_BRK, /* CDATA section bracket */
    259 	TSCDAT_END, /* CDATA section end     */
    260 	
    261 	TSCREF,         /* Character reference                   */
    262 	TSNCREF,        /* Named character reference             */
    263 	TSAMAM,         /* Ambiguous ampersand                   */
    264 	TSNUMREF,       /* Numeric character reference           */
    265 	TSHEXREF_START, /* Hexadecimal character reference start */
    266 	TSDECREF_START, /* Decimal character reference start     */
    267 	TSHEXREF,       /* Hexadecimal character reference       */
    268 	TSDECREF,       /* Decimal character reference           */
    269 	TSNUMREF_END,   /* Numeric character reference end       */
    270 	
    271 	TMAX,
    272 };
    273 
    274 void (*tstab[])(void) = {
    275 	[TSDATA]            = tsdata,
    276 	[TSRCDT]            = tsrcdt,
    277 	[TSRAWT]            = tsrawt,
    278 	[TSSCRIPT]          = tsscript,
    279 	[TSPTXT]            = tsptxt,
    280 	[TSTAG_OPEN]        = tstagopen,
    281 	[TSETAG_OPEN]       = tsetagopen,
    282 	[TSTAG_NAME]        = tstagname,
    283 	[TSRCDT_LESS]       = tsrcdtless,
    284 	[TSRCDT_END_OPEN]   = tsrcdtendopen,
    285 	[TSRCDT_END_NAME]   = tsrcdtendname,
    286 	[TSRAWT_LESS]       = tsrawtless,
    287 	[TSRAWT_END_OPEN]   = tsrawtendopen,
    288 	[TSSCRIPT_LESS]     = tsscriptless,
    289 	[TSSCRIPT_END_OPEN] = tsscriptendopen,
    290 	[TSSCIRPT_END_NAME] = tsscriptendname,
    291 	[TSSCRIPT_ESC_START] = tsscriptesc,
    292 	[TSSCRIPT_ESC_START_DASH] = tsscriptesc,
    293 	[TSSCRIPT_ESC] = tsscriptesc,
    294 	[TSSCRIPT_ESC_DASH] = tsscriptescdash,
    295 	[TSSCRIPT_ESC_DDASH] = tsscriptescddash,
    296 	[TSSCRIPT_ESC_LESS] = tsscriptescless,
    297 	[TSSCRIPT_ESC_END_OPEN] = tsscriptescendopen,
    298 	[TSSCRIPT_ESC_END_NAME] = tsscriptescendname,
    299 	[TSSCRIPT_DESC_START] = tsscriptdescstart,
    300 	[TSSCRIPT_DESC] = tsscriptdesc,
    301 	[TSSCRIPT_DESC_DASH] = tsscriptdescdash,
    302 	[TSSCRIPT_DESC_DDASH] = tsscriptdescddash,
    303 	[TSSCRIPT_DESC_LESS] = tsscriptdescless,
    304 	[TSSCRIPT_DESC_END] = tsscriptdescend,
    305 
    306 	[TSANAME_BEFORE] = tsanamebefore,
    307 	[TSANAME]        = tsaname,
    308 	[TSANAME_AFTER] = tsanameafter,
    309 	[TSAVAL_BEFORE] = tsavalbefore,
    310 	[TSAVAL_DQ] = tsavaldq,
    311 	[TSAVAL_SQ] = tsavalsq,
    312 	[TSAVAL_UQ] = tsavaluq,
    313 	[TSAVAL_AFTER] = tsavalafter,
    314 	[TSSCSTAG] = tsscstag,
    315 	[TSBOGUS_COMMENT] = tsboguscomment,
    316 	[TSMKUP_OPEN] = tsmkupopen,
    317 	[TSCOMMENT_START] = tscommentstart,
    318 	[TSCOMMENT_START_DASH] = tscommentstartdash,
    319 	[TSCOMMENT] = tscomment,
    320 	[TSCOMMENT_LESS] = tscommentless,
    321 	[TSCOMMENT_LESS_BANG] = tscommentlessbang,
    322 	[TSCOMMENT_LESS_BANG_DASH] = tscommentlessbangdash,
    323 	[TSCOMMENT_LESS_BANG_DDASH] = tscommentlessbangddash,
    324 	[TSCOMMENT_END_DASH] = tscommentenddash,
    325 	[TSCOMMENT_END] = tscommentend,
    326 	[TSCOMMENT_END_BANG] = tscommentendbang,
    327 	[TSDOCT] = tsdoct,
    328 	[TSDOCT_BEFORE] = tsdoctbefore,
    329 	[TSDOCT_NAME] = tsdoctname,
    330 	[TSDOCT_NAME_AFTER] = tsdoctnameafter,
    331 	[TSDOCT_PUBK_AFTER] = tsdoctpubkafter,
    332 	[TSDOCT_PUBID_BEFORE] = tsdoctpubidbefore,
    333 	[TSDOCT_PUBID_DQ] = tsdoctpubiddq,
    334 	[TSDOCT_PUBID_SQ] = tsdoctpubidsq,
    335 	[TSDOCT_PUBID_AFTER] = tsdoctpubidafter,
    336 	[TSDOCT_BETWEEN] = tsdoctbetween,
    337 	[TSDOCT_SYSK_AFTER] = tsdoctsyskafter,
    338 	[TSDOCT_SYSID_BEFORE] = tsdoctsysidbefore,
    339 	[TSDOCT_SYSID_DQ] = tsdoctsysiddq,
    340 	[TSDOCT_SYSID_SQ] = tsdoctsysidsq,
    341 	[TSDOCT_SYSID_AFTER] = tsdoctsysidafter,
    342 	[TSDOCT_BOGUS] = tsdoctbogus,
    343 	[TSCDAT] = tscdat,
    344 	[TSCDAT_BRK] = tscdatbrk,
    345 	[TSCDAT_END] = tscdatend,
    346 	[TSCREF] = tscref,
    347 	[TSNCREF] = tsncref,
    348 	[TSAMAM] = tsamam,
    349 	[TSNUMREF] = tsnumref,
    350 	[TSHEXREF_START] = tshexrefstart,
    351 	[TSDECREF_START] = tsdecrefstart,
    352 	[TSHEXREF] = tshexref,
    353 	[TSDECREF] = tsdecref,
    354 	[TSNUMREF_END] = tsnumrefend,
    355 };
    356 
    357 int tstate = TSDATA;
    358 int treturn = -1;
    359 
    360 void
    361 tsanamebefore(void)
    362 {
    363 	switch (tc) {
    364 	case '\t':
    365 	case '\n':
    366 	case '\r':
    367 	case ' ':
    368 		break;
    369 	case '/':
    370 	case '>':
    371 	case  -1:
    372 		treconsume = 1;
    373 		tstate = TSANAME_AFTER;
    374 		break;
    375 	case '=':
    376 		fprint(2, "unexpected equals sign before attribute name parse error, tc='%c'\n", tc);
    377 		cattr = tnewattr(ctoken);
    378 		s_putc(cattr->name, tc);
    379 		tstate = TSANAME;
    380 		break;
    381 	default:
    382 		cattr = tnewattr(ctoken);
    383 		treconsume = 1;
    384 		tstate = TSANAME;
    385 	}
    386 }
    387 
    388 void
    389 tsaname(void)
    390 {
    391 	if (ALPHA(tc) != 0) {
    392 		if (tc < 'a') tc += 0x20;
    393 	}
    394 	switch (tc) {
    395 	case '\t':
    396 	case '\n':
    397 	case '\r':
    398 	case ' ':
    399 	case '/':
    400 	case '>':
    401 	case -1:
    402 		treconsume = 1;
    403 		s_terminate(cattr->name);
    404 		tstate = TSANAME_AFTER;
    405 		break;
    406 	case '=':
    407 		tstate = TSAVAL_BEFORE;
    408 		break;
    409 	case '\0':
    410 		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
    411 		s_putc(cattr->name, REPCHAR);
    412 		break;
    413 	case '"':
    414 	case '\'':
    415 	case '<':
    416 		fprint(2, "unexpected character in attribute name parse error, tc='%c'\n", tc);
    417 	default:
    418 		s_putc(cattr->name, tc);
    419 	}
    420 	/* TODO check for duplicate attribute names on leaving or emitting */
    421 }
    422 
    423 void
    424 tsanameafter(void)
    425 {
    426 	switch (tc) {
    427 	case '\t':
    428 	case '\n':
    429 	case '\r':
    430 	case ' ':
    431 		break;
    432 	case '/':
    433 		tstate = TSSCSTAG;
    434 		break;
    435 	case '=':
    436 		tstate = TSAVAL_BEFORE;
    437 		break;
    438 	case '>':
    439 		tstate = TSDATA;
    440 		s_terminate(ctoken->name);
    441 		temit(ctoken);
    442 		break;
    443 	case -1: /* EOF */
    444 		fprint(2, "eof in tag parse error\n");
    445 		temit(eoftok());
    446 		break;
    447 	default:
    448 		cattr = tnewattr(ctoken);
    449 		treconsume = 1;
    450 		tstate = TSANAME;
    451 	}
    452 }
    453 
    454 void
    455 tsavalbefore(void)
    456 {
    457 	switch (tc) {
    458 	case '\t':
    459 	case '\n':
    460 	case '\r':
    461 	case ' ':
    462 		break;
    463 	case '"':
    464 		tstate = TSAVAL_DQ;
    465 		break;
    466 	case '\'':
    467 		tstate = TSAVAL_SQ;
    468 		break;
    469 	case '>':
    470 		fprint(2, "missing attribute value parse error\n");
    471 		s_terminate(ctoken->name);
    472 		temit(ctoken);
    473 		tstate = TSDATA;
    474 		break;
    475 	default:
    476 		treconsume = 1;
    477 		tstate = TSAVAL_UQ;
    478 	}
    479 }
    480 
    481 void
    482 tsavaldq(void)
    483 {
    484 	switch (tc) {
    485 	case '"':
    486 		tstate = TSAVAL_AFTER;
    487 		break;
    488 	case '&':
    489 		treturn = TSAVAL_DQ;
    490 		tstate = TSCREF;
    491 		break;
    492 	case '\0':
    493 		fprint(2, "unexpected null character parse error\n");
    494 		s_putc(cattr->value, REPCHAR);
    495 		break;
    496 	case -1: /* EOF */
    497 		fprint(2, "oef in tag parse error\n");
    498 		temit(eoftok());
    499 		break;
    500 	default:
    501 		s_putc(cattr->value, tc);
    502 	}		
    503 }
    504 
    505 void
    506 tsavalsq(void)
    507 {
    508 	switch (tc) {
    509 	case '\'':
    510 		tstate = TSAVAL_AFTER;
    511 		break;
    512 	case '&':
    513 		treturn = TSAVAL_SQ;
    514 		tstate = TSCREF;
    515 		break;
    516 	case '\0':
    517 		fprint(2, "unexpected null character parse error\n");
    518 		s_putc(cattr->value, REPCHAR);
    519 		break;
    520 	case -1: /* EOF */
    521 		fprint(2, "oef in tag parse error\n");
    522 		temit(eoftok());
    523 		break;
    524 	default:
    525 		s_putc(cattr->value, tc);
    526 	}
    527 }
    528 
    529 void
    530 tsavaluq(void)
    531 {
    532 	switch (tc) {
    533 	case '\t':
    534 	case '\n':
    535 	case '\r':
    536 	case ' ':
    537 		s_terminate(cattr->value);
    538 		tstate = TSANAME_BEFORE;
    539 		break;
    540 	case '&':
    541 		treturn = TSAVAL_UQ;
    542 		tstate = TSCREF;
    543 		break;
    544 	case '>':
    545 		s_terminate(ctoken->name);
    546 		s_terminate(cattr->value);
    547 		tstate = TSDATA;
    548 		break;
    549 	case '\0':
    550 		fprint(2, "unexpected null character parse error\n");
    551 		s_putc(cattr->value, REPCHAR);
    552 		break;
    553 	case -1: /* EOF */
    554 		fprint(2, "oef in tag parse error\n");
    555 		temit(eoftok());
    556 		break;	case '"':
    557 	case '\'':
    558 	case '<':
    559 	case '=':
    560 	case '`':
    561 		fprint(2, "unexpected character in unquoted attribute value parse error\n");
    562 	default:
    563 		s_putc(cattr->value, tc);
    564 	}
    565 }
    566 
    567 void
    568 tsavalafter(void)
    569 {
    570 	switch (tc) {
    571 	case '\t':
    572 	case '\n':
    573 	case '\r':
    574 	case ' ':
    575 		tstate = TSANAME_BEFORE;
    576 		break;
    577 	case '/':
    578 		ctoken->flags |=  TSSCSTAG;
    579 		break;
    580 	case '>':
    581 		s_terminate(ctoken->name);
    582 		s_terminate(cattr->value);
    583 		temit(ctoken);
    584 		tstate = TSDATA;
    585 		break;
    586 	case -1: /* EOF */
    587 		fprint(2, "eof in tag parse error\n");
    588 		temit(eoftok());
    589 		break;
    590 	default:
    591 		fprint(2, "missing whitespace between attributes parse error\n");
    592 		treconsume = 1;
    593 		tstate = TSANAME_BEFORE;
    594 	}
    595 }
    596 
    597 void
    598 tsscstag(void)
    599 {
    600 	switch (tc) {
    601 	case '>':
    602 		ctoken->flags |= TF_SELF_CLOSING;
    603 		tstate = TSDATA;
    604 		temit(ctoken);
    605 		break;
    606 	case -1:
    607 		fprint(2, "eof in tag parse error\n");
    608 		temit(eoftok());
    609 		break;
    610 	default:
    611 		fprint(2, "unxpected solidus in tag parse error\n");
    612 		treconsume = 1;
    613 		tstate = TSANAME_BEFORE;
    614 	}
    615 }
    616 
    617 void
    618 tsboguscomment(void)
    619 {
    620 	fprint(2, "tsboguscomment not implemented\n");
    621 	tstate = TSDATA;
    622 }
    623 
    624 void
    625 tsmkupopen(void)
    626 {
    627 	int i;
    628 	String *mbuf, *lowered;
    629 	mbuf = s_new();
    630 	s_putc(mbuf, tc);
    631 	tconsume();
    632 	s_putc(mbuf, tc);
    633 	if (strncmp(s_to_c(mbuf), "--", 2) == 0) {
    634 		ctoken = newtok(TCOMM);
    635 		tstate = TSCOMMENT_START;
    636 		s_free(mbuf);
    637 		return;
    638 	}
    639 	for (i = 0; i < 5; i++) {
    640 		tconsume();
    641 		s_putc(mbuf, tc);
    642 	}
    643 	if (strncmp(s_to_c(mbuf), "[CDATA[", 7) == 0) {
    644 		/* TODO: check if adjusted current node */
    645 		tstate = TSCDAT;
    646 		s_free(mbuf);
    647 		return;
    648 	}
    649 	lowered = s_copy(s_to_c(mbuf));
    650 	s_tolower(lowered);
    651 	if (strncmp(s_to_c(lowered), "doctype", 7) == 0) {
    652 		tstate = TSDOCT;
    653 		s_free(mbuf);
    654 		s_free(lowered);
    655 		return;
    656 	}
    657 	fprint(2, "incorrectly opened comment parse error, tc='%c'\n", tc);
    658 	ctoken = newtok(TCOMM);
    659 	tstate = TSBOGUS_COMMENT;
    660 	s_append(clookaheadbuf, s_to_c(mbuf));
    661 	s_free(lowered);
    662 	s_free(mbuf);
    663 }
    664 
    665 void
    666 tscommentstart(void)
    667 {
    668 	fprint(2, "tscommentstart not implemented\n");
    669 	tstate = TSDATA;
    670 }
    671 
    672 void
    673 tscommentstartdash(void)
    674 {
    675 	fprint(2, "tscommentstartdash not implemented\n");
    676 	tstate = TSDATA;
    677 }
    678 
    679 void
    680 tscomment(void)
    681 {
    682 	fprint(2, "tscomment not implemented\n");
    683 	tstate = TSDATA;
    684 }
    685 
    686 void
    687 tscommentless(void)
    688 {
    689 	fprint(2, "tscommentless not implemented\n");
    690 	tstate = TSDATA;
    691 }
    692 
    693 void
    694 tscommentlessbang(void)
    695 {
    696 	fprint(2, "tscommentlessbang not implemented\n");
    697 	tstate = TSDATA;
    698 }
    699 
    700 void
    701 tscommentlessbangdash(void)
    702 {
    703 	fprint(2, "tscommentlessbangdash not implemented\n");
    704 	tstate = TSDATA;
    705 }
    706 
    707 void
    708 tscommentlessbangddash(void)
    709 {
    710 	fprint(2, "tscommentlessbangddash not implemented\n");
    711 	tstate = TSDATA;
    712 }
    713 
    714 void
    715 tscommentenddash(void)
    716 {
    717 	fprint(2, "tscommentenddash not implemented\n");
    718 	tstate = TSDATA;
    719 }
    720 
    721 void
    722 tscommentend(void)
    723 {
    724 	fprint(2, "tscommentend not implemented\n");
    725 	tstate = TSDATA;
    726 }
    727 
    728 void
    729 tscommentendbang(void)
    730 {
    731 	fprint(2, "tscommentendbang not implemented\n");
    732 	tstate = TSDATA;
    733 }
    734 
    735 void
    736 tsdoct(void)
    737 {
    738 	switch (tc) {
    739 	case '\t':
    740 	case '\n':
    741 	case '\r':
    742 	case ' ':
    743 		tstate = TSDOCT_BEFORE;
    744 		break;
    745 	case '>':
    746 		treconsume = 1;
    747 		tstate = TSDOCT_BEFORE;
    748 		break;
    749 	case -1: /* eof */
    750 		fprint(2, "eof in doctype parse error, tc='%c'\n", tc);
    751 		ctoken = newtok(TDOCT);
    752 		ctoken->flags |= TF_FORCE_QUIRKS;
    753 		s_terminate(ctoken->name);
    754 		temit(ctoken);
    755 		break;
    756 	default:
    757 		fprint(2, "missing whitespace before doctype name parse error, tc='%c'\n", tc);
    758 		treconsume = 1;
    759 		tstate = TSDOCT_BEFORE;
    760 	}
    761 }
    762 
    763 void
    764 tsdoctbefore(void)
    765 {
    766 	switch (tc) {
    767 	case '\t':
    768 	case '\n':
    769 	case '\r':
    770 	case ' ':
    771 		break;
    772 	case '\0':
    773 		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
    774 		ctoken = newtok(TDOCT);
    775 		s_putc(ctoken->name, REPCHAR);
    776 		tstate = TSDOCT_NAME;
    777 		break;
    778 	case '>':
    779 		fprint(2, "missing doctype name parse error, tc='%c'\n", tc);
    780 		ctoken = newtok(TDOCT);
    781 		ctoken->flags |= TF_FORCE_QUIRKS;
    782 		s_terminate(ctoken->name);
    783 		temit(ctoken);
    784 		break;
    785 	case -1: /* EOF */
    786 		fprint(2, "eof in doctype parse error, tc='%c'\n", tc);
    787 		ctoken = newtok(TDOCT);
    788 		ctoken->flags |= TF_FORCE_QUIRKS;
    789 		s_terminate(ctoken->name);
    790 		temit(ctoken);
    791 		temit(eoftok());
    792 		break;
    793 	default:
    794 		if (tc < 'a') tc += 0x20;
    795 		ctoken = newtok(TDOCT);
    796 		s_putc(ctoken->name, tc);
    797 		tstate = TSDOCT_NAME;
    798 	}
    799 }
    800 
    801 void
    802 tsdoctname(void)
    803 {
    804 	switch (tc) {
    805 	case '\t':
    806 	case '\n':
    807 	case '\r':
    808 	case ' ':
    809 		tstate = TSDOCT_NAME_AFTER;
    810 		break;
    811 	case '>':
    812 		tstate = TSDATA;
    813 		s_terminate(ctoken->name);
    814 		temit(ctoken);
    815 		break;
    816 	case '\0':
    817 		fprint(2, "unexpected null character parse error\n");
    818 		s_putc(ctoken->name, REPCHAR);
    819 		break;
    820 	case -1: /* EOF */
    821 		fprint(2, "eof in doctype parse error\n");
    822 		ctoken->flags |= TF_FORCE_QUIRKS;
    823 		s_terminate(ctoken->name);
    824 		temit(ctoken);
    825 		temit(eoftok());
    826 		break;
    827 	default:
    828 		talpha(1);
    829 	}
    830 }
    831 
    832 void
    833 tsdoctnameafter(void)
    834 {
    835 	fprint(2, "tsdoctnameafter not implemented\n");
    836 	tstate = TSDATA;
    837 }
    838 
    839 void
    840 tsdoctpubkafter(void)
    841 {
    842 	fprint(2, "tsdoctpubkafter not implemented\n");
    843 	tstate = TSDATA;
    844 }
    845 
    846 void
    847 tsdoctpubidbefore(void)
    848 {
    849 	fprint(2, "tsdoctpubidbefore not implemented\n");
    850 	tstate = TSDATA;
    851 }
    852 
    853 void
    854 tsdoctpubiddq(void)
    855 {
    856 	fprint(2, "tsdoctpubiddq not implemented\n");
    857 	tstate = TSDATA;
    858 }
    859 
    860 void
    861 tsdoctpubidsq(void)
    862 {
    863 	fprint(2, "tsdoctpubidsq not implemented\n");
    864 	tstate = TSDATA;
    865 }
    866 
    867 void
    868 tsdoctpubidafter(void)
    869 {
    870 	fprint(2, "tsdoctpubidafter not implemented\n");
    871 	tstate = TSDATA;
    872 }
    873 
    874 void
    875 tsdoctbetween(void)
    876 {
    877 	fprint(2, "tsdoctbetween not implemented\n");
    878 	tstate = TSDATA;
    879 }
    880 
    881 void
    882 tsdoctsyskafter(void)
    883 {
    884 	fprint(2, "tsdoctsyskafter not implemented\n");
    885 	tstate = TSDATA;
    886 }
    887 
    888 void
    889 tsdoctsysidbefore(void)
    890 {
    891 	fprint(2, "tsdoctsysidbefore not implemented\n");
    892 	tstate = TSDATA;
    893 }
    894 
    895 void
    896 tsdoctsysiddq(void)
    897 {
    898 	fprint(2, "tsdoctsysiddq not implemented\n");
    899 	tstate = TSDATA;
    900 }
    901 
    902 void
    903 tsdoctsysidsq(void)
    904 {
    905 	fprint(2, "tsdoctsysidsq not implemented\n");
    906 	tstate = TSDATA;
    907 }
    908 
    909 void
    910 tsdoctsysidafter(void)
    911 {
    912 	fprint(2, "tsdoctsysidafter not implemented\n");
    913 	tstate = TSDATA;
    914 }
    915 
    916 void
    917 tsdoctbogus(void)
    918 {
    919 	fprint(2, "tsdoctbogus not implemented\n");
    920 	tstate = TSDATA;
    921 }
    922 
    923 void
    924 tscdat(void)
    925 {
    926 	fprint(2, "tscdat not implemented\n");
    927 	tstate = TSDATA;
    928 }
    929 
    930 void
    931 tscdatbrk(void)
    932 {
    933 	fprint(2, "tscdatbrk not implemented\n");
    934 	tstate = TSDATA;
    935 }
    936 
    937 void
    938 tscdatend(void)
    939 {
    940 	fprint(2, "tscdatend not implemented\n");
    941 	tstate = TSDATA;
    942 }
    943 
    944 void
    945 tscref(void)
    946 {
    947 	if ((ALPHA(tc)) || (DIGIT(tc))) {
    948 		treconsume = 1;
    949 		tstate = TSNCREF;
    950 		return;
    951 	}
    952 	switch (tc) {
    953 	case '#':
    954 		s_putc(ctempbuf, tc);
    955 		tstate = TSNUMREF;
    956 		break;
    957 	default:
    958 		treconsume = 1;
    959 		s_terminate(ctempbuf);
    960 		s_append(cattr->value, s_to_c(ctempbuf));
    961 		s_reset(ctempbuf);
    962 		tstate = treturn;	
    963 	}
    964 	fprint(2, "tscref not implemented\n");
    965 	tstate = TSDATA;
    966 }
    967 
    968 void
    969 tsncref(void)
    970 {
    971 	fprint(2, "tsncref not implemented\n");
    972 	tstate = treturn;
    973 }
    974 
    975 void
    976 tsamam(void)
    977 {
    978 	fprint(2, "tsamam not implemented\n");
    979 	tstate = TSDATA;
    980 }
    981 
    982 void
    983 tsnumref(void)
    984 {
    985 	fprint(2, "tsnumref not implemented\n");
    986 	tstate = TSDATA;
    987 }
    988 
    989 void
    990 tshexrefstart(void)
    991 {
    992 	fprint(2, "tshexrefstart not implemented\n");
    993 	tstate = TSDATA;
    994 }
    995 
    996 void
    997 tsdecrefstart(void)
    998 {
    999 	fprint(2, "tsdecrefstart not implemented\n");
   1000 	tstate = TSDATA;
   1001 }
   1002 
   1003 void
   1004 tshexref(void)
   1005 {
   1006 	fprint(2, "tshexref not implemented\n");
   1007 	tstate = TSDATA;
   1008 }
   1009 
   1010 void
   1011 tsdecref(void)
   1012 {
   1013 	fprint(2, "tsdecref not implemented\n");
   1014 	tstate = TSDATA;
   1015 }
   1016 
   1017 void
   1018 tsnumrefend(void)
   1019 {
   1020 	fprint(2, "tsnumrefend not implemented\n");
   1021 	tstate = TSDATA;
   1022 }
   1023 
   1024 void
   1025 tsscriptendname(void)
   1026 {
   1027 	if (talpha(1) != 0) return;
   1028 	if (1 /* appropriate end tag token */) {
   1029 		switch (tc) {
   1030 		case '\t':
   1031 		case '\n':
   1032 		case '\r':
   1033 		case ' ':
   1034 			tstate = TSANAME_BEFORE;
   1035 			break;
   1036 		case '/':
   1037 			tstate = TSSCSTAG;
   1038 			break;
   1039 		case '>':
   1040 			tstate = TSDATA;
   1041 			break;
   1042 		}
   1043 	} else {
   1044 		temit(chartok('<'));
   1045 		temit(chartok('/'));
   1046 		temitbuf(ctempbuf);
   1047 	}
   1048 }
   1049 
   1050 
   1051 void
   1052 tsscriptescstart(void)
   1053 {
   1054 	if (tc == '-') {
   1055 		tstate = TSSCRIPT_ESC_START_DASH;
   1056 		temit(chartok('-'));
   1057 	} else {
   1058 		treconsume = 1;
   1059 		tstate = TSSCRIPT;
   1060 	}
   1061 }
   1062 
   1063 
   1064 void
   1065 tsscriptescstartdash(void)
   1066 {
   1067 	if (tc == '-') {
   1068 		tstate = TSSCRIPT_ESC_DDASH;
   1069 		temit(chartok('-'));
   1070 	} else {
   1071 		treconsume = 1;
   1072 		tstate = TSSCRIPT;
   1073 	}
   1074 }
   1075 
   1076 
   1077 void
   1078 tsscriptesc(void)
   1079 {
   1080 	switch (tc) {
   1081 	case '-':
   1082 		tstate = TSSCRIPT_ESC_DASH;
   1083 		temit(chartok('-'));
   1084 		break;
   1085 	case '<':
   1086 		tstate = TSSCRIPT_ESC_LESS;
   1087 		break;
   1088 	case '\0':
   1089 		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
   1090 		temit(chartok(REPCHAR));
   1091 		break;
   1092 	case -1: /* EOF */
   1093 		fprint(2, "eof in scipt html comment like text parse error, tc='%c'\n", tc);
   1094 		temit(eoftok());
   1095 	default:
   1096 		temit(chartok(tc));
   1097 	}
   1098 }
   1099 
   1100 
   1101 void
   1102 tsscriptescdash(void)
   1103 {
   1104 	switch (tc) {
   1105 	case '-':
   1106 		tstate = TSSCRIPT_ESC_DDASH;
   1107 		temit(chartok('-'));
   1108 		break;
   1109 	case '<':
   1110 		tstate = TSSCRIPT_ESC_LESS;
   1111 		break;
   1112 	case '\0':
   1113 		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
   1114 		tstate = TSSCRIPT_ESC;
   1115 		temit(chartok(REPCHAR));
   1116 		break;
   1117 	case -1:
   1118 		fprint(2, "eof in script html comment like text parse error, tc='%c'\n", tc);
   1119 		temit(eoftok());
   1120 		break;
   1121 	default:
   1122 		tstate = TSSCRIPT_ESC;
   1123 		temit(chartok(tc));
   1124 	}
   1125 }
   1126 
   1127 
   1128 void
   1129 tsscriptescddash(void)
   1130 {
   1131 	fprint(2, "tsscriptescddash not implemented\n");
   1132 	tstate = TSDATA;
   1133 }
   1134 
   1135 
   1136 void
   1137 tsscriptescless(void)
   1138 {
   1139 	fprint(2, "tsscriptescless not implemented\n");
   1140 	tstate = TSDATA;
   1141 }
   1142 
   1143 
   1144 void
   1145 tsscriptescendopen(void)
   1146 {
   1147 	fprint(2, "tsscriptescendopen not implemented\n");
   1148 	tstate = TSDATA;
   1149 }
   1150 
   1151 
   1152 void
   1153 tsscriptescendname(void)
   1154 {
   1155 	fprint(2, "tsscriptescendname not implemented\n");
   1156 	tstate = TSDATA;
   1157 }
   1158 
   1159 
   1160 void
   1161 tsscriptdescstart(void)
   1162 {
   1163 	fprint(2, "tsscriptdescstart not implemented\n");
   1164 	tstate = TSDATA;
   1165 }
   1166 
   1167 
   1168 void
   1169 tsscriptdesc(void)
   1170 {
   1171 	fprint(2, "tsscriptdesc not implemented\n");
   1172 	tstate = TSDATA;
   1173 }
   1174 
   1175 
   1176 void
   1177 tsscriptdescdash(void)
   1178 {
   1179 	fprint(2, "tsscriptdescdash not implemented\n");
   1180 	tstate = TSDATA;
   1181 }
   1182 
   1183 
   1184 void
   1185 tsscriptdescddash(void)
   1186 {
   1187 	fprint(2, "tsscriptdescddash not implemented\n");
   1188 	tstate = TSDATA;
   1189 }
   1190 
   1191 
   1192 void
   1193 tsscriptdescless(void)
   1194 {
   1195 	fprint(2, "tsscriptdescless not implemented\n");
   1196 	tstate = TSDATA;
   1197 }
   1198 
   1199 
   1200 void
   1201 tsscriptdescend(void)
   1202 {
   1203 	fprint(2, "tsscriptdescend not implemented\n");
   1204 	tstate = TSDATA;
   1205 }
   1206 
   1207 
   1208 
   1209 void
   1210 tsscriptendopen(void)
   1211 {
   1212 	if (ALPHA(tc) != 0) {
   1213 		treconsume = 1;
   1214 		tstate = TSSCIRPT_END_NAME;
   1215 	} else {
   1216 		temit(chartok('<'));
   1217 		temit(chartok('/'));
   1218 		treconsume = 1;
   1219 		tstate = TSDATA;
   1220 	}
   1221 }
   1222 
   1223 void
   1224 tsscriptless(void)
   1225 {
   1226 	switch (tc) {
   1227 	case '/':
   1228 		s_reset(ctempbuf);
   1229 		tstate = TSSCRIPT_END_OPEN;
   1230 		break;
   1231 	case '!':
   1232 		tstate = TSSCRIPT_ESC_START;
   1233 		temit(chartok('<'));
   1234 		temit(chartok('!'));
   1235 		break;
   1236 	default:
   1237 		temit(chartok('<'));
   1238 		treconsume = 1;
   1239 		tstate = TSSCRIPT;
   1240 	}		
   1241 }
   1242 
   1243 void
   1244 tsrawtendname(void)
   1245 {
   1246 	if (ALPHA(tc) != 0) {
   1247 		if (tc < 'a') tc+= 0x20;
   1248 		
   1249 	} else if (1 /* appropriate end tag token */ ) {
   1250 		switch (tc) {
   1251 		case '\t':
   1252 		case '\n':
   1253 		case '\r':
   1254 		case ' ':
   1255 			tstate = TSANAME_BEFORE;
   1256 			break;
   1257 		case '/':
   1258 			tstate = TSSCSTAG;
   1259 			break;
   1260 		case '>':
   1261 			tstate = TSDATA;
   1262 			break;
   1263 		}		
   1264 	} else {
   1265 		temit(chartok('<'));
   1266 		temit(chartok('/'));
   1267 		temitbuf(ctempbuf);
   1268 		treconsume = 1;
   1269 		tstate = TSRAWT;
   1270 	}
   1271 }
   1272 
   1273 void
   1274 tsrawtendopen(void)
   1275 {
   1276 	if (ALPHA(tc) != 0) {
   1277 		ctoken = newtok(TEND);
   1278 		treconsume = 1;
   1279 		tstate = TSRAWT;
   1280 	} else {
   1281 		temit(chartok('<'));
   1282 		temit(chartok('/'));
   1283 		treconsume = 1;
   1284 	}
   1285 }
   1286 
   1287 void
   1288 tsrawtless(void)
   1289 {
   1290 	if (tc == '/') {
   1291 		s_reset(ctempbuf);
   1292 		tstate = TSRAWT_END_OPEN;
   1293 	} else {
   1294 		temit(chartok('<'));
   1295 		treconsume = 1;
   1296 	}
   1297 }
   1298 
   1299 void
   1300 tsrcdtendname(void)
   1301 {
   1302 	if (talpha (1) != 0) return;
   1303 	if ( 1 /* appropriate end tag token ??? */) {
   1304 		switch (tc) {
   1305 		case '\t':
   1306 		case '\n':
   1307 		case '\r':
   1308 		case ' ':
   1309 			tstate = TSANAME_BEFORE;
   1310 			break;
   1311 		case '/':
   1312 			tstate = TSSCSTAG;
   1313 			break;
   1314 		case '>':
   1315 			tstate = TSDATA;
   1316 			temit(chartok(tc));
   1317 		}
   1318 	} else {
   1319 		temit(chartok('<'));
   1320 		temit(chartok('/'));
   1321 		temitbuf(ctempbuf);
   1322 		treconsume = 1;
   1323 		tstate = TSRCDT;
   1324 	}
   1325 }
   1326 
   1327 void
   1328 tsrcdtendopen(void)
   1329 {
   1330 	if (ALPHA(tc) != 0) {
   1331 		ctoken = newtok(TEND);
   1332 		treconsume = 1;
   1333 		tstate = TSRCDT_END_NAME;
   1334 	} else {
   1335 		treconsume = 1;
   1336 		temit(chartok('<'));
   1337 		temit(chartok('/'));
   1338 	}
   1339 }
   1340 
   1341 void
   1342 tsrcdtless(void)
   1343 {
   1344 	switch (tc) {
   1345 	case '/':
   1346 		s_reset(ctempbuf);
   1347 		tstate = TSRCDT_END_OPEN;
   1348 		break;
   1349 	default:
   1350 		treconsume = 1;
   1351 		temit(chartok('<'));
   1352 	}
   1353 }
   1354 
   1355 void
   1356 tstagname(void)
   1357 {
   1358 	switch (tc) {
   1359 	case '\t':
   1360 	case '\n':
   1361 	case '\r':
   1362 	case ' ':
   1363 		s_terminate(ctoken->name);
   1364 		tstate = TSANAME_BEFORE;
   1365 		break;
   1366 	case '/':
   1367 		s_terminate(ctoken->name);
   1368 		tstate = TSSCSTAG;
   1369 		break;
   1370 	case '>':
   1371 		s_terminate(ctoken->name);
   1372 		temit(ctoken);
   1373 		tstate = TSDATA;
   1374 		break;
   1375 	case '\0':
   1376 		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
   1377 		s_putc(ctoken->name, REPCHAR);
   1378 		break;
   1379 	case -1:
   1380 		fprint(2, "eof in tag parse error, tc='%c'\n", tc);
   1381 		teof = 1;
   1382 		temit(eoftok());
   1383 		break;
   1384 	default:
   1385 		talpha(1);
   1386 	} 
   1387 }
   1388 
   1389 void
   1390 tsetagopen(void)
   1391 {
   1392 	if (ALPHA(tc) != 0) {
   1393 		ctoken = newtok(TEND);
   1394 		treconsume = 1;
   1395 		tstate = TSTAG_NAME;
   1396 	} else switch (tc) {
   1397 	case '>':
   1398 		fprint(2, "missing end tag name parse error, tc='%c'\n", tc);
   1399 		tstate = TSDATA;
   1400 		break;
   1401 	case -1:
   1402 		fprint(2, "eof before tag name parse error, tc='%c'\n", tc);
   1403 		temit(chartok('<'));
   1404 		teof = 1;
   1405 		temit(eoftok());
   1406 		break;
   1407 	default:
   1408 		fprint(2, "invalid first character of tag name parse error, tc='%c'\n", tc);
   1409 		ctoken = newtok(TCOMM);
   1410 		treconsume = 1;
   1411 		tstate = TSBOGUS_COMMENT;
   1412 	}
   1413 }
   1414 
   1415 void
   1416 tstagopen(void)
   1417 {
   1418 	if (ALPHA(tc) != 0) {
   1419 		ctoken = newtok(TSTART);
   1420 		treconsume = 1;
   1421 		tstate = TSTAG_NAME;
   1422 	} else switch (tc) {
   1423 	case '!':
   1424 		tstate = TSMKUP_OPEN;
   1425 		break;
   1426 	case '/':
   1427 		tstate = TSETAG_OPEN;
   1428 		break;
   1429 	case '?':
   1430 		fprint(2, "unexpected question mark instead of tag name parse error, tc='%c'\n", tc);
   1431 		ctoken = newtok(TCOMM);
   1432 		treconsume = 1;
   1433 		tstate = TSBOGUS_COMMENT;
   1434 		break;
   1435 	case -1:
   1436 		fprint(2, "eof before tag name parse error");
   1437 		temit(chartok('<'));
   1438 		teof = 1;
   1439 		temit(eoftok());
   1440 		break;
   1441 	default:
   1442 		fprint(2, "invalid first character of tag name parse error, tc='%c'\n", tc);
   1443 		temit(chartok('<'));
   1444 		treconsume = 1;
   1445 		tstate = TSDATA;
   1446 	}
   1447 }
   1448 
   1449 void
   1450 tsptxt(void)
   1451 {
   1452 	switch (tc) {
   1453 	case '\0':
   1454 		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
   1455 		temit(chartok(REPCHAR)); 
   1456 		break;
   1457 	case -1: /* EOF */
   1458 		teof = 1;
   1459 		temit(eoftok());
   1460 		break;
   1461 	default:
   1462 		temit(chartok(tc));
   1463 	}
   1464 }
   1465 
   1466 void
   1467 tsscript(void)
   1468 {
   1469 	switch (tc) {
   1470 	case  '<':
   1471 		tstate = TSSCRIPT_LESS;
   1472 		break;
   1473 	case '\0':
   1474 		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
   1475 		temit(chartok(REPCHAR));
   1476 		break;
   1477 	case -1: /* EOF */
   1478 		teof = 1;
   1479 		temit(eoftok());
   1480 		break;
   1481 	default:
   1482 		temit(chartok(tc));
   1483 	}
   1484 }
   1485 
   1486 void
   1487 tsrawt(void)
   1488 {
   1489 	switch (tc) {
   1490 	case  '<':
   1491 		tstate = TSRAWT_LESS;
   1492 		break;
   1493 	case '\0':
   1494 		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
   1495 		temit(chartok(REPCHAR));
   1496 		break;
   1497 	case -1: /* EOF */
   1498 		teof = 1;
   1499 		temit(eoftok());
   1500 		break;
   1501 	default:
   1502 		temit(chartok(tc));
   1503 	}
   1504 }
   1505 
   1506 void
   1507 tsrcdt(void)
   1508 {
   1509 	switch (tc) {
   1510 	case '&':
   1511 		treturn = TSRCDT;
   1512 		tstate = TSCREF;
   1513 		break;
   1514 	case '<':
   1515 		tstate = TSRCDT_LESS;
   1516 		break;
   1517 	case '\0':
   1518 		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
   1519 		temit(chartok(REPCHAR));
   1520 		break;
   1521 	case -1: /* EOF */
   1522 		teof = 1;
   1523 		temit(eoftok());
   1524 		break;
   1525 	default:
   1526 		temit(chartok(tc));
   1527 	}	
   1528 }
   1529 
   1530 void
   1531 tsdata(void)
   1532 {
   1533 	switch (tc) {
   1534 	case '&':
   1535 		treturn = TSDATA;
   1536 		tstate = TSCREF;
   1537 		break;
   1538 	case '<':
   1539 		tstate = TSTAG_OPEN;
   1540 		break;
   1541 	case '\0':
   1542 		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
   1543 		temit(chartok(tc));
   1544 		break;
   1545 	case -1: /* EOF */
   1546 		teof = 1;
   1547 		temit(eoftok());
   1548 		break;
   1549 	default:
   1550 		temit(chartok(tc));
   1551 	}
   1552 }
   1553 
   1554 int
   1555 talpha(int tolower)
   1556 {
   1557 	if (ALPHA(tc) == 0) return 0;
   1558 	s_putc(ctempbuf, tc);
   1559 	if ((tolower != 0) && (tc < 'a')) tc+=0x20;
   1560 	s_putc(ctoken->name, tc);
   1561 	return 1;
   1562 }
   1563 
   1564 void
   1565 tconsume(void)
   1566 {
   1567 	char *buf;
   1568 	if (treconsume != 0) {
   1569 		treconsume = 0;
   1570 		return;
   1571 	}
   1572 	buf = s_to_c(clookaheadbuf);
   1573 	if (buf[0] != '\0') {
   1574 		tc = buf[0];
   1575 		print("tc = %uX\n", tc);
   1576 		/* TODO make this code utf-aware */
   1577 		String *shift;
   1578 		shift = s_copy(buf+1);
   1579 		s_free(clookaheadbuf);
   1580 		clookaheadbuf = shift;
   1581 	}
   1582 	else tc = gc();
   1583 }
   1584 
   1585 void
   1586 temitbuf(String *str)
   1587 {
   1588 	Rune r;
   1589 	char *buf;
   1590 	int n, len;
   1591 	buf = s_to_c(str);
   1592 	len = strlen(buf);
   1593 	for (n = 0; n < len; n += chartorune(&r, buf+n)){
   1594 		temit(chartok(r));
   1595 	}
   1596 	
   1597 }
   1598 
   1599 void
   1600 temit(Token *t)
   1601 {
   1602 	send(outchannel, &t);
   1603 }
   1604 
   1605 int
   1606 gc(void) /* getchar func name is reserved by stdio.h */
   1607 {
   1608 	#define GCBUF 1024
   1609 	static char buf[GCBUF], *bp=buf+1;
   1610 	static long n = 0;
   1611 	if (bp > buf+n-1){
   1612 		n = read(0, buf, GCBUF);
   1613 		if (n <= 0) return -1;
   1614 		bp = buf;
   1615 	}
   1616 	bp++;
   1617 	return *(bp-1);
   1618 }
   1619 
   1620 void
   1621 threadtokenize(void *v)
   1622 {
   1623 	Tokctl *tc;
   1624 	tc = v;
   1625 	outchannel = tc->c;
   1626 	teof = 0;
   1627 	threadsetname("tokenizer");
   1628 	ctempbuf = s_new();
   1629 	clookaheadbuf = s_new();
   1630 	while (teof == 0) {
   1631 		if (tstate >= TMAX) {
   1632 			fprint(2, "[TOKENIZER] unknown tstate %d\n", tstate);
   1633 			break;
   1634 		}
   1635 		tconsume();
   1636 		tstab[tstate]();
   1637 	}
   1638 }