tok.c (31763B)
1 #include <u.h> 2 #include <libc.h> 3 #include <String.h> 4 #include <thread.h> 5 6 #include "html5dom.h" 7 #include "ncref.h" 8 9 #define ALPHA(x) ((x >=0x41) && (x <= 0x7a)) 10 #define DIGIT(x) ((x >=0x30) && (x <= 0x39)) 11 12 Channel *outchannel; 13 14 int gc(void); 15 16 17 18 Token* 19 eoftok(void) 20 { 21 Token *t; 22 t = mallocz(sizeof(Token), 1); 23 t->type = TEOF; 24 return t; 25 } 26 27 Token* 28 chartok(Rune c) 29 { 30 Token *t; 31 t = mallocz(sizeof(Token), 1); 32 t->c = c; 33 t->type = TCHAR; 34 return t; 35 } 36 37 Token* 38 newtok(int type) 39 { 40 Token *nt; 41 nt = mallocz(sizeof(Token), 1); 42 nt->type = type; 43 nt->name = s_new(); 44 nt->attr = nil; 45 return nt; 46 } 47 48 void 49 t_free(Token *t) 50 { 51 s_free(t->name); 52 free(t); 53 } 54 55 56 Attr* 57 tnewattr(Token *t) 58 { 59 int n; 60 if (t->attr == nil) t->attr = mallocz(sizeof(Attr*), 1); 61 for (n=0; (t->attr)[n] != nil; n++); 62 t->attr = realloc(t->attr, (n + 1) * sizeof(Attr*)); 63 t->attr[n-1] = mallocz(sizeof(Attr), 1); 64 t->attr[n-1]->name = s_new(); 65 t->attr[n-1]->value = s_new(); 66 t->attr[n] = nil; 67 return t->attr[n-1]; 68 } 69 70 void 71 attr_free(Attr *attr) 72 { 73 s_free(attr->name); 74 s_free(attr->value); 75 free(attr); 76 } 77 78 u32int insertion_mode = IMinitial; 79 80 /* Tokenizer vars and funcs */ 81 82 Rune tc; 83 int treconsume = 0; 84 int teof; 85 86 Token *ctoken; 87 Attr *cattr; 88 String *ctempbuf; 89 String *clookaheadbuf; 90 91 void tconsume(void); 92 void temit(Token*); 93 void temitbuf(String*); 94 int talpha(int); 95 96 void tsdata(void); 97 void tsrcdt(void); 98 void tsrawt(void); 99 void tsscript(void); 100 void tsptxt(void); 101 void tstagopen(void); 102 void tsetagopen(void); 103 void tstagname(void); 104 void tsrcdtless(void); 105 void tsrcdtendopen(void); 106 void tsrcdtendname(void); 107 void tsrawtless(void); 108 void tsrawtendopen(void); 109 void tsrawtendname(void); 110 void tsscriptless(void); 111 void tsscriptendopen(void); 112 void tsscriptendname(void); 113 114 void tsscriptescstart(void); 115 void tsscriptescstartdash(void); 116 void tsscriptesc(void); 117 void tsscriptescdash(void); 118 void tsscriptescddash(void); 119 void tsscriptescless(void); 120 void tsscriptescendopen(void); 121 void tsscriptescendname(void); 122 void tsscriptdescstart(void); 123 void tsscriptdesc(void); 124 void tsscriptdescdash(void); 125 void tsscriptdescddash(void); 126 void tsscriptdescless(void); 127 void tsscriptdescend(void); 128 129 void tsanamebefore(void); 130 void tsaname(void); 131 void tsanameafter(void); 132 void tsavalbefore(void); 133 void tsavaldq(void); 134 void tsavalsq(void); 135 void tsavaluq(void); 136 void tsavalafter(void); 137 void tsscstag(void); 138 void tsboguscomment(void); 139 void tsmkupopen(void); 140 void tscommentstart(void); 141 void tscommentstartdash(void); 142 void tscomment(void); 143 void tscommentless(void); 144 void tscommentlessbang(void); 145 void tscommentlessbangdash(void); 146 void tscommentlessbangddash(void); 147 void tscommentenddash(void); 148 void tscommentend(void); 149 void tscommentendbang(void); 150 void tsdoct(void); 151 void tsdoctbefore(void); 152 void tsdoctname(void); 153 void tsdoctnameafter(void); 154 void tsdoctpubkafter(void); 155 void tsdoctpubidbefore(void); 156 void tsdoctpubiddq(void); 157 void tsdoctpubidsq(void); 158 void tsdoctpubidafter(void); 159 void tsdoctbetween(void); 160 void tsdoctsyskafter(void); 161 void tsdoctsysidbefore(void); 162 void tsdoctsysiddq(void); 163 void tsdoctsysidsq(void); 164 void tsdoctsysidafter(void); 165 void tsdoctbogus(void); 166 void tscdat(void); 167 void tscdatbrk(void); 168 void tscdatend(void); 169 void tscref(void); 170 void tsncref(void); 171 void tsamam(void); 172 void tsnumref(void); 173 void tshexrefstart(void); 174 void tsdecrefstart(void); 175 void tshexref(void); 176 void tsdecref(void); 177 void tsnumrefend(void); 178 179 180 #define REPCHAR Runeerror /* replacement character */ 181 182 enum { 183 TSDATA, /* data */ 184 TSRCDT, /* RCDATA */ 185 TSRAWT, /* RAWTEXT */ 186 TSSCRIPT, /* script data */ 187 TSPTXT, /* PLAINTEXT */ 188 TSTAG_OPEN, /* tag open */ 189 TSETAG_OPEN, /* end tag open */ 190 TSTAG_NAME, /* tag name */ 191 TSRCDT_LESS, /* RCDATA less-than sign */ 192 TSRCDT_END_OPEN, /* RCDATA end tag open */ 193 TSRCDT_END_NAME, /* RCDATA end tag name */ 194 TSRAWT_LESS, /* RAWTEXT less-than sign */ 195 TSRAWT_END_OPEN, /* RAWTEXT end tag open */ 196 TSRAWT_END_NAME, /* RAWTEXT end tag name */ 197 TSSCRIPT_LESS, /* script data less-than sign */ 198 TSSCRIPT_END_OPEN, /* script data end tag open */ 199 TSSCIRPT_END_NAME, /* script data end tag name */ 200 TSSCRIPT_ESC_START, /* scirpt data escape start */ 201 TSSCRIPT_ESC_START_DASH, /* scirpt data escape start dash */ 202 TSSCRIPT_ESC, /* scirpt data escaped */ 203 TSSCRIPT_ESC_DASH, /* scirpt data escaped dash */ 204 205 TSSCRIPT_ESC_DDASH, /* scirpt data escaped dash dash */ 206 TSSCRIPT_ESC_LESS, /* scirpt data escaped less-than sign */ 207 TSSCRIPT_ESC_END_OPEN, /* scirpt data escaped end tag open */ 208 TSSCRIPT_ESC_END_NAME, /* scirpt data escaped end tag name */ 209 TSSCRIPT_DESC_START, /* scirpt data double escape start */ 210 TSSCRIPT_DESC, /* scirpt data double escaped */ 211 TSSCRIPT_DESC_DASH, /* scirpt data double escaped dash */ 212 TSSCRIPT_DESC_DDASH, /* scirpt data double escaped dash dash */ 213 TSSCRIPT_DESC_LESS, /* scirpt data double escaped less-than sign */ 214 TSSCRIPT_DESC_END, /* scirpt data double escape end */ 215 216 TSANAME_BEFORE, /* Before attribute name */ 217 TSANAME, /* Attribute name */ 218 TSANAME_AFTER, /* After attribute name */ 219 TSAVAL_BEFORE, /* Before attribute value */ 220 TSAVAL_DQ, /* Attribute value (double-quoted) */ 221 TSAVAL_SQ, /* Attribute value (single-quoted) */ 222 TSAVAL_UQ, /* Attribute value (unquoted) */ 223 TSAVAL_AFTER, /* After attribute value (quoted) */ 224 225 TSSCSTAG, /* Self-closing start tag */ 226 TSBOGUS_COMMENT, /* Bogus comment */ 227 TSMKUP_OPEN, /* Markup declaration open */ 228 229 TSCOMMENT_START, /* Comment start */ 230 TSCOMMENT_START_DASH, /* Comment start dash */ 231 TSCOMMENT, /* Comment */ 232 TSCOMMENT_LESS, /* Comment less-than sign */ 233 TSCOMMENT_LESS_BANG, /* Comment less-than sign bang */ 234 TSCOMMENT_LESS_BANG_DASH, /* Comment less-than sign bang dash */ 235 TSCOMMENT_LESS_BANG_DDASH, /* Comment less-than sign bang dash dash */ 236 TSCOMMENT_END_DASH, /* Comment end dash */ 237 TSCOMMENT_END, /* Comment end */ 238 TSCOMMENT_END_BANG, /* Comment end bang */ 239 240 TSDOCT, /* DOCTYPE */ 241 TSDOCT_BEFORE, /* Before DOCTYPE name */ 242 TSDOCT_NAME, /* DOCTYPE name */ 243 TSDOCT_NAME_AFTER, /* After DOCTYPE name */ 244 TSDOCT_PUBK_AFTER, /* After DOCTYPE public keyword */ 245 TSDOCT_PUBID_BEFORE, /* Before DOCTYPE public identifier */ 246 TSDOCT_PUBID_DQ, /* DOCTYPE public identifier (double-quoted) */ 247 TSDOCT_PUBID_SQ, /* DOCTYPE public identifier (single-quoted) */ 248 TSDOCT_PUBID_AFTER, /* After DOCTYPE public identifier */ 249 TSDOCT_BETWEEN, /* Between DOCTYPE public and system identifiers */ 250 TSDOCT_SYSK_AFTER, /* After DOCTYPE system keyword */ 251 TSDOCT_SYSID_BEFORE, /* Before DOCTYPE system identifier */ 252 TSDOCT_SYSID_DQ, /* DOCTYPE system identifier (double-quoted) */ 253 TSDOCT_SYSID_SQ, /* DOCTYPE system identifier (single-quoted) */ 254 TSDOCT_SYSID_AFTER, /* After DOCTYPE system identifier */ 255 TSDOCT_BOGUS, /* Bogus DOCTYPE */ 256 257 TSCDAT, /* CDATA section */ 258 TSCDAT_BRK, /* CDATA section bracket */ 259 TSCDAT_END, /* CDATA section end */ 260 261 TSCREF, /* Character reference */ 262 TSNCREF, /* Named character reference */ 263 TSAMAM, /* Ambiguous ampersand */ 264 TSNUMREF, /* Numeric character reference */ 265 TSHEXREF_START, /* Hexadecimal character reference start */ 266 TSDECREF_START, /* Decimal character reference start */ 267 TSHEXREF, /* Hexadecimal character reference */ 268 TSDECREF, /* Decimal character reference */ 269 TSNUMREF_END, /* Numeric character reference end */ 270 271 TMAX, 272 }; 273 274 void (*tstab[])(void) = { 275 [TSDATA] = tsdata, 276 [TSRCDT] = tsrcdt, 277 [TSRAWT] = tsrawt, 278 [TSSCRIPT] = tsscript, 279 [TSPTXT] = tsptxt, 280 [TSTAG_OPEN] = tstagopen, 281 [TSETAG_OPEN] = tsetagopen, 282 [TSTAG_NAME] = tstagname, 283 [TSRCDT_LESS] = tsrcdtless, 284 [TSRCDT_END_OPEN] = tsrcdtendopen, 285 [TSRCDT_END_NAME] = tsrcdtendname, 286 [TSRAWT_LESS] = tsrawtless, 287 [TSRAWT_END_OPEN] = tsrawtendopen, 288 [TSSCRIPT_LESS] = tsscriptless, 289 [TSSCRIPT_END_OPEN] = tsscriptendopen, 290 [TSSCIRPT_END_NAME] = tsscriptendname, 291 [TSSCRIPT_ESC_START] = tsscriptesc, 292 [TSSCRIPT_ESC_START_DASH] = tsscriptesc, 293 [TSSCRIPT_ESC] = tsscriptesc, 294 [TSSCRIPT_ESC_DASH] = tsscriptescdash, 295 [TSSCRIPT_ESC_DDASH] = tsscriptescddash, 296 [TSSCRIPT_ESC_LESS] = tsscriptescless, 297 [TSSCRIPT_ESC_END_OPEN] = tsscriptescendopen, 298 [TSSCRIPT_ESC_END_NAME] = tsscriptescendname, 299 [TSSCRIPT_DESC_START] = tsscriptdescstart, 300 [TSSCRIPT_DESC] = tsscriptdesc, 301 [TSSCRIPT_DESC_DASH] = tsscriptdescdash, 302 [TSSCRIPT_DESC_DDASH] = tsscriptdescddash, 303 [TSSCRIPT_DESC_LESS] = tsscriptdescless, 304 [TSSCRIPT_DESC_END] = tsscriptdescend, 305 306 [TSANAME_BEFORE] = tsanamebefore, 307 [TSANAME] = tsaname, 308 [TSANAME_AFTER] = tsanameafter, 309 [TSAVAL_BEFORE] = tsavalbefore, 310 [TSAVAL_DQ] = tsavaldq, 311 [TSAVAL_SQ] = tsavalsq, 312 [TSAVAL_UQ] = tsavaluq, 313 [TSAVAL_AFTER] = tsavalafter, 314 [TSSCSTAG] = tsscstag, 315 [TSBOGUS_COMMENT] = tsboguscomment, 316 [TSMKUP_OPEN] = tsmkupopen, 317 [TSCOMMENT_START] = tscommentstart, 318 [TSCOMMENT_START_DASH] = tscommentstartdash, 319 [TSCOMMENT] = tscomment, 320 [TSCOMMENT_LESS] = tscommentless, 321 [TSCOMMENT_LESS_BANG] = tscommentlessbang, 322 [TSCOMMENT_LESS_BANG_DASH] = tscommentlessbangdash, 323 [TSCOMMENT_LESS_BANG_DDASH] = tscommentlessbangddash, 324 [TSCOMMENT_END_DASH] = tscommentenddash, 325 [TSCOMMENT_END] = tscommentend, 326 [TSCOMMENT_END_BANG] = tscommentendbang, 327 [TSDOCT] = tsdoct, 328 [TSDOCT_BEFORE] = tsdoctbefore, 329 [TSDOCT_NAME] = tsdoctname, 330 [TSDOCT_NAME_AFTER] = tsdoctnameafter, 331 [TSDOCT_PUBK_AFTER] = tsdoctpubkafter, 332 [TSDOCT_PUBID_BEFORE] = tsdoctpubidbefore, 333 [TSDOCT_PUBID_DQ] = tsdoctpubiddq, 334 [TSDOCT_PUBID_SQ] = tsdoctpubidsq, 335 [TSDOCT_PUBID_AFTER] = tsdoctpubidafter, 336 [TSDOCT_BETWEEN] = tsdoctbetween, 337 [TSDOCT_SYSK_AFTER] = tsdoctsyskafter, 338 [TSDOCT_SYSID_BEFORE] = tsdoctsysidbefore, 339 [TSDOCT_SYSID_DQ] = tsdoctsysiddq, 340 [TSDOCT_SYSID_SQ] = tsdoctsysidsq, 341 [TSDOCT_SYSID_AFTER] = tsdoctsysidafter, 342 [TSDOCT_BOGUS] = tsdoctbogus, 343 [TSCDAT] = tscdat, 344 [TSCDAT_BRK] = tscdatbrk, 345 [TSCDAT_END] = tscdatend, 346 [TSCREF] = tscref, 347 [TSNCREF] = tsncref, 348 [TSAMAM] = tsamam, 349 [TSNUMREF] = tsnumref, 350 [TSHEXREF_START] = tshexrefstart, 351 [TSDECREF_START] = tsdecrefstart, 352 [TSHEXREF] = tshexref, 353 [TSDECREF] = tsdecref, 354 [TSNUMREF_END] = tsnumrefend, 355 }; 356 357 int tstate = TSDATA; 358 int treturn = -1; 359 360 void 361 tsanamebefore(void) 362 { 363 switch (tc) { 364 case '\t': 365 case '\n': 366 case '\r': 367 case ' ': 368 break; 369 case '/': 370 case '>': 371 case -1: 372 treconsume = 1; 373 tstate = TSANAME_AFTER; 374 break; 375 case '=': 376 fprint(2, "unexpected equals sign before attribute name parse error, tc='%c'\n", tc); 377 cattr = tnewattr(ctoken); 378 s_putc(cattr->name, tc); 379 tstate = TSANAME; 380 break; 381 default: 382 cattr = tnewattr(ctoken); 383 treconsume = 1; 384 tstate = TSANAME; 385 } 386 } 387 388 void 389 tsaname(void) 390 { 391 if (ALPHA(tc) != 0) { 392 if (tc < 'a') tc += 0x20; 393 } 394 switch (tc) { 395 case '\t': 396 case '\n': 397 case '\r': 398 case ' ': 399 case '/': 400 case '>': 401 case -1: 402 treconsume = 1; 403 s_terminate(cattr->name); 404 tstate = TSANAME_AFTER; 405 break; 406 case '=': 407 tstate = TSAVAL_BEFORE; 408 break; 409 case '\0': 410 fprint(2, "unexpected null character parse error, tc='%c'\n", tc); 411 s_putc(cattr->name, REPCHAR); 412 break; 413 case '"': 414 case '\'': 415 case '<': 416 fprint(2, "unexpected character in attribute name parse error, tc='%c'\n", tc); 417 default: 418 s_putc(cattr->name, tc); 419 } 420 /* TODO check for duplicate attribute names on leaving or emitting */ 421 } 422 423 void 424 tsanameafter(void) 425 { 426 switch (tc) { 427 case '\t': 428 case '\n': 429 case '\r': 430 case ' ': 431 break; 432 case '/': 433 tstate = TSSCSTAG; 434 break; 435 case '=': 436 tstate = TSAVAL_BEFORE; 437 break; 438 case '>': 439 tstate = TSDATA; 440 s_terminate(ctoken->name); 441 temit(ctoken); 442 break; 443 case -1: /* EOF */ 444 fprint(2, "eof in tag parse error\n"); 445 temit(eoftok()); 446 break; 447 default: 448 cattr = tnewattr(ctoken); 449 treconsume = 1; 450 tstate = TSANAME; 451 } 452 } 453 454 void 455 tsavalbefore(void) 456 { 457 switch (tc) { 458 case '\t': 459 case '\n': 460 case '\r': 461 case ' ': 462 break; 463 case '"': 464 tstate = TSAVAL_DQ; 465 break; 466 case '\'': 467 tstate = TSAVAL_SQ; 468 break; 469 case '>': 470 fprint(2, "missing attribute value parse error\n"); 471 s_terminate(ctoken->name); 472 temit(ctoken); 473 tstate = TSDATA; 474 break; 475 default: 476 treconsume = 1; 477 tstate = TSAVAL_UQ; 478 } 479 } 480 481 void 482 tsavaldq(void) 483 { 484 switch (tc) { 485 case '"': 486 tstate = TSAVAL_AFTER; 487 break; 488 case '&': 489 treturn = TSAVAL_DQ; 490 tstate = TSCREF; 491 break; 492 case '\0': 493 fprint(2, "unexpected null character parse error\n"); 494 s_putc(cattr->value, REPCHAR); 495 break; 496 case -1: /* EOF */ 497 fprint(2, "oef in tag parse error\n"); 498 temit(eoftok()); 499 break; 500 default: 501 s_putc(cattr->value, tc); 502 } 503 } 504 505 void 506 tsavalsq(void) 507 { 508 switch (tc) { 509 case '\'': 510 tstate = TSAVAL_AFTER; 511 break; 512 case '&': 513 treturn = TSAVAL_SQ; 514 tstate = TSCREF; 515 break; 516 case '\0': 517 fprint(2, "unexpected null character parse error\n"); 518 s_putc(cattr->value, REPCHAR); 519 break; 520 case -1: /* EOF */ 521 fprint(2, "oef in tag parse error\n"); 522 temit(eoftok()); 523 break; 524 default: 525 s_putc(cattr->value, tc); 526 } 527 } 528 529 void 530 tsavaluq(void) 531 { 532 switch (tc) { 533 case '\t': 534 case '\n': 535 case '\r': 536 case ' ': 537 s_terminate(cattr->value); 538 tstate = TSANAME_BEFORE; 539 break; 540 case '&': 541 treturn = TSAVAL_UQ; 542 tstate = TSCREF; 543 break; 544 case '>': 545 s_terminate(ctoken->name); 546 s_terminate(cattr->value); 547 tstate = TSDATA; 548 break; 549 case '\0': 550 fprint(2, "unexpected null character parse error\n"); 551 s_putc(cattr->value, REPCHAR); 552 break; 553 case -1: /* EOF */ 554 fprint(2, "oef in tag parse error\n"); 555 temit(eoftok()); 556 break; case '"': 557 case '\'': 558 case '<': 559 case '=': 560 case '`': 561 fprint(2, "unexpected character in unquoted attribute value parse error\n"); 562 default: 563 s_putc(cattr->value, tc); 564 } 565 } 566 567 void 568 tsavalafter(void) 569 { 570 switch (tc) { 571 case '\t': 572 case '\n': 573 case '\r': 574 case ' ': 575 tstate = TSANAME_BEFORE; 576 break; 577 case '/': 578 ctoken->flags |= TSSCSTAG; 579 break; 580 case '>': 581 s_terminate(ctoken->name); 582 s_terminate(cattr->value); 583 temit(ctoken); 584 tstate = TSDATA; 585 break; 586 case -1: /* EOF */ 587 fprint(2, "eof in tag parse error\n"); 588 temit(eoftok()); 589 break; 590 default: 591 fprint(2, "missing whitespace between attributes parse error\n"); 592 treconsume = 1; 593 tstate = TSANAME_BEFORE; 594 } 595 } 596 597 void 598 tsscstag(void) 599 { 600 switch (tc) { 601 case '>': 602 ctoken->flags |= TF_SELF_CLOSING; 603 tstate = TSDATA; 604 temit(ctoken); 605 break; 606 case -1: 607 fprint(2, "eof in tag parse error\n"); 608 temit(eoftok()); 609 break; 610 default: 611 fprint(2, "unxpected solidus in tag parse error\n"); 612 treconsume = 1; 613 tstate = TSANAME_BEFORE; 614 } 615 } 616 617 void 618 tsboguscomment(void) 619 { 620 fprint(2, "tsboguscomment not implemented\n"); 621 tstate = TSDATA; 622 } 623 624 void 625 tsmkupopen(void) 626 { 627 int i; 628 String *mbuf, *lowered; 629 mbuf = s_new(); 630 s_putc(mbuf, tc); 631 tconsume(); 632 s_putc(mbuf, tc); 633 if (strncmp(s_to_c(mbuf), "--", 2) == 0) { 634 ctoken = newtok(TCOMM); 635 tstate = TSCOMMENT_START; 636 s_free(mbuf); 637 return; 638 } 639 for (i = 0; i < 5; i++) { 640 tconsume(); 641 s_putc(mbuf, tc); 642 } 643 if (strncmp(s_to_c(mbuf), "[CDATA[", 7) == 0) { 644 /* TODO: check if adjusted current node */ 645 tstate = TSCDAT; 646 s_free(mbuf); 647 return; 648 } 649 lowered = s_copy(s_to_c(mbuf)); 650 s_tolower(lowered); 651 if (strncmp(s_to_c(lowered), "doctype", 7) == 0) { 652 tstate = TSDOCT; 653 s_free(mbuf); 654 s_free(lowered); 655 return; 656 } 657 fprint(2, "incorrectly opened comment parse error, tc='%c'\n", tc); 658 ctoken = newtok(TCOMM); 659 tstate = TSBOGUS_COMMENT; 660 s_append(clookaheadbuf, s_to_c(mbuf)); 661 s_free(lowered); 662 s_free(mbuf); 663 } 664 665 void 666 tscommentstart(void) 667 { 668 fprint(2, "tscommentstart not implemented\n"); 669 tstate = TSDATA; 670 } 671 672 void 673 tscommentstartdash(void) 674 { 675 fprint(2, "tscommentstartdash not implemented\n"); 676 tstate = TSDATA; 677 } 678 679 void 680 tscomment(void) 681 { 682 fprint(2, "tscomment not implemented\n"); 683 tstate = TSDATA; 684 } 685 686 void 687 tscommentless(void) 688 { 689 fprint(2, "tscommentless not implemented\n"); 690 tstate = TSDATA; 691 } 692 693 void 694 tscommentlessbang(void) 695 { 696 fprint(2, "tscommentlessbang not implemented\n"); 697 tstate = TSDATA; 698 } 699 700 void 701 tscommentlessbangdash(void) 702 { 703 fprint(2, "tscommentlessbangdash not implemented\n"); 704 tstate = TSDATA; 705 } 706 707 void 708 tscommentlessbangddash(void) 709 { 710 fprint(2, "tscommentlessbangddash not implemented\n"); 711 tstate = TSDATA; 712 } 713 714 void 715 tscommentenddash(void) 716 { 717 fprint(2, "tscommentenddash not implemented\n"); 718 tstate = TSDATA; 719 } 720 721 void 722 tscommentend(void) 723 { 724 fprint(2, "tscommentend not implemented\n"); 725 tstate = TSDATA; 726 } 727 728 void 729 tscommentendbang(void) 730 { 731 fprint(2, "tscommentendbang not implemented\n"); 732 tstate = TSDATA; 733 } 734 735 void 736 tsdoct(void) 737 { 738 switch (tc) { 739 case '\t': 740 case '\n': 741 case '\r': 742 case ' ': 743 tstate = TSDOCT_BEFORE; 744 break; 745 case '>': 746 treconsume = 1; 747 tstate = TSDOCT_BEFORE; 748 break; 749 case -1: /* eof */ 750 fprint(2, "eof in doctype parse error, tc='%c'\n", tc); 751 ctoken = newtok(TDOCT); 752 ctoken->flags |= TF_FORCE_QUIRKS; 753 s_terminate(ctoken->name); 754 temit(ctoken); 755 break; 756 default: 757 fprint(2, "missing whitespace before doctype name parse error, tc='%c'\n", tc); 758 treconsume = 1; 759 tstate = TSDOCT_BEFORE; 760 } 761 } 762 763 void 764 tsdoctbefore(void) 765 { 766 switch (tc) { 767 case '\t': 768 case '\n': 769 case '\r': 770 case ' ': 771 break; 772 case '\0': 773 fprint(2, "unexpected null character parse error, tc='%c'\n", tc); 774 ctoken = newtok(TDOCT); 775 s_putc(ctoken->name, REPCHAR); 776 tstate = TSDOCT_NAME; 777 break; 778 case '>': 779 fprint(2, "missing doctype name parse error, tc='%c'\n", tc); 780 ctoken = newtok(TDOCT); 781 ctoken->flags |= TF_FORCE_QUIRKS; 782 s_terminate(ctoken->name); 783 temit(ctoken); 784 break; 785 case -1: /* EOF */ 786 fprint(2, "eof in doctype parse error, tc='%c'\n", tc); 787 ctoken = newtok(TDOCT); 788 ctoken->flags |= TF_FORCE_QUIRKS; 789 s_terminate(ctoken->name); 790 temit(ctoken); 791 temit(eoftok()); 792 break; 793 default: 794 if (tc < 'a') tc += 0x20; 795 ctoken = newtok(TDOCT); 796 s_putc(ctoken->name, tc); 797 tstate = TSDOCT_NAME; 798 } 799 } 800 801 void 802 tsdoctname(void) 803 { 804 switch (tc) { 805 case '\t': 806 case '\n': 807 case '\r': 808 case ' ': 809 tstate = TSDOCT_NAME_AFTER; 810 break; 811 case '>': 812 tstate = TSDATA; 813 s_terminate(ctoken->name); 814 temit(ctoken); 815 break; 816 case '\0': 817 fprint(2, "unexpected null character parse error\n"); 818 s_putc(ctoken->name, REPCHAR); 819 break; 820 case -1: /* EOF */ 821 fprint(2, "eof in doctype parse error\n"); 822 ctoken->flags |= TF_FORCE_QUIRKS; 823 s_terminate(ctoken->name); 824 temit(ctoken); 825 temit(eoftok()); 826 break; 827 default: 828 talpha(1); 829 } 830 } 831 832 void 833 tsdoctnameafter(void) 834 { 835 fprint(2, "tsdoctnameafter not implemented\n"); 836 tstate = TSDATA; 837 } 838 839 void 840 tsdoctpubkafter(void) 841 { 842 fprint(2, "tsdoctpubkafter not implemented\n"); 843 tstate = TSDATA; 844 } 845 846 void 847 tsdoctpubidbefore(void) 848 { 849 fprint(2, "tsdoctpubidbefore not implemented\n"); 850 tstate = TSDATA; 851 } 852 853 void 854 tsdoctpubiddq(void) 855 { 856 fprint(2, "tsdoctpubiddq not implemented\n"); 857 tstate = TSDATA; 858 } 859 860 void 861 tsdoctpubidsq(void) 862 { 863 fprint(2, "tsdoctpubidsq not implemented\n"); 864 tstate = TSDATA; 865 } 866 867 void 868 tsdoctpubidafter(void) 869 { 870 fprint(2, "tsdoctpubidafter not implemented\n"); 871 tstate = TSDATA; 872 } 873 874 void 875 tsdoctbetween(void) 876 { 877 fprint(2, "tsdoctbetween not implemented\n"); 878 tstate = TSDATA; 879 } 880 881 void 882 tsdoctsyskafter(void) 883 { 884 fprint(2, "tsdoctsyskafter not implemented\n"); 885 tstate = TSDATA; 886 } 887 888 void 889 tsdoctsysidbefore(void) 890 { 891 fprint(2, "tsdoctsysidbefore not implemented\n"); 892 tstate = TSDATA; 893 } 894 895 void 896 tsdoctsysiddq(void) 897 { 898 fprint(2, "tsdoctsysiddq not implemented\n"); 899 tstate = TSDATA; 900 } 901 902 void 903 tsdoctsysidsq(void) 904 { 905 fprint(2, "tsdoctsysidsq not implemented\n"); 906 tstate = TSDATA; 907 } 908 909 void 910 tsdoctsysidafter(void) 911 { 912 fprint(2, "tsdoctsysidafter not implemented\n"); 913 tstate = TSDATA; 914 } 915 916 void 917 tsdoctbogus(void) 918 { 919 fprint(2, "tsdoctbogus not implemented\n"); 920 tstate = TSDATA; 921 } 922 923 void 924 tscdat(void) 925 { 926 fprint(2, "tscdat not implemented\n"); 927 tstate = TSDATA; 928 } 929 930 void 931 tscdatbrk(void) 932 { 933 fprint(2, "tscdatbrk not implemented\n"); 934 tstate = TSDATA; 935 } 936 937 void 938 tscdatend(void) 939 { 940 fprint(2, "tscdatend not implemented\n"); 941 tstate = TSDATA; 942 } 943 944 void 945 tscref(void) 946 { 947 if ((ALPHA(tc)) || (DIGIT(tc))) { 948 treconsume = 1; 949 tstate = TSNCREF; 950 return; 951 } 952 switch (tc) { 953 case '#': 954 s_putc(ctempbuf, tc); 955 tstate = TSNUMREF; 956 break; 957 default: 958 treconsume = 1; 959 s_terminate(ctempbuf); 960 s_append(cattr->value, s_to_c(ctempbuf)); 961 s_reset(ctempbuf); 962 tstate = treturn; 963 } 964 fprint(2, "tscref not implemented\n"); 965 tstate = TSDATA; 966 } 967 968 void 969 tsncref(void) 970 { 971 fprint(2, "tsncref not implemented\n"); 972 tstate = treturn; 973 } 974 975 void 976 tsamam(void) 977 { 978 fprint(2, "tsamam not implemented\n"); 979 tstate = TSDATA; 980 } 981 982 void 983 tsnumref(void) 984 { 985 fprint(2, "tsnumref not implemented\n"); 986 tstate = TSDATA; 987 } 988 989 void 990 tshexrefstart(void) 991 { 992 fprint(2, "tshexrefstart not implemented\n"); 993 tstate = TSDATA; 994 } 995 996 void 997 tsdecrefstart(void) 998 { 999 fprint(2, "tsdecrefstart not implemented\n"); 1000 tstate = TSDATA; 1001 } 1002 1003 void 1004 tshexref(void) 1005 { 1006 fprint(2, "tshexref not implemented\n"); 1007 tstate = TSDATA; 1008 } 1009 1010 void 1011 tsdecref(void) 1012 { 1013 fprint(2, "tsdecref not implemented\n"); 1014 tstate = TSDATA; 1015 } 1016 1017 void 1018 tsnumrefend(void) 1019 { 1020 fprint(2, "tsnumrefend not implemented\n"); 1021 tstate = TSDATA; 1022 } 1023 1024 void 1025 tsscriptendname(void) 1026 { 1027 if (talpha(1) != 0) return; 1028 if (1 /* appropriate end tag token */) { 1029 switch (tc) { 1030 case '\t': 1031 case '\n': 1032 case '\r': 1033 case ' ': 1034 tstate = TSANAME_BEFORE; 1035 break; 1036 case '/': 1037 tstate = TSSCSTAG; 1038 break; 1039 case '>': 1040 tstate = TSDATA; 1041 break; 1042 } 1043 } else { 1044 temit(chartok('<')); 1045 temit(chartok('/')); 1046 temitbuf(ctempbuf); 1047 } 1048 } 1049 1050 1051 void 1052 tsscriptescstart(void) 1053 { 1054 if (tc == '-') { 1055 tstate = TSSCRIPT_ESC_START_DASH; 1056 temit(chartok('-')); 1057 } else { 1058 treconsume = 1; 1059 tstate = TSSCRIPT; 1060 } 1061 } 1062 1063 1064 void 1065 tsscriptescstartdash(void) 1066 { 1067 if (tc == '-') { 1068 tstate = TSSCRIPT_ESC_DDASH; 1069 temit(chartok('-')); 1070 } else { 1071 treconsume = 1; 1072 tstate = TSSCRIPT; 1073 } 1074 } 1075 1076 1077 void 1078 tsscriptesc(void) 1079 { 1080 switch (tc) { 1081 case '-': 1082 tstate = TSSCRIPT_ESC_DASH; 1083 temit(chartok('-')); 1084 break; 1085 case '<': 1086 tstate = TSSCRIPT_ESC_LESS; 1087 break; 1088 case '\0': 1089 fprint(2, "unexpected null character parse error, tc='%c'\n", tc); 1090 temit(chartok(REPCHAR)); 1091 break; 1092 case -1: /* EOF */ 1093 fprint(2, "eof in scipt html comment like text parse error, tc='%c'\n", tc); 1094 temit(eoftok()); 1095 default: 1096 temit(chartok(tc)); 1097 } 1098 } 1099 1100 1101 void 1102 tsscriptescdash(void) 1103 { 1104 switch (tc) { 1105 case '-': 1106 tstate = TSSCRIPT_ESC_DDASH; 1107 temit(chartok('-')); 1108 break; 1109 case '<': 1110 tstate = TSSCRIPT_ESC_LESS; 1111 break; 1112 case '\0': 1113 fprint(2, "unexpected null character parse error, tc='%c'\n", tc); 1114 tstate = TSSCRIPT_ESC; 1115 temit(chartok(REPCHAR)); 1116 break; 1117 case -1: 1118 fprint(2, "eof in script html comment like text parse error, tc='%c'\n", tc); 1119 temit(eoftok()); 1120 break; 1121 default: 1122 tstate = TSSCRIPT_ESC; 1123 temit(chartok(tc)); 1124 } 1125 } 1126 1127 1128 void 1129 tsscriptescddash(void) 1130 { 1131 fprint(2, "tsscriptescddash not implemented\n"); 1132 tstate = TSDATA; 1133 } 1134 1135 1136 void 1137 tsscriptescless(void) 1138 { 1139 fprint(2, "tsscriptescless not implemented\n"); 1140 tstate = TSDATA; 1141 } 1142 1143 1144 void 1145 tsscriptescendopen(void) 1146 { 1147 fprint(2, "tsscriptescendopen not implemented\n"); 1148 tstate = TSDATA; 1149 } 1150 1151 1152 void 1153 tsscriptescendname(void) 1154 { 1155 fprint(2, "tsscriptescendname not implemented\n"); 1156 tstate = TSDATA; 1157 } 1158 1159 1160 void 1161 tsscriptdescstart(void) 1162 { 1163 fprint(2, "tsscriptdescstart not implemented\n"); 1164 tstate = TSDATA; 1165 } 1166 1167 1168 void 1169 tsscriptdesc(void) 1170 { 1171 fprint(2, "tsscriptdesc not implemented\n"); 1172 tstate = TSDATA; 1173 } 1174 1175 1176 void 1177 tsscriptdescdash(void) 1178 { 1179 fprint(2, "tsscriptdescdash not implemented\n"); 1180 tstate = TSDATA; 1181 } 1182 1183 1184 void 1185 tsscriptdescddash(void) 1186 { 1187 fprint(2, "tsscriptdescddash not implemented\n"); 1188 tstate = TSDATA; 1189 } 1190 1191 1192 void 1193 tsscriptdescless(void) 1194 { 1195 fprint(2, "tsscriptdescless not implemented\n"); 1196 tstate = TSDATA; 1197 } 1198 1199 1200 void 1201 tsscriptdescend(void) 1202 { 1203 fprint(2, "tsscriptdescend not implemented\n"); 1204 tstate = TSDATA; 1205 } 1206 1207 1208 1209 void 1210 tsscriptendopen(void) 1211 { 1212 if (ALPHA(tc) != 0) { 1213 treconsume = 1; 1214 tstate = TSSCIRPT_END_NAME; 1215 } else { 1216 temit(chartok('<')); 1217 temit(chartok('/')); 1218 treconsume = 1; 1219 tstate = TSDATA; 1220 } 1221 } 1222 1223 void 1224 tsscriptless(void) 1225 { 1226 switch (tc) { 1227 case '/': 1228 s_reset(ctempbuf); 1229 tstate = TSSCRIPT_END_OPEN; 1230 break; 1231 case '!': 1232 tstate = TSSCRIPT_ESC_START; 1233 temit(chartok('<')); 1234 temit(chartok('!')); 1235 break; 1236 default: 1237 temit(chartok('<')); 1238 treconsume = 1; 1239 tstate = TSSCRIPT; 1240 } 1241 } 1242 1243 void 1244 tsrawtendname(void) 1245 { 1246 if (ALPHA(tc) != 0) { 1247 if (tc < 'a') tc+= 0x20; 1248 1249 } else if (1 /* appropriate end tag token */ ) { 1250 switch (tc) { 1251 case '\t': 1252 case '\n': 1253 case '\r': 1254 case ' ': 1255 tstate = TSANAME_BEFORE; 1256 break; 1257 case '/': 1258 tstate = TSSCSTAG; 1259 break; 1260 case '>': 1261 tstate = TSDATA; 1262 break; 1263 } 1264 } else { 1265 temit(chartok('<')); 1266 temit(chartok('/')); 1267 temitbuf(ctempbuf); 1268 treconsume = 1; 1269 tstate = TSRAWT; 1270 } 1271 } 1272 1273 void 1274 tsrawtendopen(void) 1275 { 1276 if (ALPHA(tc) != 0) { 1277 ctoken = newtok(TEND); 1278 treconsume = 1; 1279 tstate = TSRAWT; 1280 } else { 1281 temit(chartok('<')); 1282 temit(chartok('/')); 1283 treconsume = 1; 1284 } 1285 } 1286 1287 void 1288 tsrawtless(void) 1289 { 1290 if (tc == '/') { 1291 s_reset(ctempbuf); 1292 tstate = TSRAWT_END_OPEN; 1293 } else { 1294 temit(chartok('<')); 1295 treconsume = 1; 1296 } 1297 } 1298 1299 void 1300 tsrcdtendname(void) 1301 { 1302 if (talpha (1) != 0) return; 1303 if ( 1 /* appropriate end tag token ??? */) { 1304 switch (tc) { 1305 case '\t': 1306 case '\n': 1307 case '\r': 1308 case ' ': 1309 tstate = TSANAME_BEFORE; 1310 break; 1311 case '/': 1312 tstate = TSSCSTAG; 1313 break; 1314 case '>': 1315 tstate = TSDATA; 1316 temit(chartok(tc)); 1317 } 1318 } else { 1319 temit(chartok('<')); 1320 temit(chartok('/')); 1321 temitbuf(ctempbuf); 1322 treconsume = 1; 1323 tstate = TSRCDT; 1324 } 1325 } 1326 1327 void 1328 tsrcdtendopen(void) 1329 { 1330 if (ALPHA(tc) != 0) { 1331 ctoken = newtok(TEND); 1332 treconsume = 1; 1333 tstate = TSRCDT_END_NAME; 1334 } else { 1335 treconsume = 1; 1336 temit(chartok('<')); 1337 temit(chartok('/')); 1338 } 1339 } 1340 1341 void 1342 tsrcdtless(void) 1343 { 1344 switch (tc) { 1345 case '/': 1346 s_reset(ctempbuf); 1347 tstate = TSRCDT_END_OPEN; 1348 break; 1349 default: 1350 treconsume = 1; 1351 temit(chartok('<')); 1352 } 1353 } 1354 1355 void 1356 tstagname(void) 1357 { 1358 switch (tc) { 1359 case '\t': 1360 case '\n': 1361 case '\r': 1362 case ' ': 1363 s_terminate(ctoken->name); 1364 tstate = TSANAME_BEFORE; 1365 break; 1366 case '/': 1367 s_terminate(ctoken->name); 1368 tstate = TSSCSTAG; 1369 break; 1370 case '>': 1371 s_terminate(ctoken->name); 1372 temit(ctoken); 1373 tstate = TSDATA; 1374 break; 1375 case '\0': 1376 fprint(2, "unexpected null character parse error, tc='%c'\n", tc); 1377 s_putc(ctoken->name, REPCHAR); 1378 break; 1379 case -1: 1380 fprint(2, "eof in tag parse error, tc='%c'\n", tc); 1381 teof = 1; 1382 temit(eoftok()); 1383 break; 1384 default: 1385 talpha(1); 1386 } 1387 } 1388 1389 void 1390 tsetagopen(void) 1391 { 1392 if (ALPHA(tc) != 0) { 1393 ctoken = newtok(TEND); 1394 treconsume = 1; 1395 tstate = TSTAG_NAME; 1396 } else switch (tc) { 1397 case '>': 1398 fprint(2, "missing end tag name parse error, tc='%c'\n", tc); 1399 tstate = TSDATA; 1400 break; 1401 case -1: 1402 fprint(2, "eof before tag name parse error, tc='%c'\n", tc); 1403 temit(chartok('<')); 1404 teof = 1; 1405 temit(eoftok()); 1406 break; 1407 default: 1408 fprint(2, "invalid first character of tag name parse error, tc='%c'\n", tc); 1409 ctoken = newtok(TCOMM); 1410 treconsume = 1; 1411 tstate = TSBOGUS_COMMENT; 1412 } 1413 } 1414 1415 void 1416 tstagopen(void) 1417 { 1418 if (ALPHA(tc) != 0) { 1419 ctoken = newtok(TSTART); 1420 treconsume = 1; 1421 tstate = TSTAG_NAME; 1422 } else switch (tc) { 1423 case '!': 1424 tstate = TSMKUP_OPEN; 1425 break; 1426 case '/': 1427 tstate = TSETAG_OPEN; 1428 break; 1429 case '?': 1430 fprint(2, "unexpected question mark instead of tag name parse error, tc='%c'\n", tc); 1431 ctoken = newtok(TCOMM); 1432 treconsume = 1; 1433 tstate = TSBOGUS_COMMENT; 1434 break; 1435 case -1: 1436 fprint(2, "eof before tag name parse error"); 1437 temit(chartok('<')); 1438 teof = 1; 1439 temit(eoftok()); 1440 break; 1441 default: 1442 fprint(2, "invalid first character of tag name parse error, tc='%c'\n", tc); 1443 temit(chartok('<')); 1444 treconsume = 1; 1445 tstate = TSDATA; 1446 } 1447 } 1448 1449 void 1450 tsptxt(void) 1451 { 1452 switch (tc) { 1453 case '\0': 1454 fprint(2, "unexpected null character parse error, tc='%c'\n", tc); 1455 temit(chartok(REPCHAR)); 1456 break; 1457 case -1: /* EOF */ 1458 teof = 1; 1459 temit(eoftok()); 1460 break; 1461 default: 1462 temit(chartok(tc)); 1463 } 1464 } 1465 1466 void 1467 tsscript(void) 1468 { 1469 switch (tc) { 1470 case '<': 1471 tstate = TSSCRIPT_LESS; 1472 break; 1473 case '\0': 1474 fprint(2, "unexpected null character parse error, tc='%c'\n", tc); 1475 temit(chartok(REPCHAR)); 1476 break; 1477 case -1: /* EOF */ 1478 teof = 1; 1479 temit(eoftok()); 1480 break; 1481 default: 1482 temit(chartok(tc)); 1483 } 1484 } 1485 1486 void 1487 tsrawt(void) 1488 { 1489 switch (tc) { 1490 case '<': 1491 tstate = TSRAWT_LESS; 1492 break; 1493 case '\0': 1494 fprint(2, "unexpected null character parse error, tc='%c'\n", tc); 1495 temit(chartok(REPCHAR)); 1496 break; 1497 case -1: /* EOF */ 1498 teof = 1; 1499 temit(eoftok()); 1500 break; 1501 default: 1502 temit(chartok(tc)); 1503 } 1504 } 1505 1506 void 1507 tsrcdt(void) 1508 { 1509 switch (tc) { 1510 case '&': 1511 treturn = TSRCDT; 1512 tstate = TSCREF; 1513 break; 1514 case '<': 1515 tstate = TSRCDT_LESS; 1516 break; 1517 case '\0': 1518 fprint(2, "unexpected null character parse error, tc='%c'\n", tc); 1519 temit(chartok(REPCHAR)); 1520 break; 1521 case -1: /* EOF */ 1522 teof = 1; 1523 temit(eoftok()); 1524 break; 1525 default: 1526 temit(chartok(tc)); 1527 } 1528 } 1529 1530 void 1531 tsdata(void) 1532 { 1533 switch (tc) { 1534 case '&': 1535 treturn = TSDATA; 1536 tstate = TSCREF; 1537 break; 1538 case '<': 1539 tstate = TSTAG_OPEN; 1540 break; 1541 case '\0': 1542 fprint(2, "unexpected null character parse error, tc='%c'\n", tc); 1543 temit(chartok(tc)); 1544 break; 1545 case -1: /* EOF */ 1546 teof = 1; 1547 temit(eoftok()); 1548 break; 1549 default: 1550 temit(chartok(tc)); 1551 } 1552 } 1553 1554 int 1555 talpha(int tolower) 1556 { 1557 if (ALPHA(tc) == 0) return 0; 1558 s_putc(ctempbuf, tc); 1559 if ((tolower != 0) && (tc < 'a')) tc+=0x20; 1560 s_putc(ctoken->name, tc); 1561 return 1; 1562 } 1563 1564 void 1565 tconsume(void) 1566 { 1567 char *buf; 1568 if (treconsume != 0) { 1569 treconsume = 0; 1570 return; 1571 } 1572 buf = s_to_c(clookaheadbuf); 1573 if (buf[0] != '\0') { 1574 tc = buf[0]; 1575 print("tc = %uX\n", tc); 1576 /* TODO make this code utf-aware */ 1577 String *shift; 1578 shift = s_copy(buf+1); 1579 s_free(clookaheadbuf); 1580 clookaheadbuf = shift; 1581 } 1582 else tc = gc(); 1583 } 1584 1585 void 1586 temitbuf(String *str) 1587 { 1588 Rune r; 1589 char *buf; 1590 int n, len; 1591 buf = s_to_c(str); 1592 len = strlen(buf); 1593 for (n = 0; n < len; n += chartorune(&r, buf+n)){ 1594 temit(chartok(r)); 1595 } 1596 1597 } 1598 1599 void 1600 temit(Token *t) 1601 { 1602 send(outchannel, &t); 1603 } 1604 1605 int 1606 gc(void) /* getchar func name is reserved by stdio.h */ 1607 { 1608 #define GCBUF 1024 1609 static char buf[GCBUF], *bp=buf+1; 1610 static long n = 0; 1611 if (bp > buf+n-1){ 1612 n = read(0, buf, GCBUF); 1613 if (n <= 0) return -1; 1614 bp = buf; 1615 } 1616 bp++; 1617 return *(bp-1); 1618 } 1619 1620 void 1621 threadtokenize(void *v) 1622 { 1623 Tokctl *tc; 1624 tc = v; 1625 outchannel = tc->c; 1626 teof = 0; 1627 threadsetname("tokenizer"); 1628 ctempbuf = s_new(); 1629 clookaheadbuf = s_new(); 1630 while (teof == 0) { 1631 if (tstate >= TMAX) { 1632 fprint(2, "[TOKENIZER] unknown tstate %d\n", tstate); 1633 break; 1634 } 1635 tconsume(); 1636 tstab[tstate](); 1637 } 1638 }