commit 14491eb420b3f9dbd0e5fda1f6e9623a2795250b
parent d33e0725e8ace2b57e1be3faf6c9451b7c8220e9
Author: Pavel Renev <an2qzavok@gmail.com>
Date: Mon, 14 Dec 2020 14:33:11 +0000
semiworking html5dom
Diffstat:
M | domfs.c | | | 16 | +++++++++------- |
D | html2dom.c | | | 1341 | ------------------------------------------------------------------------------- |
A | html5dom.c | | | 61 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | html5dom.h | | | 82 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
M | mkfile | | | 10 | ++++++---- |
A | ncref.h | | | 2241 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | tok.c | | | 1638 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | tree.c | | | 103 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
8 files changed, 4140 insertions(+), 1352 deletions(-)
diff --git a/domfs.c b/domfs.c
@@ -433,6 +433,7 @@ void
fswrite(Req *r)
{
char *buf, *rstr;
+ long off, nsize;
Finf *file;
Fusr *f;
file = r->fid->aux;
@@ -440,10 +441,13 @@ fswrite(Req *r)
case USER:
// TODO: finish this section
f = file->aux;
- f->nsize = r->ifcall.count;
+ off = r->ifcall.offset;
+ if (r->d.mode & DMAPPEND) off = f->nsize;
+ nsize = off + r->ifcall.count;
+ if (nsize > f->nsize) f->nsize = nsize;
f->data = realloc(f->data, f->nsize);
- memmove(f->data, r->ifcall.data, f->nsize);
- r->ofcall.count = f->nsize;
+ memmove(f->data + off, r->ifcall.data, r->ifcall.count);
+ r->ofcall.count = r->ifcall.count;
rstr = nil;
break;
case NCTL:
@@ -651,7 +655,7 @@ fsdestroyfid(Fid *fid)
void
usage(void)
{
- fprint(2, "usage %s [-D][-m /n/dom][-s service]\n", argv0);
+ fprint(2, "usage %s [-D][-m /mnt/dom][-s service]\n", argv0); /* synopsis goes to fd 2; the path shown matches the new default mtpt in main */
exits("usage");
}
@@ -660,7 +664,7 @@ main(int argc, char **argv)
{
char *srv, *mtpt;
srv = nil;
- mtpt = "/n/dom";
+ mtpt = "/mnt/dom";
ARGBEGIN {
case 'm':
@@ -682,8 +686,6 @@ main(int argc, char **argv)
stackpush(&files, fnew);
stackpush(&trees, newtree());
- //stackpush(&trees, newtree());
- //stackpush(&trees, newtree());
Srv fs = {
.attach = fsattach,
diff --git a/html2dom.c b/html2dom.c
@@ -1,1341 +0,0 @@
-#include <u.h>
-#include <libc.h>
-#include <String.h>
-#include <thread.h>
-
-#define ALPHA(x) ((x >=0x41) && (x <= 0x7a))
-
-static char *drpath = "/n/dom";
-static char *tpath = nil;
-
-int gc(void);
-
-/* Tokens code */
-
-typedef struct Attr Attr;
-
-struct Attr{
- String *name;
- String *value;
-};
-
-enum { /* Token types */
- TDOCT,
- TSTART,
- TEND,
- TCOMM,
- TCHAR,
- TTAG,
- TEOF = -1,
-};
-
-typedef struct Token Token;
-struct Token {
- int type;
- Rune c;
- String *name;
- Attr **attr;
-};
-
-Token* chartok(Rune);
-Token* eoftok(void);
-void t_free(Token*);
-Attr* tnewattr(Token*);
-void attr_free(Attr*);
-
-Token*
-eoftok(void)
-{
- Token *t;
- t = mallocz(sizeof(Token), 1);
- t->type = TEOF;
- return t;
-}
-
-Token*
-chartok(Rune c)
-{
- Token *t;
- t = mallocz(sizeof(Token), 1);
- t->c = c;
- t->type = TCHAR;
- return t;
-}
-
-void
-t_free(Token *t)
-{
- s_free(t->name);
- free(t);
-}
-
-
-Attr*
-tnewattr(Token *t)
-{
- int n;
- if (t->attr == nil) t->attr = mallocz(sizeof(Attr*), 1);
- for (n=0; (t->attr)[n] != nil; n++);
- t->attr = realloc(t->attr, (n + 1) * sizeof(Attr*));
- t->attr[n] = mallocz(sizeof(Attr), 1);
- t->attr[n]->name = s_new();
- t->attr[n]->value = s_new();
- t->attr[n+1] = nil;
- return t->attr[n];
-}
-
-void
-attr_free(Attr *attr)
-{
- s_free(attr->name);
- s_free(attr->value);
- free(attr);
-}
-
-/*
- * Insertion modes, as defined in
- * https://html.spec.whatwg.org/#the-insertion-mode
- */
-
-enum {
- IMinitial = 0,
- IMbefore_html = 1,
- IMbefore_head = 1 << 1,
- IMin_head = 1 << 2,
- IMin_head_noscript = 1 << 3,
- IMafter_head = 1 << 4,
- IMin_body = 1 << 5,
- IMtext = 1 << 6,
- IMin_table = 1 << 7,
- IMin_table_text = 1 << 8,
- IMin_caption = 1 << 9,
- IMin_column_group = 1 << 10,
- IMin_table_body = 1 << 11,
- IMin_row = 1 << 12,
- IMin_cell = 1 << 13,
- IMin_select = 1 << 14,
- IMin_select_in_table = 1 << 15,
- IMin_template = 1 << 16,
- IMafter_body = 1 << 17,
- IMin_frameset = 1 << 18,
- IMafter_frameset = 1 << 19,
- IMafter_after_body = 1 << 20,
- IMafter_after_frameset = 1 << 21,
-};
-
-u32int insertion_mode = IMinitial;
-
-/* Tokenizer vars and funcs */
-
-Rune tc;
-int treconsume = 0;
-int teof;
-
-Token *ctoken;
-Attr *cattr;
-String *ctempbuf;
-
-void tconsume(void);
-void temit(Token*);
-void temitbuf(void);
-int talpha(int);
-
-void tsdata(void);
-void tsrcdt(void);
-void tsrawt(void);
-void tsscript(void);
-void tsptxt(void);
-void tstagopen(void);
-void tsetagopen(void);
-void tstagname(void);
-void tsrcdtless(void);
-void tsrcdtendopen(void);
-void tsrcdtendname(void);
-void tsrawtless(void);
-void tsrawtendopen(void);
-void tsrawtendname(void);
-void tsscriptless(void);
-void tsscriptendopen(void);
-void tsscriptendname(void);
-
-void tsscriptescstart(void);
-void tsscriptescstartdash(void);
-void tsscriptesc(void);
-void tsscriptescdash(void);
-void tsscriptescddash(void);
-void tsscriptescless(void);
-void tsscriptescendopen(void);
-void tsscriptescendname(void);
-void tsscriptdescstart(void);
-void tsscriptdesc(void);
-void tsscriptdescdash(void);
-void tsscriptdescddash(void);
-void tsscriptdescless(void);
-void tsscriptdescend(void);
-
-void tsanamebefore(void);
-void tsaname(void);
-void tsanameafter(void);
-void tsavalbefore(void);
-void tsavaldq(void);
-void tsavalsq(void);
-void tsavaluq(void);
-void tsavalafter(void);
-void tsscstag(void);
-void tsboguscomment(void);
-void tsmkupopen(void);
-void tscommentstart(void);
-void tscommentstartdash(void);
-void tscomment(void);
-void tscommentless(void);
-void tscommentlessbang(void);
-void tscommentlessbangdash(void);
-void tscommentlessbangddash(void);
-void tscommentebddash(void);
-void tscommentebd(void);
-void tscommentebdbang(void);
-void tsdoct(void);
-void tsdoctbefore(void);
-void tsdoctname(void);
-void tsdoctnameafter(void);
-void tsdoctpubkafter(void);
-void tsdoctpubidbefore(void);
-void tsdoctpubiddq(void);
-void tsdoctpubidsq(void);
-void tsdoctpubidafter(void);
-void tsdoctbetween(void);
-void tsdoctsyskafter(void);
-void tsdoctsysidbefore(void);
-void tsdoctsysiddQ(void);
-void tsdoctsysidSQ(void);
-void tsdoctsysidafter(void);
-void tsdoctbogus(void);
-void tscdat(void);
-void tscdatbrk(void);
-void tscdatend(void);
-void tscref(void);
-void tsncref(void);
-void tsamam(void);
-void tsnumref(void);
-void tshexrefstart(void);
-void tsdecrefstart(void);
-void tshexref(void);
-void tsdecref(void);
-void tsnumrefend(void);
-
-
-#define REPCHAR Runeerror /* replacement character */
-
-enum {
- TSDATA, /* data */
- TSRCDT, /* RCDATA */
- TSRAWT, /* RAWTEXT */
- TSSCRIPT, /* script data */
- TSPTXT, /* PLAINTEXT */
- TSTAG_OPEN, /* tag open */
- TSETAG_OPEN, /* end tag open */
- TSTAG_NAME, /* tag name */
- TSRCDT_LESS, /* RCDATA less-than sign */
- TSRCDT_END_OPEN, /* RCDATA end tag open */
- TSRCDT_END_NAME, /* RCDATA end tag name */
- TSRAWT_LESS, /* RAWTEXT less-than sign */
- TSRAWT_END_OPEN, /* RAWTEXT end tag open */
- TSRAWT_END_NAME, /* RAWTEXT end tag name */
- TSSCRIPT_LESS, /* script data less-than sign */
- TSSCRIPT_END_OPEN, /* script data end tag open */
- TSSCIRPT_END_NAME, /* script data end tag name */
- TSSCRIPT_ESC_START, /* scirpt data escape start */
- TSSCRIPT_ESC_START_DASH, /* scirpt data escape start dash */
- TSSCRIPT_ESC, /* scirpt data escaped */
- TSSCRIPT_ESC_DASH, /* scirpt data escaped dash */
-
- TSSCRIPT_ESC_DDASH, /* scirpt data escaped dash dash */
- TSSCRIPT_ESC_LESS, /* scirpt data escaped less-than sign */
- TSSCRIPT_ESC_END_OPEN, /* scirpt data escaped end tag open */
- TSSCRIPT_ESC_END_NAME, /* scirpt data escaped end tag name */
- TSSCRIPT_DESC_START, /* scirpt data double escape start */
- TSSCRIPT_DESC, /* scirpt data double escaped */
- TSSCRIPT_DESC_DASH, /* scirpt data double escaped dash */
- TSSCRIPT_DESC_DDASH, /* scirpt data double escaped dash dash */
- TSSCRIPT_DESC_LESS, /* scirpt data double escaped less-than sign */
- TSSCRIPT_DESC_END, /* scirpt data double escape end */
-
- TSANAME_BEFORE, /* Before attribute name */
- TSANAME, /* Attribute name */
- TSANAME_AFTER, /* After attribute name */
- TSAVAL_BEFORE, /* Before attribute value */
- TSAVAL_DQ, /* Attribute value (double-quoted) */
- TSAVAL_SQ, /* Attribute value (single-quoted) */
- TSAVAL_UQ, /* Attribute value (unquoted) */
- TSAVAL_AFTER, /* After attribute value (quoted) */
-
- TSSCSTAG, /* Self-closing start tag */
- TSBOGUS_COMMENT, /* Bogus comment */
- TSMKUP_OPEN, /* Markup declaration open */
-
- TSCOMMENT_START, /* Comment start */
- TSCOMMENT_START_DASH, /* Comment start dash */
- TSCOMMENT, /* Comment */
- TSCOMMENT_LESS, /* Comment less-than sign */
- TSCOMMENT_LESS_BANG, /* Comment less-than sign bang */
- TSCOMMENT_LESS_BANG_DASH, /* Comment less-than sign bang dash */
- TSCOMMENT_LESS_BANG_DDASH, /* Comment less-than sign bang dash dash */
- TSCOMMENT_END_DASH, /* Comment end dash */
- TSCOMMENT_END, /* Comment end */
- TSCOMMENT_END_BANG, /* Comment end bang */
-
- TSDOCT, /* DOCTYPE */
- TSDOCT_BEFORE, /* Before DOCTYPE name */
- TSDOCT_NAME, /* DOCTYPE name */
- TSDOCT_NAME_AFTER, /* After DOCTYPE name */
- TSDOCT_PUBK_AFTER, /* After DOCTYPE public keyword */
- TSDOCT_PUBID_BEFORE, /* Before DOCTYPE public identifier */
- TSDOCT_PUBID_DQ, /* DOCTYPE public identifier (double-quoted) */
- TSDOCT_PUBID_SQ, /* DOCTYPE public identifier (single-quoted) */
- TSDOCT_PUBID_AFTER, /* After DOCTYPE public identifier */
- TSDOCT_BETWEEN, /* Between DOCTYPE public and system identifiers */
- TSDOCT_SYSK_AFTER, /* After DOCTYPE system keyword */
- TSDOCT_SYSID_BEFORE, /* Before DOCTYPE system identifier */
- TSDOCT_SYSID_DQ, /* DOCTYPE system identifier (double-quoted) */
- TSDOCT_SYSID_SQ, /* DOCTYPE system identifier (single-quoted) */
- TSDOCT_SYSID_AFTER, /* After DOCTYPE system identifier */
- TSDOCT_BOGUS, /* Bogus DOCTYPE */
-
- TSCDAT, /* CDATA section */
- TSCDAT_BRK, /* CDATA section bracket */
- TSCDAT_END, /* CDATA section end */
-
- TSCREF, /* Character reference */
- TSNCREF, /* Named character reference */
- TSAMAM, /* Ambiguous ampersand */
- TSNUMREF, /* Numeric character reference */
- TSHEXREF_START, /* Hexadecimal character reference start */
- TSDECREF_START, /* Decimal character reference start */
- TSHEXREF, /* Hexadecimal character reference */
- TSDECREF, /* Decimal character reference */
- TSNUMREF_END, /* Numeric character reference end */
-
- TMAX,
-};
-
-void (*tstab[])(void) = {
- [TSDATA] = tsdata,
- [TSRCDT] = tsrcdt,
- [TSRAWT] = tsrawt,
- [TSSCRIPT] = tsscript,
- [TSPTXT] = tsptxt,
- [TSTAG_OPEN] = tstagopen,
- [TSETAG_OPEN] = tsetagopen,
- [TSTAG_NAME] = tstagname,
- [TSRCDT_LESS] = tsrcdtless,
- [TSRCDT_END_OPEN] = tsrcdtendopen,
- [TSRCDT_END_NAME] = tsrcdtendname,
- [TSRAWT_LESS] = tsrawtless,
- [TSRAWT_END_OPEN] = tsrawtendopen,
- [TSSCRIPT_LESS] = tsscriptless,
- [TSSCRIPT_END_OPEN] = tsscriptendopen,
- [TSSCIRPT_END_NAME] = tsscriptendname,
- [TSSCRIPT_ESC_START] = tsscriptesc,
- [TSSCRIPT_ESC_START_DASH] = tsscriptesc,
- [TSSCRIPT_ESC] = tsscriptesc,
- [TSSCRIPT_ESC_DASH] = tsscriptescdash,
- [TSSCRIPT_ESC_DDASH] = tsscriptescddash,
- [TSSCRIPT_ESC_LESS] = tsscriptescless,
- [TSSCRIPT_ESC_END_OPEN] = tsscriptescendopen,
- [TSSCRIPT_ESC_END_NAME] = tsscriptescendname,
- [TSSCRIPT_DESC_START] = tsscriptdescstart,
- [TSSCRIPT_DESC] = tsscriptdesc,
- [TSSCRIPT_DESC_DASH] = tsscriptdescdash,
- [TSSCRIPT_DESC_DDASH] = tsscriptdescddash,
- [TSSCRIPT_DESC_LESS] = tsscriptdescless,
- [TSSCRIPT_DESC_END] = tsscriptdescend,
-
- [TSANAME_BEFORE] = tsanamebefore,
- [TSANAME] = tsaname,
- [TSANAME_AFTER] = nil,
- [TSAVAL_BEFORE] = nil,
- [TSAVAL_DQ] = nil,
- [TSAVAL_SQ] = nil,
- [TSAVAL_UQ] = nil,
- [TSAVAL_AFTER] = nil,
- [TSSCSTAG] = nil,
- [TSBOGUS_COMMENT] = nil,
- [TSMKUP_OPEN] = nil,
- [TSCOMMENT_START] = nil,
- [TSCOMMENT_START_DASH] = nil,
- [TSCOMMENT] = nil,
- [TSCOMMENT_LESS] = nil,
- [TSCOMMENT_LESS_BANG] = nil,
- [TSCOMMENT_LESS_BANG_DASH] = nil,
- [TSCOMMENT_LESS_BANG_DDASH] = nil,
- [TSCOMMENT_END_DASH] = nil,
- [TSCOMMENT_END] = nil,
- [TSCOMMENT_END_BANG] = nil,
- [TSDOCT] = nil,
- [TSDOCT_BEFORE] = nil,
- [TSDOCT_NAME] = nil,
- [TSDOCT_NAME_AFTER] = nil,
- [TSDOCT_PUBK_AFTER] = nil,
- [TSDOCT_PUBID_BEFORE] = nil,
- [TSDOCT_PUBID_DQ] = nil,
- [TSDOCT_PUBID_SQ] = nil,
- [TSDOCT_PUBID_AFTER] = nil,
- [TSDOCT_BETWEEN] = nil,
- [TSDOCT_SYSK_AFTER] = nil,
- [TSDOCT_SYSID_BEFORE] = nil,
- [TSDOCT_SYSID_DQ] = nil,
- [TSDOCT_SYSID_SQ] = nil,
- [TSDOCT_SYSID_AFTER] = nil,
- [TSDOCT_BOGUS] = nil,
- [TSCDAT] = nil,
- [TSCDAT_BRK] = nil,
- [TSCDAT_END] = nil,
- [TSCREF] = nil,
- [TSNCREF] = nil,
- [TSAMAM] = nil,
- [TSNUMREF] = nil,
- [TSHEXREF_START] = nil,
- [TSDECREF_START] = nil,
- [TSHEXREF] = nil,
- [TSDECREF] = nil,
- [TSNUMREF_END] = nil,
-};
-
-int tstate = TSDATA;
-int trstate = -1;
-
-void
-tsanamebefore(void)
-{
- switch (tc) {
- case '\t':
- case '\n':
- case '\r':
- case ' ':
- break;
- case '/':
- case '>':
- case -1:
- treconsume = 1;
- tstate = TSANAME_AFTER;
- break;
- case '=':
- fprint(2, "unexpected equals sign before attribute name parse error\n");
- cattr = tnewattr(ctoken);
- s_nappend(cattr->name, (char*)(&tc), 4);
- tstate = TSANAME;
- break;
- default:
- cattr = tnewattr(ctoken);
- treconsume = 1;
- tstate = TSANAME;
- }
-}
-
-void
-tsaname(void)
-{
- char buf[UTFmax];
- int n, err;
- if (ALPHA(tc) != 0) {
- if (tc < 'a') tc += 0x20;
- }
- switch (tc) {
- case '\t':
- case '\n':
- case '\r':
- case ' ':
- case '/':
- case '>':
- case -1:
- treconsume = 1;
- tstate = TSANAME_AFTER;
- break;
- case '=':
- tstate = TSAVAL_BEFORE;
- break;
- case '\0':
- fprint(2, "unexpected null character parse error\n");
- err = REPCHAR;
- s_nappend(cattr->name, (char *)(&err), 2);
- break;
- case '"':
- case '\'':
- case '<':
- fprint(2, "unexpected character in attribute name parse error\n");
- default:
- n = runetochar(buf, &tc);
- s_nappend(cattr->name, buf, n);
- }
-}
-
-void
-tsanameafter(void)
-{
-}
-
-void
-tsavalbefore(void)
-{
-}
-
-void
-tsavaldq(void)
-{
-}
-
-void
-tsavalsq(void)
-{
-}
-
-void
-tsavaluq(void)
-{
-}
-
-void
-tsavalafter(void)
-{
-}
-
-void
-tsscstag(void)
-{
-}
-
-void
-tsboguscomment(void)
-{
-}
-
-void
-tsmkupopen(void)
-{
-}
-
-void
-tscommentstart(void)
-{
-}
-
-void
-tscommentstartdash(void)
-{
-}
-
-void
-tscomment(void)
-{
-}
-
-void
-tscommentless(void)
-{
-}
-
-void
-tscommentlessbang(void)
-{
-}
-
-void
-tscommentlessbangdash(void)
-{
-}
-
-void
-tscommentlessbangddash(void)
-{
-}
-
-void
-tscommentebddash(void)
-{
-}
-
-void
-tscommentebd(void)
-{
-}
-
-void
-tscommentebdbang(void)
-{
-}
-
-void
-tsdoct(void)
-{
-}
-
-void
-tsdoctbefore(void)
-{
-}
-
-void
-tsdoctname(void)
-{
-}
-
-void
-tsdoctnameafter(void)
-{
-}
-
-void
-tsdoctpubkafter(void)
-{
-}
-
-void
-tsdoctpubidbefore(void)
-{
-}
-
-void
-tsdoctpubiddq(void)
-{
-}
-
-void
-tsdoctpubidsq(void)
-{
-}
-
-void
-tsdoctpubidafter(void)
-{
-}
-
-void
-tsdoctbetween(void)
-{
-}
-
-void
-tsdoctsyskafter(void)
-{
-}
-
-void
-tsdoctsysidbefore(void)
-{
-}
-
-void
-tsdoctsysiddQ(void)
-{
-}
-
-void
-tsdoctsysidSQ(void)
-{
-}
-
-void
-tsdoctsysidafter(void)
-{
-}
-
-void
-tsdoctbogus(void)
-{
-}
-
-void
-tscdat(void)
-{
-}
-
-void
-tscdatbrk(void)
-{
-}
-
-void
-tscdatend(void)
-{
-}
-
-void
-tscref(void)
-{
-}
-
-void
-tsncref(void)
-{
-}
-
-void
-tsamam(void)
-{
-}
-
-void
-tsnumref(void)
-{
-}
-
-void
-tshexrefstart(void)
-{
-}
-
-void
-tsdecrefstart(void)
-{
-}
-
-void
-tshexref(void)
-{
-}
-
-void
-tsdecref(void)
-{
-}
-
-void
-tsnumrefend(void)
-{
-}
-
-void
-tsscriptendname(void)
-{
- if (talpha(1) != 0) return;
- if (1 /* appropriate end tag token */) {
- switch (tc) {
- case '\t':
- case '\n':
- case '\r':
- case ' ':
- tstate = TSANAME_BEFORE;
- break;
- case '/':
- tstate = TSSCSTAG;
- break;
- case '>':
- tstate = TSDATA;
- break;
- }
- } else {
- temit(chartok('<'));
- temit(chartok('/'));
- temitbuf();
- }
-}
-
-
-void
-tsscriptescstart(void)
-{
- if (tc == '-') {
- tstate = TSSCRIPT_ESC_START_DASH;
- temit(chartok('-'));
- } else {
- treconsume = 1;
- tstate = TSSCRIPT;
- }
-}
-
-
-void
-tsscriptescstartdash(void)
-{
- if (tc == '-') {
- tstate = TSSCRIPT_ESC_DDASH;
- temit(chartok('-'));
- } else {
- treconsume = 1;
- tstate = TSSCRIPT;
- }
-}
-
-
-void
-tsscriptesc(void)
-{
- switch (tc) {
- case '-':
- tstate = TSSCRIPT_ESC_DASH;
- temit(chartok('-'));
- break;
- case '<':
- tstate = TSSCRIPT_ESC_LESS;
- break;
- case '\0':
- fprint(2, "unexpected null character parse error\n");
- temit(chartok(REPCHAR));
- break;
- case -1: /* EOF */
- fprint(2, "eof in scipt html comment like text parse error\n");
- temit(eoftok());
- default:
- temit(chartok(tc));
- }
-}
-
-
-void
-tsscriptescdash(void)
-{
- switch (tc) {
- case '-':
- tstate = TSSCRIPT_ESC_DDASH;
- temit(chartok('-'));
- break;
- case '<':
- tstate = TSSCRIPT_ESC_LESS;
- break;
- case '\0':
- fprint(2, "unexpected null character parse error\n");
- tstate = TSSCRIPT_ESC;
- temit(chartok(REPCHAR));
- break;
- case -1:
- fprint(2, "eof in script html comment like text parse error\n");
- temit(eoftok());
- break;
- default:
- tstate = TSSCRIPT_ESC;
- temit(chartok(tc));
- }
-}
-
-
-void
-tsscriptescddash(void)
-{
-
-}
-
-
-void
-tsscriptescless(void)
-{
-}
-
-
-void
-tsscriptescendopen(void)
-{
-}
-
-
-void
-tsscriptescendname(void)
-{
-}
-
-
-void
-tsscriptdescstart(void)
-{
-}
-
-
-void
-tsscriptdesc(void)
-{
-}
-
-
-void
-tsscriptdescdash(void)
-{
-}
-
-
-void
-tsscriptdescddash(void)
-{
-}
-
-
-void
-tsscriptdescless(void)
-{
-}
-
-
-void
-tsscriptdescend(void)
-{
-}
-
-
-
-void
-tsscriptendopen(void)
-{
- if (ALPHA(tc) != 0) {
- treconsume = 1;
- tstate = TSSCIRPT_END_NAME;
- } else {
- temit(chartok('<'));
- temit(chartok('/'));
- treconsume = 1;
- tstate = TSDATA;
- }
-}
-
-void
-tsscriptless(void)
-{
- switch (tc) {
- case '/':
- s_reset(ctempbuf);
- tstate = TSSCRIPT_END_OPEN;
- break;
- case '!':
- tstate = TSSCRIPT_ESC_START;
- temit(chartok('<'));
- temit(chartok('!'));
- break;
- default:
- temit(chartok('<'));
- treconsume = 1;
- tstate = TSSCRIPT;
- }
-}
-
-void
-tsrawtendname(void)
-{
- if (ALPHA(tc) != 0) {
- if (tc < 'a') tc+= 0x20;
-
- } else if (1 /* appropriate end tag token */ ) {
- switch (tc) {
- case '\t':
- case '\n':
- case '\r':
- case ' ':
- tstate = TSANAME_BEFORE;
- break;
- case '/':
- tstate = TSSCSTAG;
- break;
- case '>':
- tstate = TSDATA;
- break;
- }
- } else {
- temit(chartok('<'));
- temit(chartok('/'));
- temitbuf();
- treconsume = 1;
- tstate = TSRAWT;
- }
-}
-
-void
-tsrawtendopen(void)
-{
- if (ALPHA(tc) != 0) {
- //TODO create new end tag token
- treconsume = 1;
- tstate = TSRAWT;
- } else {
- temit(chartok('<'));
- temit(chartok('/'));
- treconsume = 1;
- }
-}
-
-void
-tsrawtless(void)
-{
- if (tc == '/') {
- s_reset(ctempbuf);
- tstate = TSRAWT_END_OPEN;
- } else {
- temit(chartok('<'));
- treconsume = 1;
- }
-}
-
-void
-tsrcdtendname(void)
-{
- if (talpha (1) != 0) return;
- if ( 1 /* appropriate end tag token ??? */) {
- switch (tc) {
- case '\t':
- case '\n':
- case '\r':
- case ' ':
- tstate = TSANAME_BEFORE;
- break;
- case '/':
- tstate = TSSCSTAG;
- break;
- case '>':
- tstate = TSDATA;
- temit(chartok(tc));
- }
- } else {
- temit(chartok('<'));
- temit(chartok('/'));
- temitbuf();
- treconsume = 1;
- tstate = TSRCDT;
- }
-}
-
-void
-tsrcdtendopen(void)
-{
- if (ALPHA(tc) != 0) {
- //TODO create new end tag token
- treconsume = 1;
- tstate = TSRCDT_END_NAME;
- } else {
- treconsume = 1;
- temit(chartok('<'));
- temit(chartok('/'));
- }
-}
-
-void
-tsrcdtless(void)
-{
- switch (tc) {
- case '/':
- s_reset(ctempbuf);
- tstate = TSRCDT_END_OPEN;
- break;
- default:
- treconsume = 1;
- temit(chartok('<'));
- }
-}
-
-void
-tstagname(void)
-{
- uint err;
- err = REPCHAR;
- if (talpha(tc) != 0) return;
- switch (tc) {
- case '\t':
- case '\n':
- case '\r':
- case ' ':
- tstate = TSANAME_BEFORE;
- break;
- case '/':
- tstate = TSSCSTAG;
- break;
- case '>':
- // TODO emit tag
- tstate = TSDATA;
- break;
- case '\0':
- fprint(2, "unexpected null character parse error\n");
- s_nappend(ctoken->name, (char*)&err, 2);
- break;
- case -1:
- fprint(2, "eof in tag parse error\n");
- teof = 1;
- temit(eoftok());
- break;
- }
-}
-
-void
-tsetagopen(void)
-{
- if (ALPHA(tc) != 0) {
- // TODO: create new tag token
- treconsume = 1;
- tstate = TSTAG_NAME;
- } else switch (tc) {
- case '>':
- fprint(2, "missing end tag name parse error\n");
- tstate = TSDATA;
- break;
- case -1:
- fprint(2, "eof before tag name parse error\n");
- temit(chartok('<'));
- teof = 1;
- temit(eoftok());
- break;
- default:
- fprint(2, "invalid first character of tag name parse error\n");
- //TODO: create comment token
- treconsume = 1;
- tstate = TSBOGUS_COMMENT;
- }
-}
-
-void
-tstagopen(void)
-{
- if (ALPHA(tc) != 0) {
- // TODO: create new tag token
- treconsume = 1;
- tstate = TSTAG_NAME;
- } else switch (tc) {
- case '!':
- tstate = TSMKUP_OPEN;
- break;
- case '/':
- tstate = TSETAG_OPEN;
- break;
- case '?':
- fprint(2, "unexpected question mark instead of tag name parse error\n");
- // TODO create comment token
- treconsume = 1;
- tstate = TSBOGUS_COMMENT;
- break;
- case -1:
- fprint(2, "eof before tag name parse error");
- temit(chartok('<'));
- teof = 1;
- temit(eoftok());
- break;
- default:
- fprint(2, "invalid first character of tag name parse error\n");
- temit(chartok('<'));
- treconsume = 1;
- tstate = TSDATA;
- }
-}
-
-void
-tsptxt(void)
-{
- switch (tc) {
- case '\0':
- fprint(2, "unexpected null character parse error\n");
- temit(chartok(REPCHAR));
- break;
- case -1: /* EOF */
- teof = 1;
- temit(eoftok());
- break;
- default:
- temit(chartok(tc));
- }
-}
-
-void
-tsscript(void)
-{
- switch (tc) {
- case '<':
- tstate = TSSCRIPT_LESS;
- break;
- case '\0':
- fprint(2, "unexpected null character parse error\n");
- temit(chartok(REPCHAR));
- break;
- case -1: /* EOF */
- teof = 1;
- temit(eoftok());
- break;
- default:
- temit(chartok(tc));
- }
-}
-
-void
-tsrawt(void)
-{
- switch (tc) {
- case '<':
- tstate = TSRAWT_LESS;
- break;
- case '\0':
- fprint(2, "unexpected null character parse error\n");
- temit(chartok(REPCHAR));
- break;
- case -1: /* EOF */
- teof = 1;
- temit(eoftok());
- break;
- default:
- temit(chartok(tc));
- }
-}
-
-void
-tsrcdt(void)
-{
- switch (tc) {
- case '&':
- trstate = TSRCDT;
- tstate = TSCREF;
- break;
- case '<':
- tstate = TSRCDT_LESS;
- break;
- case '\0':
- fprint(2, "unexpected null character parse error\n");
- temit(chartok(REPCHAR));
- break;
- case -1: /* EOF */
- teof = 1;
- temit(eoftok());
- break;
- default:
- temit(chartok(tc));
- }
-}
-
-void
-tsdata(void)
-{
- switch (tc) {
- case '&':
- trstate = TSDATA;
- tstate = TSCREF;
- break;
- case '<':
- tstate = TSTAG_OPEN;
- break;
- case '\0':
- fprint(2, "unexpected null character parse error\n");
- temit(chartok(tc));
- break;
- case -1: /* EOF */
- teof = 1;
- temit(eoftok());
- break;
- default:
- temit(chartok(tc));
- }
-}
-
-int
-talpha(int tolower)
-{
- char buf[UTFmax];
- int n;
- if (ALPHA(tc) == 0) return 0;
- n = runetochar(buf, &tc);
- s_nappend(ctempbuf, buf, n);
- if ((tolower != 0) && (tc < 'a')) tc+=0x20;
- n = runetochar(buf, &tc);
- s_nappend(ctoken->name, buf, n);
- return 1;
-}
-
-void
-tconsume(void)
-{
- if (treconsume == 0) tc = gc();
- treconsume = 0;
-}
-
-void
-temitbuf(void)
-{
- Rune r;
- char *buf;
- int n, len;
- buf = s_to_c(ctempbuf);
- len = strlen(buf);
- for (n = 0; n < len; n += chartorune(&r, buf+n)){
- temit(chartok(r));
- }
-
-}
-
-void
-temit(Token *t)
-{
- switch (t->type){
- case TCHAR:
- if (t->c == '\n') print("TCHAR \\n\n");
- else print("TCHAR %C\n", t->c);
- break;
- case TEOF:
- print("TEOF\n");
- break;
- default:
- print("TYPE %d\n", t->type);
- }
- t_free(t);
-}
-
-int
-gc(void) /* getchar func name is reserved by stdio.h */
-{
- #define GCBUF 1024
- static char buf[GCBUF], *bp=buf+1;
- static long n = 0;
- if (bp > buf+n-1){
- n = read(0, buf, GCBUF);
- if (n <= 0) return -1;
- bp = buf;
- }
- bp++;
- return *(bp-1);
-}
-
-void
-usage(void)
-{
- fprint(2, "usage: %s [-m /n/dom] [-n 123]\n", argv0);
- threadexitsall("usage");
-}
-
-void
-threadmain(int argc, char **argv)
-{
- //Dir *d;
- ARGBEGIN{
- case 'm':
- drpath = EARGF(usage());
- break;
- case 'n':
- tpath = EARGF(usage());
- default:
- usage();
- } ARGEND;
- if (argc != 0) usage();
- /*
- d = dirstat(drpath);
- if (d==nil) sysfatal("%r");
- if ((d->mode & DMDIR) == 0) sysfatal("%s - not a directory", drpath);
- if (chdir(drpath) == 0) sysfatal("%r");
- if (tpath == nil) {
- char *buf[128];
- long n;
- int fd;
- fd = open("new");
- if (fd < 0) sysfatal("can't open %s/new. %r", drpath);
- n = read(fd, buf, 128);
- if (n <= 0) sysfatal("failed to read from %s/new. %r", drpath);
- tpath = mallocz(n+1);
- memmove(tpath, buf, n);
- close(fd);
- fprint(1, "%s/%s\n", drpath, tpath);
- }
- if (chdir(tpath) == 0) sysfatal("%r");
- */
-
- print("--- START ---\n");
- teof = 0;
- ctempbuf = s_new();
- while(teof == 0){
- if (tstate >= TMAX) {
- fprint(2, "unknown tstate %d\n", tstate);
- break;
- }
- if (tstab[tstate] == nil) {
- fprint(2, "tstate %d not implemented\n", tstate);
- break;
- }
- tconsume();
- tstab[tstate]();
- }
- print("--- OVER ---\n");
-}
diff --git a/html5dom.c b/html5dom.c
@@ -0,0 +1,61 @@
+#include <u.h>
+#include <libc.h>
+#include <String.h>
+#include <thread.h>
+
+#include "html5dom.h"
+
+static char *drpath = "/mnt/dom"; /* directory threadmain enters first; -m replaces this default */
+static char *tpath = nil; /* directory entered second (relative to drpath); set by -n, else read from the file "new" */
+/* Print the option summary on fd 2, then end all threads with exit string "usage". */
+void
+usage(void)
+{
+ fprint(2, "usage: %s [-m /mnt/dom] [-n 123]\n", argv0);
+ threadexitsall("usage");
+}
+
+/*
+ * Program entry point (libthread): change into the working directory
+ * and start the tokenizer and tree-construction threads.
+ *
+ *	-m dir	directory to enter first (default /mnt/dom, the same
+ *		default mount point domfs uses)
+ *	-n name	directory, relative to dir, to work in; when the flag is
+ *		absent the name is read from the file "new" inside dir
+ *
+ * Every failure is reported with sysfatal.  Both threads are handed the
+ * same channel of Token pointers: Tokctl.c and Treeconstrctl.in.
+ */
+void
+threadmain(int argc, char **argv)
+{
+	char buf[128];
+	long n;
+	int fd;
+	Dir *d;
+	Tokctl *tc;
+	Treeconstrctl *trc;
+
+	ARGBEGIN{
+	case 'm':
+		drpath = EARGF(usage());
+		break;
+	case 'n':
+		tpath = EARGF(usage());
+		break;	/* was missing: -n always fell through into usage() */
+	default:
+		usage();
+	} ARGEND;
+	if (argc != 0) usage();
+
+	d = dirstat(drpath);
+	if (d == nil) sysfatal("%r");
+	if ((d->mode & DMDIR) == 0) sysfatal("%s - not a directory", drpath);
+	free(d);	/* dirstat hands back malloc'ed memory */
+	if (chdir(drpath) != 0) sysfatal("can't chdir to %s, %r", drpath);
+	if (tpath == nil) {
+		/* no -n given: the "new" file supplies the directory name */
+		fd = open("new", OREAD);
+		if (fd < 0) sysfatal("can't open %s/new. %r", drpath);
+		n = read(fd, buf, sizeof buf);
+		if (n <= 0) sysfatal("failed to read from %s/new. %r", drpath);
+		close(fd);
+		tpath = mallocz(n+1, 1);	/* zeroed, so the copy is NUL-terminated */
+		if (tpath == nil) sysfatal("malloc: %r");
+		memmove(tpath, buf, n);
+		if (tpath[n-1] == '\n') tpath[n-1] = '\0';
+	}
+	if (chdir(tpath) != 0) sysfatal("can't chdir to %s, %r", tpath);
+
+	tc = malloc(sizeof(Tokctl));
+	trc = malloc(sizeof(Treeconstrctl));
+	if (tc == nil || trc == nil) sysfatal("malloc: %r");
+	tc->c = chancreate(sizeof(Token*), 1024);
+	trc->treeroot = ".";	/* the current directory, i.e. tpath after the chdir above */
+	trc->in = tc->c;
+	threadcreate(threadtokenize, tc, 64 * 1024);
+	threadcreate(threadtreeconstr, trc, 64 * 1024);
+}
diff --git a/html5dom.h b/html5dom.h
@@ -0,0 +1,82 @@
+typedef struct Attr Attr;
+/* A name/value pair, each held in a String; Token.attr points at these. */
+struct Attr{
+ String *name;
+ String *value;
+};
+/* NOTE(review): presumably the values stored in Token.type; names look like DOCTYPE, start tag, end tag, comment, character, end of file - confirm in tok.c. */
+enum { /* Token types */
+ TDOCT,
+ TSTART,
+ TEND,
+ TCOMM,
+ TCHAR,
+ TEOF = -1, /* the only negative value; the others count up from 0 */
+};
+/* Single-bit values; presumably OR-ed into the u64int Token.flags field - confirm in tok.c. */
+enum { /* Token flags */
+ TF_FORCE_QUIRKS = 1, /* bit 0 */
+ TF_SELF_CLOSING = 1 << 1, /* bit 1 */
+};
+
+typedef struct Token Token;
+/* Token record; html5dom.c creates the inter-thread channel with sizeof(Token*) elements. */
+struct Token {
+ int type; /* presumably one of the token types above */
+ u64int flags; /* presumably TF_* bits */
+ Rune c; /* one code point; presumably only meaningful for TCHAR - confirm */
+ String *name;
+ Attr **attr; /* array of Attr pointers; how it is terminated is up to tnewattr (not defined in this header) */
+};
+/* Token/Attr helpers; only declared here, their definitions are in another file. */
+Token* chartok(Rune);
+Token* eoftok(void);
+Token* newtok(int); /* NOTE(review): the int is presumably a token type - confirm */
+void t_free(Token*);
+Attr* tnewattr(Token*);
+void attr_free(Attr*);
+
+/*
+ * Insertion modes, as defined in
+ * https://html.spec.whatwg.org/#the-insertion-mode
+ */
+
+enum {
+ IMinitial = 0, /* zero: unlike every other mode it has no bit of its own, so it cannot be tested with & */
+ IMbefore_html = 1,
+ IMbefore_head = 1 << 1,
+ IMin_head = 1 << 2,
+ IMin_head_noscript = 1 << 3,
+ IMafter_head = 1 << 4,
+ IMin_body = 1 << 5,
+ IMtext = 1 << 6,
+ IMin_table = 1 << 7,
+ IMin_table_text = 1 << 8,
+ IMin_caption = 1 << 9,
+ IMin_column_group = 1 << 10,
+ IMin_table_body = 1 << 11,
+ IMin_row = 1 << 12,
+ IMin_cell = 1 << 13,
+ IMin_select = 1 << 14,
+ IMin_select_in_table = 1 << 15,
+ IMin_template = 1 << 16,
+ IMafter_body = 1 << 17,
+ IMin_frameset = 1 << 18,
+ IMafter_frameset = 1 << 19,
+ IMafter_after_body = 1 << 20,
+ IMafter_after_frameset = 1 << 21, /* highest bit used: 21 */
+};
+/* Argument block that html5dom.c passes to threadtokenize via threadcreate. */
+typedef struct Tokctl Tokctl;
+struct Tokctl {
+ Channel *c; /* created in threadmain with sizeof(Token*) elements, capacity 1024 */
+};
+/* Argument block that html5dom.c passes to threadtreeconstr via threadcreate. */
+typedef struct Treeconstrctl Treeconstrctl;
+struct Treeconstrctl {
+ char *treeroot; /* threadmain sets this to "." after changing into the tree directory */
+ Channel *in; /* threadmain stores the same channel here as in Tokctl.c */
+};
+/* Thread entry points; threadmain starts them with a Tokctl* and a Treeconstrctl* argument respectively. */
+void threadtokenize(void*);
+void threadtreeconstr(void*);
diff --git a/mkfile b/mkfile
@@ -2,11 +2,13 @@
TARG=\
domfs\
- html2dom\
+ html5dom\
-OFILES=\
-# domfs.$O\
-# html2dom.$O\
+HFILES=\
+ html5dom.h\
+ ncref.h\
BIN=/$objtype/bin
</sys/src/cmd/mkmany
+
+$O.html5dom: tok.$O tree.$O
diff --git a/ncref.h b/ncref.h
@@ -0,0 +1,2241 @@
+/*
+ * Ncref is one row of ncreftable: a key string (presumably the name
+ * of a named character reference -- confirm against the lookup code)
+ * and the code point(s) it stands for.  Rows that list a single code
+ * point leave c2 zero; the table ends with a row whose name is nil.
+ */
+typedef struct Ncref Ncref;
+struct Ncref {
+ char *name; /* key string of the row; nil in the terminating row */
+ Rune c; /* first code point */
+ Rune c2; /* second code point, or 0 when the row has only one */
+};
+
+Ncref ncreftable[] = {
+ {"Æ", 198},
+ {"Æ", 198},
+ {"&", 38},
+ {"&", 38},
+ {"Á", 193},
+ {"Á", 193},
+ {"Ă", 258},
+ {"Â", 194},
+ {"Â", 194},
+ {"А", 1040},
+ {"𝔄", 120068},
+ {"À", 192},
+ {"À", 192},
+ {"Α", 913},
+ {"Ā", 256},
+ {"⩓", 10835},
+ {"Ą", 260},
+ {"𝔸", 120120},
+ {"⁡", 8289},
+ {"Å", 197},
+ {"Å", 197},
+ {"𝒜", 119964},
+ {"≔", 8788},
+ {"Ã", 195},
+ {"Ã", 195},
+ {"Ä", 196},
+ {"Ä", 196},
+ {"∖", 8726},
+ {"⫧", 10983},
+ {"⌆", 8966},
+ {"Б", 1041},
+ {"∵", 8757},
+ {"ℬ", 8492},
+ {"Β", 914},
+ {"𝔅", 120069},
+ {"𝔹", 120121},
+ {"˘", 728},
+ {"ℬ", 8492},
+ {"≎", 8782},
+ {"Ч", 1063},
+ {"©", 169},
+ {"©", 169},
+ {"Ć", 262},
+ {"⋒", 8914},
+ {"ⅅ", 8517},
+ {"ℭ", 8493},
+ {"Č", 268},
+ {"Ç", 199},
+ {"Ç", 199},
+ {"Ĉ", 264},
+ {"∰", 8752},
+ {"Ċ", 266},
+ {"¸", 184},
+ {"·", 183},
+ {"ℭ", 8493},
+ {"Χ", 935},
+ {"⊙", 8857},
+ {"⊖", 8854},
+ {"⊕", 8853},
+ {"⊗", 8855},
+ {"∲", 8754},
+ {"”", 8221},
+ {"’", 8217},
+ {"∷", 8759},
+ {"⩴", 10868},
+ {"≡", 8801},
+ {"∯", 8751},
+ {"∮", 8750},
+ {"ℂ", 8450},
+ {"∐", 8720},
+ {"∳", 8755},
+ {"⨯", 10799},
+ {"𝒞", 119966},
+ {"⋓", 8915},
+ {"≍", 8781},
+ {"ⅅ", 8517},
+ {"⤑", 10513},
+ {"Ђ", 1026},
+ {"Ѕ", 1029},
+ {"Џ", 1039},
+ {"‡", 8225},
+ {"↡", 8609},
+ {"⫤", 10980},
+ {"Ď", 270},
+ {"Д", 1044},
+ {"∇", 8711},
+ {"Δ", 916},
+ {"𝔇", 120071},
+ {"´", 180},
+ {"˙", 729},
+ {"˝", 733},
+ {"`", 96},
+ {"˜", 732},
+ {"⋄", 8900},
+ {"ⅆ", 8518},
+ {"𝔻", 120123},
+ {"¨", 168},
+ {"⃜", 8412},
+ {"≐", 8784},
+ {"∯", 8751},
+ {"¨", 168},
+ {"⇓", 8659},
+ {"⇐", 8656},
+ {"⇔", 8660},
+ {"⫤", 10980},
+ {"⟸", 10232},
+ {"⟺", 10234},
+ {"⟹", 10233},
+ {"⇒", 8658},
+ {"⊨", 8872},
+ {"⇑", 8657},
+ {"⇕", 8661},
+ {"∥", 8741},
+ {"↓", 8595},
+ {"⤓", 10515},
+ {"⇵", 8693},
+ {"̑", 785},
+ {"⥐", 10576},
+ {"⥞", 10590},
+ {"↽", 8637},
+ {"⥖", 10582},
+ {"⥟", 10591},
+ {"⇁", 8641},
+ {"⥗", 10583},
+ {"⊤", 8868},
+ {"↧", 8615},
+ {"⇓", 8659},
+ {"𝒟", 119967},
+ {"Đ", 272},
+ {"Ŋ", 330},
+ {"Ð", 208},
+ {"Ð", 208},
+ {"É", 201},
+ {"É", 201},
+ {"Ě", 282},
+ {"Ê", 202},
+ {"Ê", 202},
+ {"Э", 1069},
+ {"Ė", 278},
+ {"𝔈", 120072},
+ {"È", 200},
+ {"È", 200},
+ {"∈", 8712},
+ {"Ē", 274},
+ {"◻", 9723},
+ {"▫", 9643},
+ {"Ę", 280},
+ {"𝔼", 120124},
+ {"Ε", 917},
+ {"⩵", 10869},
+ {"≂", 8770},
+ {"⇌", 8652},
+ {"ℰ", 8496},
+ {"⩳", 10867},
+ {"Η", 919},
+ {"Ë", 203},
+ {"Ë", 203},
+ {"∃", 8707},
+ {"ⅇ", 8519},
+ {"Ф", 1060},
+ {"𝔉", 120073},
+ {"◼", 9724},
+ {"▪", 9642},
+ {"𝔽", 120125},
+ {"∀", 8704},
+ {"ℱ", 8497},
+ {"ℱ", 8497},
+ {"Ѓ", 1027},
+ {">", 62},
+ {">", 62},
+ {"Γ", 915},
+ {"Ϝ", 988},
+ {"Ğ", 286},
+ {"Ģ", 290},
+ {"Ĝ", 284},
+ {"Г", 1043},
+ {"Ġ", 288},
+ {"𝔊", 120074},
+ {"⋙", 8921},
+ {"𝔾", 120126},
+ {"≥", 8805},
+ {"⋛", 8923},
+ {"≧", 8807},
+ {"⪢", 10914},
+ {"≷", 8823},
+ {"⩾", 10878},
+ {"≳", 8819},
+ {"𝒢", 119970},
+ {"≫", 8811},
+ {"Ъ", 1066},
+ {"ˇ", 711},
+ {"^", 94},
+ {"Ĥ", 292},
+ {"ℌ", 8460},
+ {"ℋ", 8459},
+ {"ℍ", 8461},
+ {"─", 9472},
+ {"ℋ", 8459},
+ {"Ħ", 294},
+ {"≎", 8782},
+ {"≏", 8783},
+ {"Е", 1045},
+ {"IJ", 306},
+ {"Ё", 1025},
+ {"Í", 205},
+ {"Í", 205},
+ {"Î", 206},
+ {"Î", 206},
+ {"И", 1048},
+ {"İ", 304},
+ {"ℑ", 8465},
+ {"Ì", 204},
+ {"Ì", 204},
+ {"ℑ", 8465},
+ {"Ī", 298},
+ {"ⅈ", 8520},
+ {"⇒", 8658},
+ {"∬", 8748},
+ {"∫", 8747},
+ {"⋂", 8898},
+ {"⁣", 8291},
+ {"⁢", 8290},
+ {"Į", 302},
+ {"𝕀", 120128},
+ {"Ι", 921},
+ {"ℐ", 8464},
+ {"Ĩ", 296},
+ {"І", 1030},
+ {"Ï", 207},
+ {"Ï", 207},
+ {"Ĵ", 308},
+ {"Й", 1049},
+ {"𝔍", 120077},
+ {"𝕁", 120129},
+ {"𝒥", 119973},
+ {"Ј", 1032},
+ {"Є", 1028},
+ {"Х", 1061},
+ {"Ќ", 1036},
+ {"Κ", 922},
+ {"Ķ", 310},
+ {"К", 1050},
+ {"𝔎", 120078},
+ {"𝕂", 120130},
+ {"𝒦", 119974},
+ {"Љ", 1033},
+ {"<", 60},
+ {"<", 60},
+ {"Ĺ", 313},
+ {"Λ", 923},
+ {"⟪", 10218},
+ {"ℒ", 8466},
+ {"↞", 8606},
+ {"Ľ", 317},
+ {"Ļ", 315},
+ {"Л", 1051},
+ {"⟨", 10216},
+ {"←", 8592},
+ {"⇤", 8676},
+ {"⇆", 8646},
+ {"⌈", 8968},
+ {"⟦", 10214},
+ {"⥡", 10593},
+ {"⇃", 8643},
+ {"⥙", 10585},
+ {"⌊", 8970},
+ {"↔", 8596},
+ {"⥎", 10574},
+ {"⊣", 8867},
+ {"↤", 8612},
+ {"⥚", 10586},
+ {"⊲", 8882},
+ {"⧏", 10703},
+ {"⊴", 8884},
+ {"⥑", 10577},
+ {"⥠", 10592},
+ {"↿", 8639},
+ {"⥘", 10584},
+ {"↼", 8636},
+ {"⥒", 10578},
+ {"⇐", 8656},
+ {"⇔", 8660},
+ {"⋚", 8922},
+ {"≦", 8806},
+ {"≶", 8822},
+ {"⪡", 10913},
+ {"⩽", 10877},
+ {"≲", 8818},
+ {"𝔏", 120079},
+ {"⋘", 8920},
+ {"⇚", 8666},
+ {"Ŀ", 319},
+ {"⟵", 10229},
+ {"⟷", 10231},
+ {"⟶", 10230},
+ {"⟸", 10232},
+ {"⟺", 10234},
+ {"⟹", 10233},
+ {"𝕃", 120131},
+ {"↙", 8601},
+ {"↘", 8600},
+ {"ℒ", 8466},
+ {"↰", 8624},
+ {"Ł", 321},
+ {"≪", 8810},
+ {"⤅", 10501},
+ {"М", 1052},
+ {" ", 8287},
+ {"ℳ", 8499},
+ {"𝔐", 120080},
+ {"∓", 8723},
+ {"𝕄", 120132},
+ {"ℳ", 8499},
+ {"Μ", 924},
+ {"Њ", 1034},
+ {"Ń", 323},
+ {"Ň", 327},
+ {"Ņ", 325},
+ {"Н", 1053},
+ {"​", 8203},
+ {"​", 8203},
+ {"​", 8203},
+ {"​", 8203},
+ {"≫", 8811},
+ {"≪", 8810},
+ {"
", 10},
+ {"𝔑", 120081},
+ {"⁠", 8288},
+ {" ", 160},
+ {"ℕ", 8469},
+ {"⫬", 10988},
+ {"≢", 8802},
+ {"≭", 8813},
+ {"∦", 8742},
+ {"∉", 8713},
+ {"≠", 8800},
+ {"≂̸", 8770, 824},
+ {"∄", 8708},
+ {"≯", 8815},
+ {"≱", 8817},
+ {"≧̸", 8807, 824},
+ {"≫̸", 8811, 824},
+ {"≹", 8825},
+ {"⩾̸", 10878, 824},
+ {"≵", 8821},
+ {"≎̸", 8782, 824},
+ {"≏̸", 8783, 824},
+ {"⋪", 8938},
+ {"⧏̸", 10703, 824},
+ {"⋬", 8940},
+ {"≮", 8814},
+ {"≰", 8816},
+ {"≸", 8824},
+ {"≪̸", 8810, 824},
+ {"⩽̸", 10877, 824},
+ {"≴", 8820},
+ {"⪢̸", 10914, 824},
+ {"⪡̸", 10913, 824},
+ {"⊀", 8832},
+ {"⪯̸", 10927, 824},
+ {"⋠", 8928},
+ {"∌", 8716},
+ {"⋫", 8939},
+ {"⧐̸", 10704, 824},
+ {"⋭", 8941},
+ {"⊏̸", 8847, 824},
+ {"⋢", 8930},
+ {"⊐̸", 8848, 824},
+ {"⋣", 8931},
+ {"⊂⃒", 8834, 8402},
+ {"⊈", 8840},
+ {"⊁", 8833},
+ {"⪰̸", 10928, 824},
+ {"⋡", 8929},
+ {"≿̸", 8831, 824},
+ {"⊃⃒", 8835, 8402},
+ {"⊉", 8841},
+ {"≁", 8769},
+ {"≄", 8772},
+ {"≇", 8775},
+ {"≉", 8777},
+ {"∤", 8740},
+ {"𝒩", 119977},
+ {"Ñ", 209},
+ {"Ñ", 209},
+ {"Ν", 925},
+ {"Œ", 338},
+ {"Ó", 211},
+ {"Ó", 211},
+ {"Ô", 212},
+ {"Ô", 212},
+ {"О", 1054},
+ {"Ő", 336},
+ {"𝔒", 120082},
+ {"Ò", 210},
+ {"Ò", 210},
+ {"Ō", 332},
+ {"Ω", 937},
+ {"Ο", 927},
+ {"𝕆", 120134},
+ {"“", 8220},
+ {"‘", 8216},
+ {"⩔", 10836},
+ {"𝒪", 119978},
+ {"Ø", 216},
+ {"Ø", 216},
+ {"Õ", 213},
+ {"Õ", 213},
+ {"⨷", 10807},
+ {"Ö", 214},
+ {"Ö", 214},
+ {"‾", 8254},
+ {"⏞", 9182},
+ {"⎴", 9140},
+ {"⏜", 9180},
+ {"∂", 8706},
+ {"П", 1055},
+ {"𝔓", 120083},
+ {"Φ", 934},
+ {"Π", 928},
+ {"±", 177},
+ {"ℌ", 8460},
+ {"ℙ", 8473},
+ {"⪻", 10939},
+ {"≺", 8826},
+ {"⪯", 10927},
+ {"≼", 8828},
+ {"≾", 8830},
+ {"″", 8243},
+ {"∏", 8719},
+ {"∷", 8759},
+ {"∝", 8733},
+ {"𝒫", 119979},
+ {"Ψ", 936},
+ {""", 34},
+ {""", 34},
+ {"𝔔", 120084},
+ {"ℚ", 8474},
+ {"𝒬", 119980},
+ {"⤐", 10512},
+ {"®", 174},
+ {"®", 174},
+ {"Ŕ", 340},
+ {"⟫", 10219},
+ {"↠", 8608},
+ {"⤖", 10518},
+ {"Ř", 344},
+ {"Ŗ", 342},
+ {"Р", 1056},
+ {"ℜ", 8476},
+ {"∋", 8715},
+ {"⇋", 8651},
+ {"⥯", 10607},
+ {"ℜ", 8476},
+ {"Ρ", 929},
+ {"⟩", 10217},
+ {"→", 8594},
+ {"⇥", 8677},
+ {"⇄", 8644},
+ {"⌉", 8969},
+ {"⟧", 10215},
+ {"⥝", 10589},
+ {"⇂", 8642},
+ {"⥕", 10581},
+ {"⌋", 8971},
+ {"⊢", 8866},
+ {"↦", 8614},
+ {"⥛", 10587},
+ {"⊳", 8883},
+ {"⧐", 10704},
+ {"⊵", 8885},
+ {"⥏", 10575},
+ {"⥜", 10588},
+ {"↾", 8638},
+ {"⥔", 10580},
+ {"⇀", 8640},
+ {"⥓", 10579},
+ {"⇒", 8658},
+ {"ℝ", 8477},
+ {"⥰", 10608},
+ {"⇛", 8667},
+ {"ℛ", 8475},
+ {"↱", 8625},
+ {"⧴", 10740},
+ {"Щ", 1065},
+ {"Ш", 1064},
+ {"Ь", 1068},
+ {"Ś", 346},
+ {"⪼", 10940},
+ {"Š", 352},
+ {"Ş", 350},
+ {"Ŝ", 348},
+ {"С", 1057},
+ {"𝔖", 120086},
+ {"↓", 8595},
+ {"←", 8592},
+ {"→", 8594},
+ {"↑", 8593},
+ {"Σ", 931},
+ {"∘", 8728},
+ {"𝕊", 120138},
+ {"√", 8730},
+ {"□", 9633},
+ {"⊓", 8851},
+ {"⊏", 8847},
+ {"⊑", 8849},
+ {"⊐", 8848},
+ {"⊒", 8850},
+ {"⊔", 8852},
+ {"𝒮", 119982},
+ {"⋆", 8902},
+ {"⋐", 8912},
+ {"⋐", 8912},
+ {"⊆", 8838},
+ {"≻", 8827},
+ {"⪰", 10928},
+ {"≽", 8829},
+ {"≿", 8831},
+ {"∋", 8715},
+ {"∑", 8721},
+ {"⋑", 8913},
+ {"⊃", 8835},
+ {"⊇", 8839},
+ {"⋑", 8913},
+ {"Þ", 222},
+ {"Þ", 222},
+ {"™", 8482},
+ {"Ћ", 1035},
+ {"Ц", 1062},
+ {"	", 9},
+ {"Τ", 932},
+ {"Ť", 356},
+ {"Ţ", 354},
+ {"Т", 1058},
+ {"𝔗", 120087},
+ {"∴", 8756},
+ {"Θ", 920},
+ {"  ", 8287, 8202},
+ {" ", 8201},
+ {"∼", 8764},
+ {"≃", 8771},
+ {"≅", 8773},
+ {"≈", 8776},
+ {"𝕋", 120139},
+ {"⃛", 8411},
+ {"𝒯", 119983},
+ {"Ŧ", 358},
+ {"Ú", 218},
+ {"Ú", 218},
+ {"↟", 8607},
+ {"⥉", 10569},
+ {"Ў", 1038},
+ {"Ŭ", 364},
+ {"Û", 219},
+ {"Û", 219},
+ {"У", 1059},
+ {"Ű", 368},
+ {"𝔘", 120088},
+ {"Ù", 217},
+ {"Ù", 217},
+ {"Ū", 362},
+ {"_", 95},
+ {"⏟", 9183},
+ {"⎵", 9141},
+ {"⏝", 9181},
+ {"⋃", 8899},
+ {"⊎", 8846},
+ {"Ų", 370},
+ {"𝕌", 120140},
+ {"↑", 8593},
+ {"⤒", 10514},
+ {"⇅", 8645},
+ {"↕", 8597},
+ {"⥮", 10606},
+ {"⊥", 8869},
+ {"↥", 8613},
+ {"⇑", 8657},
+ {"⇕", 8661},
+ {"↖", 8598},
+ {"↗", 8599},
+ {"ϒ", 978},
+ {"Υ", 933},
+ {"Ů", 366},
+ {"𝒰", 119984},
+ {"Ũ", 360},
+ {"Ü", 220},
+ {"Ü", 220},
+ {"⊫", 8875},
+ {"⫫", 10987},
+ {"В", 1042},
+ {"⊩", 8873},
+ {"⫦", 10982},
+ {"⋁", 8897},
+ {"‖", 8214},
+ {"‖", 8214},
+ {"∣", 8739},
+ {"|", 124},
+ {"❘", 10072},
+ {"≀", 8768},
+ {" ", 8202},
+ {"𝔙", 120089},
+ {"𝕍", 120141},
+ {"𝒱", 119985},
+ {"⊪", 8874},
+ {"Ŵ", 372},
+ {"⋀", 8896},
+ {"𝔚", 120090},
+ {"𝕎", 120142},
+ {"𝒲", 119986},
+ {"𝔛", 120091},
+ {"Ξ", 926},
+ {"𝕏", 120143},
+ {"𝒳", 119987},
+ {"Я", 1071},
+ {"Ї", 1031},
+ {"Ю", 1070},
+ {"Ý", 221},
+ {"Ý", 221},
+ {"Ŷ", 374},
+ {"Ы", 1067},
+ {"𝔜", 120092},
+ {"𝕐", 120144},
+ {"𝒴", 119988},
+ {"Ÿ", 376},
+ {"Ж", 1046},
+ {"Ź", 377},
+ {"Ž", 381},
+ {"З", 1047},
+ {"Ż", 379},
+ {"​", 8203},
+ {"Ζ", 918},
+ {"ℨ", 8488},
+ {"ℤ", 8484},
+ {"𝒵", 119989},
+ {"á", 225},
+ {"á", 225},
+ {"ă", 259},
+ {"∾", 8766},
+ {"∾̳", 8766, 819},
+ {"∿", 8767},
+ {"â", 226},
+ {"â", 226},
+ {"´", 180},
+ {"´", 180},
+ {"а", 1072},
+ {"æ", 230},
+ {"æ", 230},
+ {"⁡", 8289},
+ {"𝔞", 120094},
+ {"à", 224},
+ {"à", 224},
+ {"ℵ", 8501},
+ {"ℵ", 8501},
+ {"α", 945},
+ {"ā", 257},
+ {"⨿", 10815},
+ {"&", 38},
+ {"&", 38},
+ {"∧", 8743},
+ {"⩕", 10837},
+ {"⩜", 10844},
+ {"⩘", 10840},
+ {"⩚", 10842},
+ {"∠", 8736},
+ {"⦤", 10660},
+ {"∠", 8736},
+ {"∡", 8737},
+ {"⦨", 10664},
+ {"⦩", 10665},
+ {"⦪", 10666},
+ {"⦫", 10667},
+ {"⦬", 10668},
+ {"⦭", 10669},
+ {"⦮", 10670},
+ {"⦯", 10671},
+ {"∟", 8735},
+ {"⊾", 8894},
+ {"⦝", 10653},
+ {"∢", 8738},
+ {"Å", 197},
+ {"⍼", 9084},
+ {"ą", 261},
+ {"𝕒", 120146},
+ {"≈", 8776},
+ {"⩰", 10864},
+ {"⩯", 10863},
+ {"≊", 8778},
+ {"≋", 8779},
+ {"'", 39},
+ {"≈", 8776},
+ {"≊", 8778},
+ {"å", 229},
+ {"å", 229},
+ {"𝒶", 119990},
+ {"*", 42},
+ {"≈", 8776},
+ {"≍", 8781},
+ {"ã", 227},
+ {"ã", 227},
+ {"ä", 228},
+ {"ä", 228},
+ {"∳", 8755},
+ {"⨑", 10769},
+ {"⫭", 10989},
+ {"≌", 8780},
+ {"϶", 1014},
+ {"‵", 8245},
+ {"∽", 8765},
+ {"⋍", 8909},
+ {"⊽", 8893},
+ {"⌅", 8965},
+ {"⌅", 8965},
+ {"⎵", 9141},
+ {"⎶", 9142},
+ {"≌", 8780},
+ {"б", 1073},
+ {"„", 8222},
+ {"∵", 8757},
+ {"∵", 8757},
+ {"⦰", 10672},
+ {"϶", 1014},
+ {"ℬ", 8492},
+ {"β", 946},
+ {"ℶ", 8502},
+ {"≬", 8812},
+ {"𝔟", 120095},
+ {"⋂", 8898},
+ {"◯", 9711},
+ {"⋃", 8899},
+ {"⨀", 10752},
+ {"⨁", 10753},
+ {"⨂", 10754},
+ {"⨆", 10758},
+ {"★", 9733},
+ {"▽", 9661},
+ {"△", 9651},
+ {"⨄", 10756},
+ {"⋁", 8897},
+ {"⋀", 8896},
+ {"⤍", 10509},
+ {"⧫", 10731},
+ {"▪", 9642},
+ {"▴", 9652},
+ {"▾", 9662},
+ {"◂", 9666},
+ {"▸", 9656},
+ {"␣", 9251},
+ {"▒", 9618},
+ {"░", 9617},
+ {"▓", 9619},
+ {"█", 9608},
+ {"=⃥", 61, 8421},
+ {"≡⃥", 8801, 8421},
+ {"⌐", 8976},
+ {"𝕓", 120147},
+ {"⊥", 8869},
+ {"⊥", 8869},
+ {"⋈", 8904},
+ {"╗", 9559},
+ {"╔", 9556},
+ {"╖", 9558},
+ {"╓", 9555},
+ {"═", 9552},
+ {"╦", 9574},
+ {"╩", 9577},
+ {"╤", 9572},
+ {"╧", 9575},
+ {"╝", 9565},
+ {"╚", 9562},
+ {"╜", 9564},
+ {"╙", 9561},
+ {"║", 9553},
+ {"╬", 9580},
+ {"╣", 9571},
+ {"╠", 9568},
+ {"╫", 9579},
+ {"╢", 9570},
+ {"╟", 9567},
+ {"⧉", 10697},
+ {"╕", 9557},
+ {"╒", 9554},
+ {"┐", 9488},
+ {"┌", 9484},
+ {"─", 9472},
+ {"╥", 9573},
+ {"╨", 9576},
+ {"┬", 9516},
+ {"┴", 9524},
+ {"⊟", 8863},
+ {"⊞", 8862},
+ {"⊠", 8864},
+ {"╛", 9563},
+ {"╘", 9560},
+ {"┘", 9496},
+ {"└", 9492},
+ {"│", 9474},
+ {"╪", 9578},
+ {"╡", 9569},
+ {"╞", 9566},
+ {"┼", 9532},
+ {"┤", 9508},
+ {"├", 9500},
+ {"‵", 8245},
+ {"˘", 728},
+ {"¦", 166},
+ {"¦", 166},
+ {"𝒷", 119991},
+ {"⁏", 8271},
+ {"∽", 8765},
+ {"⋍", 8909},
+ {"\", 92},
+ {"⧅", 10693},
+ {"⟈", 10184},
+ {"•", 8226},
+ {"•", 8226},
+ {"≎", 8782},
+ {"⪮", 10926},
+ {"≏", 8783},
+ {"≏", 8783},
+ {"ć", 263},
+ {"∩", 8745},
+ {"⩄", 10820},
+ {"⩉", 10825},
+ {"⩋", 10827},
+ {"⩇", 10823},
+ {"⩀", 10816},
+ {"∩︀", 8745, 65024},
+ {"⁁", 8257},
+ {"ˇ", 711},
+ {"⩍", 10829},
+ {"č", 269},
+ {"ç", 231},
+ {"ç", 231},
+ {"ĉ", 265},
+ {"⩌", 10828},
+ {"⩐", 10832},
+ {"ċ", 267},
+ {"¸", 184},
+ {"¸", 184},
+ {"⦲", 10674},
+ {"¢", 162},
+ {"¢", 162},
+ {"·", 183},
+ {"𝔠", 120096},
+ {"ч", 1095},
+ {"✓", 10003},
+ {"✓", 10003},
+ {"χ", 967},
+ {"○", 9675},
+ {"⧃", 10691},
+ {"ˆ", 710},
+ {"≗", 8791},
+ {"↺", 8634},
+ {"↻", 8635},
+ {"®", 174},
+ {"Ⓢ", 9416},
+ {"⊛", 8859},
+ {"⊚", 8858},
+ {"⊝", 8861},
+ {"≗", 8791},
+ {"⨐", 10768},
+ {"⫯", 10991},
+ {"⧂", 10690},
+ {"♣", 9827},
+ {"♣", 9827},
+ {":", 58},
+ {"≔", 8788},
+ {"≔", 8788},
+ {",", 44},
+ {"@", 64},
+ {"∁", 8705},
+ {"∘", 8728},
+ {"∁", 8705},
+ {"ℂ", 8450},
+ {"≅", 8773},
+ {"⩭", 10861},
+ {"∮", 8750},
+ {"𝕔", 120148},
+ {"∐", 8720},
+ {"©", 169},
+ {"©", 169},
+ {"℗", 8471},
+ {"↵", 8629},
+ {"✗", 10007},
+ {"𝒸", 119992},
+ {"⫏", 10959},
+ {"⫑", 10961},
+ {"⫐", 10960},
+ {"⫒", 10962},
+ {"⋯", 8943},
+ {"⤸", 10552},
+ {"⤵", 10549},
+ {"⋞", 8926},
+ {"⋟", 8927},
+ {"↶", 8630},
+ {"⤽", 10557},
+ {"∪", 8746},
+ {"⩈", 10824},
+ {"⩆", 10822},
+ {"⩊", 10826},
+ {"⊍", 8845},
+ {"⩅", 10821},
+ {"∪︀", 8746, 65024},
+ {"↷", 8631},
+ {"⤼", 10556},
+ {"⋞", 8926},
+ {"⋟", 8927},
+ {"⋎", 8910},
+ {"⋏", 8911},
+ {"¤", 164},
+ {"¤", 164},
+ {"↶", 8630},
+ {"↷", 8631},
+ {"⋎", 8910},
+ {"⋏", 8911},
+ {"∲", 8754},
+ {"∱", 8753},
+ {"⌭", 9005},
+ {"⇓", 8659},
+ {"⥥", 10597},
+ {"†", 8224},
+ {"ℸ", 8504},
+ {"↓", 8595},
+ {"‐", 8208},
+ {"⊣", 8867},
+ {"⤏", 10511},
+ {"˝", 733},
+ {"ď", 271},
+ {"д", 1076},
+ {"ⅆ", 8518},
+ {"‡", 8225},
+ {"⇊", 8650},
+ {"⩷", 10871},
+ {"°", 176},
+ {"°", 176},
+ {"δ", 948},
+ {"⦱", 10673},
+ {"⥿", 10623},
+ {"𝔡", 120097},
+ {"⇃", 8643},
+ {"⇂", 8642},
+ {"⋄", 8900},
+ {"⋄", 8900},
+ {"♦", 9830},
+ {"♦", 9830},
+ {"¨", 168},
+ {"ϝ", 989},
+ {"⋲", 8946},
+ {"÷", 247},
+ {"÷", 247},
+ {"÷", 247},
+ {"⋇", 8903},
+ {"⋇", 8903},
+ {"ђ", 1106},
+ {"⌞", 8990},
+ {"⌍", 8973},
+ {"$", 36},
+ {"𝕕", 120149},
+ {"˙", 729},
+ {"≐", 8784},
+ {"≑", 8785},
+ {"∸", 8760},
+ {"∔", 8724},
+ {"⊡", 8865},
+ {"⌆", 8966},
+ {"↓", 8595},
+ {"⇊", 8650},
+ {"⇃", 8643},
+ {"⇂", 8642},
+ {"⤐", 10512},
+ {"⌟", 8991},
+ {"⌌", 8972},
+ {"𝒹", 119993},
+ {"ѕ", 1109},
+ {"⧶", 10742},
+ {"đ", 273},
+ {"⋱", 8945},
+ {"▿", 9663},
+ {"▾", 9662},
+ {"⇵", 8693},
+ {"⥯", 10607},
+ {"⦦", 10662},
+ {"џ", 1119},
+ {"⟿", 10239},
+ {"⩷", 10871},
+ {"≑", 8785},
+ {"é", 233},
+ {"é", 233},
+ {"⩮", 10862},
+ {"ě", 283},
+ {"≖", 8790},
+ {"ê", 234},
+ {"ê", 234},
+ {"≕", 8789},
+ {"э", 1101},
+ {"ė", 279},
+ {"ⅇ", 8519},
+ {"≒", 8786},
+ {"𝔢", 120098},
+ {"⪚", 10906},
+ {"è", 232},
+ {"è", 232},
+ {"⪖", 10902},
+ {"⪘", 10904},
+ {"⪙", 10905},
+ {"⏧", 9191},
+ {"ℓ", 8467},
+ {"⪕", 10901},
+ {"⪗", 10903},
+ {"ē", 275},
+ {"∅", 8709},
+ {"∅", 8709},
+ {"∅", 8709},
+ {" ", 8196},
+ {" ", 8197},
+ {" ", 8195},
+ {"ŋ", 331},
+ {" ", 8194},
+ {"ę", 281},
+ {"𝕖", 120150},
+ {"⋕", 8917},
+ {"⧣", 10723},
+ {"⩱", 10865},
+ {"ε", 949},
+ {"ε", 949},
+ {"ϵ", 1013},
+ {"≖", 8790},
+ {"≕", 8789},
+ {"≂", 8770},
+ {"⪖", 10902},
+ {"⪕", 10901},
+ {"=", 61},
+ {"≟", 8799},
+ {"≡", 8801},
+ {"⩸", 10872},
+ {"⧥", 10725},
+ {"≓", 8787},
+ {"⥱", 10609},
+ {"ℯ", 8495},
+ {"≐", 8784},
+ {"≂", 8770},
+ {"η", 951},
+ {"ð", 240},
+ {"ð", 240},
+ {"ë", 235},
+ {"ë", 235},
+ {"€", 8364},
+ {"!", 33},
+ {"∃", 8707},
+ {"ℰ", 8496},
+ {"ⅇ", 8519},
+ {"≒", 8786},
+ {"ф", 1092},
+ {"♀", 9792},
+ {"ffi", 64259},
+ {"ff", 64256},
+ {"ffl", 64260},
+ {"𝔣", 120099},
+ {"fi", 64257},
+ {"fj", 102, 106},
+ {"♭", 9837},
+ {"fl", 64258},
+ {"▱", 9649},
+ {"ƒ", 402},
+ {"𝕗", 120151},
+ {"∀", 8704},
+ {"⋔", 8916},
+ {"⫙", 10969},
+ {"⨍", 10765},
+ {"½", 189},
+ {"½", 189},
+ {"⅓", 8531},
+ {"¼", 188},
+ {"¼", 188},
+ {"⅕", 8533},
+ {"⅙", 8537},
+ {"⅛", 8539},
+ {"⅔", 8532},
+ {"⅖", 8534},
+ {"¾", 190},
+ {"¾", 190},
+ {"⅗", 8535},
+ {"⅜", 8540},
+ {"⅘", 8536},
+ {"⅚", 8538},
+ {"⅝", 8541},
+ {"⅞", 8542},
+ {"⁄", 8260},
+ {"⌢", 8994},
+ {"𝒻", 119995},
+ {"≧", 8807},
+ {"⪌", 10892},
+ {"ǵ", 501},
+ {"γ", 947},
+ {"ϝ", 989},
+ {"⪆", 10886},
+ {"ğ", 287},
+ {"ĝ", 285},
+ {"г", 1075},
+ {"ġ", 289},
+ {"≥", 8805},
+ {"⋛", 8923},
+ {"≥", 8805},
+ {"≧", 8807},
+ {"⩾", 10878},
+ {"⩾", 10878},
+ {"⪩", 10921},
+ {"⪀", 10880},
+ {"⪂", 10882},
+ {"⪄", 10884},
+ {"⋛︀", 8923, 65024},
+ {"⪔", 10900},
+ {"𝔤", 120100},
+ {"≫", 8811},
+ {"⋙", 8921},
+ {"ℷ", 8503},
+ {"ѓ", 1107},
+ {"≷", 8823},
+ {"⪒", 10898},
+ {"⪥", 10917},
+ {"⪤", 10916},
+ {"≩", 8809},
+ {"⪊", 10890},
+ {"⪊", 10890},
+ {"⪈", 10888},
+ {"⪈", 10888},
+ {"≩", 8809},
+ {"⋧", 8935},
+ {"𝕘", 120152},
+ {"`", 96},
+ {"ℊ", 8458},
+ {"≳", 8819},
+ {"⪎", 10894},
+ {"⪐", 10896},
+ {">", 62},
+ {">", 62},
+ {"⪧", 10919},
+ {"⩺", 10874},
+ {"⋗", 8919},
+ {"⦕", 10645},
+ {"⩼", 10876},
+ {"⪆", 10886},
+ {"⥸", 10616},
+ {"⋗", 8919},
+ {"⋛", 8923},
+ {"⪌", 10892},
+ {"≷", 8823},
+ {"≳", 8819},
+ {"≩︀", 8809, 65024},
+ {"≩︀", 8809, 65024},
+ {"⇔", 8660},
+ {" ", 8202},
+ {"½", 189},
+ {"ℋ", 8459},
+ {"ъ", 1098},
+ {"↔", 8596},
+ {"⥈", 10568},
+ {"↭", 8621},
+ {"ℏ", 8463},
+ {"ĥ", 293},
+ {"♥", 9829},
+ {"♥", 9829},
+ {"…", 8230},
+ {"⊹", 8889},
+ {"𝔥", 120101},
+ {"⤥", 10533},
+ {"⤦", 10534},
+ {"⇿", 8703},
+ {"∻", 8763},
+ {"↩", 8617},
+ {"↪", 8618},
+ {"𝕙", 120153},
+ {"―", 8213},
+ {"𝒽", 119997},
+ {"ℏ", 8463},
+ {"ħ", 295},
+ {"⁃", 8259},
+ {"‐", 8208},
+ {"í", 237},
+ {"í", 237},
+ {"⁣", 8291},
+ {"î", 238},
+ {"î", 238},
+ {"и", 1080},
+ {"е", 1077},
+ {"¡", 161},
+ {"¡", 161},
+ {"⇔", 8660},
+ {"𝔦", 120102},
+ {"ì", 236},
+ {"ì", 236},
+ {"ⅈ", 8520},
+ {"⨌", 10764},
+ {"∭", 8749},
+ {"⧜", 10716},
+ {"℩", 8489},
+ {"ij", 307},
+ {"ī", 299},
+ {"ℑ", 8465},
+ {"ℐ", 8464},
+ {"ℑ", 8465},
+ {"ı", 305},
+ {"⊷", 8887},
+ {"Ƶ", 437},
+ {"∈", 8712},
+ {"℅", 8453},
+ {"∞", 8734},
+ {"⧝", 10717},
+ {"ı", 305},
+ {"∫", 8747},
+ {"⊺", 8890},
+ {"ℤ", 8484},
+ {"⊺", 8890},
+ {"⨗", 10775},
+ {"⨼", 10812},
+ {"ё", 1105},
+ {"į", 303},
+ {"𝕚", 120154},
+ {"ι", 953},
+ {"⨼", 10812},
+ {"¿", 191},
+ {"¿", 191},
+ {"𝒾", 119998},
+ {"∈", 8712},
+ {"⋹", 8953},
+ {"⋵", 8949},
+ {"⋴", 8948},
+ {"⋳", 8947},
+ {"∈", 8712},
+ {"⁢", 8290},
+ {"ĩ", 297},
+ {"і", 1110},
+ {"ï", 239},
+ {"ï", 239},
+ {"ĵ", 309},
+ {"й", 1081},
+ {"𝔧", 120103},
+ {"ȷ", 567},
+ {"𝕛", 120155},
+ {"𝒿", 119999},
+ {"ј", 1112},
+ {"є", 1108},
+ {"κ", 954},
+ {"ϰ", 1008},
+ {"ķ", 311},
+ {"к", 1082},
+ {"𝔨", 120104},
+ {"ĸ", 312},
+ {"х", 1093},
+ {"ќ", 1116},
+ {"𝕜", 120156},
+ {"𝓀", 120000},
+ {"⇚", 8666},
+ {"⇐", 8656},
+ {"⤛", 10523},
+ {"⤎", 10510},
+ {"≦", 8806},
+ {"⪋", 10891},
+ {"⥢", 10594},
+ {"ĺ", 314},
+ {"⦴", 10676},
+ {"ℒ", 8466},
+ {"λ", 955},
+ {"⟨", 10216},
+ {"⦑", 10641},
+ {"⟨", 10216},
+ {"⪅", 10885},
+ {"«", 171},
+ {"«", 171},
+ {"←", 8592},
+ {"⇤", 8676},
+ {"⤟", 10527},
+ {"⤝", 10525},
+ {"↩", 8617},
+ {"↫", 8619},
+ {"⤹", 10553},
+ {"⥳", 10611},
+ {"↢", 8610},
+ {"⪫", 10923},
+ {"⤙", 10521},
+ {"⪭", 10925},
+ {"⪭︀", 10925, 65024},
+ {"⤌", 10508},
+ {"❲", 10098},
+ {"{", 123},
+ {"[", 91},
+ {"⦋", 10635},
+ {"⦏", 10639},
+ {"⦍", 10637},
+ {"ľ", 318},
+ {"ļ", 316},
+ {"⌈", 8968},
+ {"{", 123},
+ {"л", 1083},
+ {"⤶", 10550},
+ {"“", 8220},
+ {"„", 8222},
+ {"⥧", 10599},
+ {"⥋", 10571},
+ {"↲", 8626},
+ {"≤", 8804},
+ {"←", 8592},
+ {"↢", 8610},
+ {"↽", 8637},
+ {"↼", 8636},
+ {"⇇", 8647},
+ {"↔", 8596},
+ {"⇆", 8646},
+ {"⇋", 8651},
+ {"↭", 8621},
+ {"⋋", 8907},
+ {"⋚", 8922},
+ {"≤", 8804},
+ {"≦", 8806},
+ {"⩽", 10877},
+ {"⩽", 10877},
+ {"⪨", 10920},
+ {"⩿", 10879},
+ {"⪁", 10881},
+ {"⪃", 10883},
+ {"⋚︀", 8922, 65024},
+ {"⪓", 10899},
+ {"⪅", 10885},
+ {"⋖", 8918},
+ {"⋚", 8922},
+ {"⪋", 10891},
+ {"≶", 8822},
+ {"≲", 8818},
+ {"⥼", 10620},
+ {"⌊", 8970},
+ {"𝔩", 120105},
+ {"≶", 8822},
+ {"⪑", 10897},
+ {"↽", 8637},
+ {"↼", 8636},
+ {"⥪", 10602},
+ {"▄", 9604},
+ {"љ", 1113},
+ {"≪", 8810},
+ {"⇇", 8647},
+ {"⌞", 8990},
+ {"⥫", 10603},
+ {"◺", 9722},
+ {"ŀ", 320},
+ {"⎰", 9136},
+ {"⎰", 9136},
+ {"≨", 8808},
+ {"⪉", 10889},
+ {"⪉", 10889},
+ {"⪇", 10887},
+ {"⪇", 10887},
+ {"≨", 8808},
+ {"⋦", 8934},
+ {"⟬", 10220},
+ {"⇽", 8701},
+ {"⟦", 10214},
+ {"⟵", 10229},
+ {"⟷", 10231},
+ {"⟼", 10236},
+ {"⟶", 10230},
+ {"↫", 8619},
+ {"↬", 8620},
+ {"⦅", 10629},
+ {"𝕝", 120157},
+ {"⨭", 10797},
+ {"⨴", 10804},
+ {"∗", 8727},
+ {"_", 95},
+ {"◊", 9674},
+ {"◊", 9674},
+ {"⧫", 10731},
+ {"(", 40},
+ {"⦓", 10643},
+ {"⇆", 8646},
+ {"⌟", 8991},
+ {"⇋", 8651},
+ {"⥭", 10605},
+ {"‎", 8206},
+ {"⊿", 8895},
+ {"‹", 8249},
+ {"𝓁", 120001},
+ {"↰", 8624},
+ {"≲", 8818},
+ {"⪍", 10893},
+ {"⪏", 10895},
+ {"[", 91},
+ {"‘", 8216},
+ {"‚", 8218},
+ {"ł", 322},
+ {"<", 60},
+ {"<", 60},
+ {"⪦", 10918},
+ {"⩹", 10873},
+ {"⋖", 8918},
+ {"⋋", 8907},
+ {"⋉", 8905},
+ {"⥶", 10614},
+ {"⩻", 10875},
+ {"⦖", 10646},
+ {"◃", 9667},
+ {"⊴", 8884},
+ {"◂", 9666},
+ {"⥊", 10570},
+ {"⥦", 10598},
+ {"≨︀", 8808, 65024},
+ {"≨︀", 8808, 65024},
+ {"∺", 8762},
+ {"¯", 175},
+ {"¯", 175},
+ {"♂", 9794},
+ {"✠", 10016},
+ {"✠", 10016},
+ {"↦", 8614},
+ {"↦", 8614},
+ {"↧", 8615},
+ {"↤", 8612},
+ {"↥", 8613},
+ {"▮", 9646},
+ {"⨩", 10793},
+ {"м", 1084},
+ {"—", 8212},
+ {"∡", 8737},
+ {"𝔪", 120106},
+ {"℧", 8487},
+ {"µ", 181},
+ {"µ", 181},
+ {"∣", 8739},
+ {"*", 42},
+ {"⫰", 10992},
+ {"·", 183},
+ {"·", 183},
+ {"−", 8722},
+ {"⊟", 8863},
+ {"∸", 8760},
+ {"⨪", 10794},
+ {"⫛", 10971},
+ {"…", 8230},
+ {"∓", 8723},
+ {"⊧", 8871},
+ {"𝕞", 120158},
+ {"∓", 8723},
+ {"𝓂", 120002},
+ {"∾", 8766},
+ {"μ", 956},
+ {"⊸", 8888},
+ {"⊸", 8888},
+ {"⋙̸", 8921, 824},
+ {"≫⃒", 8811, 8402},
+ {"≫̸", 8811, 824},
+ {"⇍", 8653},
+ {"⇎", 8654},
+ {"⋘̸", 8920, 824},
+ {"≪⃒", 8810, 8402},
+ {"≪̸", 8810, 824},
+ {"⇏", 8655},
+ {"⊯", 8879},
+ {"⊮", 8878},
+ {"∇", 8711},
+ {"ń", 324},
+ {"∠⃒", 8736, 8402},
+ {"≉", 8777},
+ {"⩰̸", 10864, 824},
+ {"≋̸", 8779, 824},
+ {"ʼn", 329},
+ {"≉", 8777},
+ {"♮", 9838},
+ {"♮", 9838},
+ {"ℕ", 8469},
+ {" ", 160},
+ {" ", 160},
+ {"≎̸", 8782, 824},
+ {"≏̸", 8783, 824},
+ {"⩃", 10819},
+ {"ň", 328},
+ {"ņ", 326},
+ {"≇", 8775},
+ {"⩭̸", 10861, 824},
+ {"⩂", 10818},
+ {"н", 1085},
+ {"–", 8211},
+ {"≠", 8800},
+ {"⇗", 8663},
+ {"⤤", 10532},
+ {"↗", 8599},
+ {"↗", 8599},
+ {"≐̸", 8784, 824},
+ {"≢", 8802},
+ {"⤨", 10536},
+ {"≂̸", 8770, 824},
+ {"∄", 8708},
+ {"∄", 8708},
+ {"𝔫", 120107},
+ {"≧̸", 8807, 824},
+ {"≱", 8817},
+ {"≱", 8817},
+ {"≧̸", 8807, 824},
+ {"⩾̸", 10878, 824},
+ {"⩾̸", 10878, 824},
+ {"≵", 8821},
+ {"≯", 8815},
+ {"≯", 8815},
+ {"⇎", 8654},
+ {"↮", 8622},
+ {"⫲", 10994},
+ {"∋", 8715},
+ {"⋼", 8956},
+ {"⋺", 8954},
+ {"∋", 8715},
+ {"њ", 1114},
+ {"⇍", 8653},
+ {"≦̸", 8806, 824},
+ {"↚", 8602},
+ {"‥", 8229},
+ {"≰", 8816},
+ {"↚", 8602},
+ {"↮", 8622},
+ {"≰", 8816},
+ {"≦̸", 8806, 824},
+ {"⩽̸", 10877, 824},
+ {"⩽̸", 10877, 824},
+ {"≮", 8814},
+ {"≴", 8820},
+ {"≮", 8814},
+ {"⋪", 8938},
+ {"⋬", 8940},
+ {"∤", 8740},
+ {"𝕟", 120159},
+ {"¬", 172},
+ {"¬", 172},
+ {"∉", 8713},
+ {"⋹̸", 8953, 824},
+ {"⋵̸", 8949, 824},
+ {"∉", 8713},
+ {"⋷", 8951},
+ {"⋶", 8950},
+ {"∌", 8716},
+ {"∌", 8716},
+ {"⋾", 8958},
+ {"⋽", 8957},
+ {"∦", 8742},
+ {"∦", 8742},
+ {"⫽⃥", 11005, 8421},
+ {"∂̸", 8706, 824},
+ {"⨔", 10772},
+ {"⊀", 8832},
+ {"⋠", 8928},
+ {"⪯̸", 10927, 824},
+ {"⊀", 8832},
+ {"⪯̸", 10927, 824},
+ {"⇏", 8655},
+ {"↛", 8603},
+ {"⤳̸", 10547, 824},
+ {"↝̸", 8605, 824},
+ {"↛", 8603},
+ {"⋫", 8939},
+ {"⋭", 8941},
+ {"⊁", 8833},
+ {"⋡", 8929},
+ {"⪰̸", 10928, 824},
+ {"𝓃", 120003},
+ {"∤", 8740},
+ {"∦", 8742},
+ {"≁", 8769},
+ {"≄", 8772},
+ {"≄", 8772},
+ {"∤", 8740},
+ {"∦", 8742},
+ {"⋢", 8930},
+ {"⋣", 8931},
+ {"⊄", 8836},
+ {"⫅̸", 10949, 824},
+ {"⊈", 8840},
+ {"⊂⃒", 8834, 8402},
+ {"⊈", 8840},
+ {"⫅̸", 10949, 824},
+ {"⊁", 8833},
+ {"⪰̸", 10928, 824},
+ {"⊅", 8837},
+ {"⫆̸", 10950, 824},
+ {"⊉", 8841},
+ {"⊃⃒", 8835, 8402},
+ {"⊉", 8841},
+ {"⫆̸", 10950, 824},
+ {"≹", 8825},
+ {"ñ", 241},
+ {"ñ", 241},
+ {"≸", 8824},
+ {"⋪", 8938},
+ {"⋬", 8940},
+ {"⋫", 8939},
+ {"⋭", 8941},
+ {"ν", 957},
+ {"#", 35},
+ {"№", 8470},
+ {" ", 8199},
+ {"⊭", 8877},
+ {"⤄", 10500},
+ {"≍⃒", 8781, 8402},
+ {"⊬", 8876},
+ {"≥⃒", 8805, 8402},
+ {">⃒", 62, 8402},
+ {"⧞", 10718},
+ {"⤂", 10498},
+ {"≤⃒", 8804, 8402},
+ {"<⃒", 60, 8402},
+ {"⊴⃒", 8884, 8402},
+ {"⤃", 10499},
+ {"⊵⃒", 8885, 8402},
+ {"∼⃒", 8764, 8402},
+ {"⇖", 8662},
+ {"⤣", 10531},
+ {"↖", 8598},
+ {"↖", 8598},
+ {"⤧", 10535},
+ {"Ⓢ", 9416},
+ {"ó", 243},
+ {"ó", 243},
+ {"⊛", 8859},
+ {"⊚", 8858},
+ {"ô", 244},
+ {"ô", 244},
+ {"о", 1086},
+ {"⊝", 8861},
+ {"ő", 337},
+ {"⨸", 10808},
+ {"⊙", 8857},
+ {"⦼", 10684},
+ {"œ", 339},
+ {"⦿", 10687},
+ {"𝔬", 120108},
+ {"˛", 731},
+ {"ò", 242},
+ {"ò", 242},
+ {"⧁", 10689},
+ {"⦵", 10677},
+ {"Ω", 937},
+ {"∮", 8750},
+ {"↺", 8634},
+ {"⦾", 10686},
+ {"⦻", 10683},
+ {"‾", 8254},
+ {"⧀", 10688},
+ {"ō", 333},
+ {"ω", 969},
+ {"ο", 959},
+ {"⦶", 10678},
+ {"⊖", 8854},
+ {"𝕠", 120160},
+ {"⦷", 10679},
+ {"⦹", 10681},
+ {"⊕", 8853},
+ {"∨", 8744},
+ {"↻", 8635},
+ {"⩝", 10845},
+ {"ℴ", 8500},
+ {"ℴ", 8500},
+ {"ª", 170},
+ {"ª", 170},
+ {"º", 186},
+ {"º", 186},
+ {"⊶", 8886},
+ {"⩖", 10838},
+ {"⩗", 10839},
+ {"⩛", 10843},
+ {"ℴ", 8500},
+ {"ø", 248},
+ {"ø", 248},
+ {"⊘", 8856},
+ {"õ", 245},
+ {"õ", 245},
+ {"⊗", 8855},
+ {"⨶", 10806},
+ {"ö", 246},
+ {"ö", 246},
+ {"⌽", 9021},
+ {"∥", 8741},
+ {"¶", 182},
+ {"¶", 182},
+ {"∥", 8741},
+ {"⫳", 10995},
+ {"⫽", 11005},
+ {"∂", 8706},
+ {"п", 1087},
+ {"%", 37},
+ {".", 46},
+ {"‰", 8240},
+ {"⊥", 8869},
+ {"‱", 8241},
+ {"𝔭", 120109},
+ {"φ", 966},
+ {"ϕ", 981},
+ {"ℳ", 8499},
+ {"☎", 9742},
+ {"π", 960},
+ {"⋔", 8916},
+ {"ϖ", 982},
+ {"ℏ", 8463},
+ {"ℎ", 8462},
+ {"ℏ", 8463},
+ {"+", 43},
+ {"⨣", 10787},
+ {"⊞", 8862},
+ {"⨢", 10786},
+ {"∔", 8724},
+ {"⨥", 10789},
+ {"⩲", 10866},
+ {"±", 177},
+ {"±", 177},
+ {"⨦", 10790},
+ {"⨧", 10791},
+ {"±", 177},
+ {"⨕", 10773},
+ {"𝕡", 120161},
+ {"£", 163},
+ {"£", 163},
+ {"≺", 8826},
+ {"⪳", 10931},
+ {"⪷", 10935},
+ {"≼", 8828},
+ {"⪯", 10927},
+ {"≺", 8826},
+ {"⪷", 10935},
+ {"≼", 8828},
+ {"⪯", 10927},
+ {"⪹", 10937},
+ {"⪵", 10933},
+ {"⋨", 8936},
+ {"≾", 8830},
+ {"′", 8242},
+ {"ℙ", 8473},
+ {"⪵", 10933},
+ {"⪹", 10937},
+ {"⋨", 8936},
+ {"∏", 8719},
+ {"⌮", 9006},
+ {"⌒", 8978},
+ {"⌓", 8979},
+ {"∝", 8733},
+ {"∝", 8733},
+ {"≾", 8830},
+ {"⊰", 8880},
+ {"𝓅", 120005},
+ {"ψ", 968},
+ {" ", 8200},
+ {"𝔮", 120110},
+ {"⨌", 10764},
+ {"𝕢", 120162},
+ {"⁗", 8279},
+ {"𝓆", 120006},
+ {"ℍ", 8461},
+ {"⨖", 10774},
+ {"?", 63},
+ {"≟", 8799},
+ {""", 34},
+ {""", 34},
+ {"⇛", 8667},
+ {"⇒", 8658},
+ {"⤜", 10524},
+ {"⤏", 10511},
+ {"⥤", 10596},
+ {"∽̱", 8765, 817},
+ {"ŕ", 341},
+ {"√", 8730},
+ {"⦳", 10675},
+ {"⟩", 10217},
+ {"⦒", 10642},
+ {"⦥", 10661},
+ {"⟩", 10217},
+ {"»", 187},
+ {"»", 187},
+ {"→", 8594},
+ {"⥵", 10613},
+ {"⇥", 8677},
+ {"⤠", 10528},
+ {"⤳", 10547},
+ {"⤞", 10526},
+ {"↪", 8618},
+ {"↬", 8620},
+ {"⥅", 10565},
+ {"⥴", 10612},
+ {"↣", 8611},
+ {"↝", 8605},
+ {"⤚", 10522},
+ {"∶", 8758},
+ {"ℚ", 8474},
+ {"⤍", 10509},
+ {"❳", 10099},
+ {"}", 125},
+ {"]", 93},
+ {"⦌", 10636},
+ {"⦎", 10638},
+ {"⦐", 10640},
+ {"ř", 345},
+ {"ŗ", 343},
+ {"⌉", 8969},
+ {"}", 125},
+ {"р", 1088},
+ {"⤷", 10551},
+ {"⥩", 10601},
+ {"”", 8221},
+ {"”", 8221},
+ {"↳", 8627},
+ {"ℜ", 8476},
+ {"ℛ", 8475},
+ {"ℜ", 8476},
+ {"ℝ", 8477},
+ {"▭", 9645},
+ {"®", 174},
+ {"®", 174},
+ {"⥽", 10621},
+ {"⌋", 8971},
+ {"𝔯", 120111},
+ {"⇁", 8641},
+ {"⇀", 8640},
+ {"⥬", 10604},
+ {"ρ", 961},
+ {"ϱ", 1009},
+ {"→", 8594},
+ {"↣", 8611},
+ {"⇁", 8641},
+ {"⇀", 8640},
+ {"⇄", 8644},
+ {"⇌", 8652},
+ {"⇉", 8649},
+ {"↝", 8605},
+ {"⋌", 8908},
+ {"˚", 730},
+ {"≓", 8787},
+ {"⇄", 8644},
+ {"⇌", 8652},
+ {"‏", 8207},
+ {"⎱", 9137},
+ {"⎱", 9137},
+ {"⫮", 10990},
+ {"⟭", 10221},
+ {"⇾", 8702},
+ {"⟧", 10215},
+ {"⦆", 10630},
+ {"𝕣", 120163},
+ {"⨮", 10798},
+ {"⨵", 10805},
+ {")", 41},
+ {"⦔", 10644},
+ {"⨒", 10770},
+ {"⇉", 8649},
+ {"›", 8250},
+ {"𝓇", 120007},
+ {"↱", 8625},
+ {"]", 93},
+ {"’", 8217},
+ {"’", 8217},
+ {"⋌", 8908},
+ {"⋊", 8906},
+ {"▹", 9657},
+ {"⊵", 8885},
+ {"▸", 9656},
+ {"⧎", 10702},
+ {"⥨", 10600},
+ {"℞", 8478},
+ {"ś", 347},
+ {"‚", 8218},
+ {"≻", 8827},
+ {"⪴", 10932},
+ {"⪸", 10936},
+ {"š", 353},
+ {"≽", 8829},
+ {"⪰", 10928},
+ {"ş", 351},
+ {"ŝ", 349},
+ {"⪶", 10934},
+ {"⪺", 10938},
+ {"⋩", 8937},
+ {"⨓", 10771},
+ {"≿", 8831},
+ {"с", 1089},
+ {"⋅", 8901},
+ {"⊡", 8865},
+ {"⩦", 10854},
+ {"⇘", 8664},
+ {"⤥", 10533},
+ {"↘", 8600},
+ {"↘", 8600},
+ {"§", 167},
+ {"§", 167},
+ {";", 59},
+ {"⤩", 10537},
+ {"∖", 8726},
+ {"∖", 8726},
+ {"✶", 10038},
+ {"𝔰", 120112},
+ {"⌢", 8994},
+ {"♯", 9839},
+ {"щ", 1097},
+ {"ш", 1096},
+ {"∣", 8739},
+ {"∥", 8741},
+ {"­", 173},
+ {"­", 173},
+ {"σ", 963},
+ {"ς", 962},
+ {"ς", 962},
+ {"∼", 8764},
+ {"⩪", 10858},
+ {"≃", 8771},
+ {"≃", 8771},
+ {"⪞", 10910},
+ {"⪠", 10912},
+ {"⪝", 10909},
+ {"⪟", 10911},
+ {"≆", 8774},
+ {"⨤", 10788},
+ {"⥲", 10610},
+ {"←", 8592},
+ {"∖", 8726},
+ {"⨳", 10803},
+ {"⧤", 10724},
+ {"∣", 8739},
+ {"⌣", 8995},
+ {"⪪", 10922},
+ {"⪬", 10924},
+ {"⪬︀", 10924, 65024},
+ {"ь", 1100},
+ {"/", 47},
+ {"⧄", 10692},
+ {"⌿", 9023},
+ {"𝕤", 120164},
+ {"♠", 9824},
+ {"♠", 9824},
+ {"∥", 8741},
+ {"⊓", 8851},
+ {"⊓︀", 8851, 65024},
+ {"⊔", 8852},
+ {"⊔︀", 8852, 65024},
+ {"⊏", 8847},
+ {"⊑", 8849},
+ {"⊏", 8847},
+ {"⊑", 8849},
+ {"⊐", 8848},
+ {"⊒", 8850},
+ {"⊐", 8848},
+ {"⊒", 8850},
+ {"□", 9633},
+ {"□", 9633},
+ {"▪", 9642},
+ {"▪", 9642},
+ {"→", 8594},
+ {"𝓈", 120008},
+ {"∖", 8726},
+ {"⌣", 8995},
+ {"⋆", 8902},
+ {"☆", 9734},
+ {"★", 9733},
+ {"ϵ", 1013},
+ {"ϕ", 981},
+ {"¯", 175},
+ {"⊂", 8834},
+ {"⫅", 10949},
+ {"⪽", 10941},
+ {"⊆", 8838},
+ {"⫃", 10947},
+ {"⫁", 10945},
+ {"⫋", 10955},
+ {"⊊", 8842},
+ {"⪿", 10943},
+ {"⥹", 10617},
+ {"⊂", 8834},
+ {"⊆", 8838},
+ {"⫅", 10949},
+ {"⊊", 8842},
+ {"⫋", 10955},
+ {"⫇", 10951},
+ {"⫕", 10965},
+ {"⫓", 10963},
+ {"≻", 8827},
+ {"⪸", 10936},
+ {"≽", 8829},
+ {"⪰", 10928},
+ {"⪺", 10938},
+ {"⪶", 10934},
+ {"⋩", 8937},
+ {"≿", 8831},
+ {"∑", 8721},
+ {"♪", 9834},
+ {"¹", 185},
+ {"¹", 185},
+ {"²", 178},
+ {"²", 178},
+ {"³", 179},
+ {"³", 179},
+ {"⊃", 8835},
+ {"⫆", 10950},
+ {"⪾", 10942},
+ {"⫘", 10968},
+ {"⊇", 8839},
+ {"⫄", 10948},
+ {"⟉", 10185},
+ {"⫗", 10967},
+ {"⥻", 10619},
+ {"⫂", 10946},
+ {"⫌", 10956},
+ {"⊋", 8843},
+ {"⫀", 10944},
+ {"⊃", 8835},
+ {"⊇", 8839},
+ {"⫆", 10950},
+ {"⊋", 8843},
+ {"⫌", 10956},
+ {"⫈", 10952},
+ {"⫔", 10964},
+ {"⫖", 10966},
+ {"⇙", 8665},
+ {"⤦", 10534},
+ {"↙", 8601},
+ {"↙", 8601},
+ {"⤪", 10538},
+ {"ß", 223},
+ {"ß", 223},
+ {"⌖", 8982},
+ {"τ", 964},
+ {"⎴", 9140},
+ {"ť", 357},
+ {"ţ", 355},
+ {"т", 1090},
+ {"⃛", 8411},
+ {"⌕", 8981},
+ {"𝔱", 120113},
+ {"∴", 8756},
+ {"∴", 8756},
+ {"θ", 952},
+ {"ϑ", 977},
+ {"ϑ", 977},
+ {"≈", 8776},
+ {"∼", 8764},
+ {" ", 8201},
+ {"≈", 8776},
+ {"∼", 8764},
+ {"þ", 254},
+ {"þ", 254},
+ {"˜", 732},
+ {"×", 215},
+ {"×", 215},
+ {"⊠", 8864},
+ {"⨱", 10801},
+ {"⨰", 10800},
+ {"∭", 8749},
+ {"⤨", 10536},
+ {"⊤", 8868},
+ {"⌶", 9014},
+ {"⫱", 10993},
+ {"𝕥", 120165},
+ {"⫚", 10970},
+ {"⤩", 10537},
+ {"‴", 8244},
+ {"™", 8482},
+ {"▵", 9653},
+ {"▿", 9663},
+ {"◃", 9667},
+ {"⊴", 8884},
+ {"≜", 8796},
+ {"▹", 9657},
+ {"⊵", 8885},
+ {"◬", 9708},
+ {"≜", 8796},
+ {"⨺", 10810},
+ {"⨹", 10809},
+ {"⧍", 10701},
+ {"⨻", 10811},
+ {"⏢", 9186},
+ {"𝓉", 120009},
+ {"ц", 1094},
+ {"ћ", 1115},
+ {"ŧ", 359},
+ {"≬", 8812},
+ {"↞", 8606},
+ {"↠", 8608},
+ {"⇑", 8657},
+ {"⥣", 10595},
+ {"ú", 250},
+ {"ú", 250},
+ {"↑", 8593},
+ {"ў", 1118},
+ {"ŭ", 365},
+ {"û", 251},
+ {"û", 251},
+ {"у", 1091},
+ {"⇅", 8645},
+ {"ű", 369},
+ {"⥮", 10606},
+ {"⥾", 10622},
+ {"𝔲", 120114},
+ {"ù", 249},
+ {"ù", 249},
+ {"↿", 8639},
+ {"↾", 8638},
+ {"▀", 9600},
+ {"⌜", 8988},
+ {"⌜", 8988},
+ {"⌏", 8975},
+ {"◸", 9720},
+ {"ū", 363},
+ {"¨", 168},
+ {"¨", 168},
+ {"ų", 371},
+ {"𝕦", 120166},
+ {"↑", 8593},
+ {"↕", 8597},
+ {"↿", 8639},
+ {"↾", 8638},
+ {"⊎", 8846},
+ {"υ", 965},
+ {"ϒ", 978},
+ {"υ", 965},
+ {"⇈", 8648},
+ {"⌝", 8989},
+ {"⌝", 8989},
+ {"⌎", 8974},
+ {"ů", 367},
+ {"◹", 9721},
+ {"𝓊", 120010},
+ {"⋰", 8944},
+ {"ũ", 361},
+ {"▵", 9653},
+ {"▴", 9652},
+ {"⇈", 8648},
+ {"ü", 252},
+ {"ü", 252},
+ {"⦧", 10663},
+ {"⇕", 8661},
+ {"⫨", 10984},
+ {"⫩", 10985},
+ {"⊨", 8872},
+ {"⦜", 10652},
+ {"ϵ", 1013},
+ {"ϰ", 1008},
+ {"∅", 8709},
+ {"ϕ", 981},
+ {"ϖ", 982},
+ {"∝", 8733},
+ {"↕", 8597},
+ {"ϱ", 1009},
+ {"ς", 962},
+ {"⊊︀", 8842, 65024},
+ {"⫋︀", 10955, 65024},
+ {"⊋︀", 8843, 65024},
+ {"⫌︀", 10956, 65024},
+ {"ϑ", 977},
+ {"⊲", 8882},
+ {"⊳", 8883},
+ {"в", 1074},
+ {"⊢", 8866},
+ {"∨", 8744},
+ {"⊻", 8891},
+ {"≚", 8794},
+ {"⋮", 8942},
+ {"|", 124},
+ {"|", 124},
+ {"𝔳", 120115},
+ {"⊲", 8882},
+ {"⊂⃒", 8834, 8402},
+ {"⊃⃒", 8835, 8402},
+ {"𝕧", 120167},
+ {"∝", 8733},
+ {"⊳", 8883},
+ {"𝓋", 120011},
+ {"⫋︀", 10955, 65024},
+ {"⊊︀", 8842, 65024},
+ {"⫌︀", 10956, 65024},
+ {"⊋︀", 8843, 65024},
+ {"⦚", 10650},
+ {"ŵ", 373},
+ {"⩟", 10847},
+ {"∧", 8743},
+ {"≙", 8793},
+ {"℘", 8472},
+ {"𝔴", 120116},
+ {"𝕨", 120168},
+ {"℘", 8472},
+ {"≀", 8768},
+ {"≀", 8768},
+ {"𝓌", 120012},
+ {"⋂", 8898},
+ {"◯", 9711},
+ {"⋃", 8899},
+ {"▽", 9661},
+ {"𝔵", 120117},
+ {"⟺", 10234},
+ {"⟷", 10231},
+ {"ξ", 958},
+ {"⟸", 10232},
+ {"⟵", 10229},
+ {"⟼", 10236},
+ {"⋻", 8955},
+ {"⨀", 10752},
+ {"𝕩", 120169},
+ {"⨁", 10753},
+ {"⨂", 10754},
+ {"⟹", 10233},
+ {"⟶", 10230},
+ {"𝓍", 120013},
+ {"⨆", 10758},
+ {"⨄", 10756},
+ {"△", 9651},
+ {"⋁", 8897},
+ {"⋀", 8896},
+ {"ý", 253},
+ {"ý", 253},
+ {"я", 1103},
+ {"ŷ", 375},
+ {"ы", 1099},
+ {"¥", 165},
+ {"¥", 165},
+ {"𝔶", 120118},
+ {"ї", 1111},
+ {"𝕪", 120170},
+ {"𝓎", 120014},
+ {"ю", 1102},
+ {"ÿ", 255},
+ {"ÿ", 255},
+ {"ź", 378},
+ {"ž", 382},
+ {"з", 1079},
+ {"ż", 380},
+ {"ℨ", 8488},
+ {"ζ", 950},
+ {"𝔷", 120119},
+ {"ж", 1078},
+ {"⇝", 8669},
+ {"𝕫", 120171},
+ {"𝓏", 120015},
+ {"‍", 8205},
+ {"‌", 8204},
+ {nil, 0},
+};
diff --git a/tok.c b/tok.c
@@ -0,0 +1,1638 @@
+#include <u.h>
+#include <libc.h>
+#include <String.h>
+#include <thread.h>
+
+#include "html5dom.h"
+#include "ncref.h"
+
+/* ASCII letter test: 'A'-'Z' or 'a'-'z' only (the punctuation 0x5b-0x60 between them is excluded). */
+#define ALPHA(x) ((((x) >= 0x41) && ((x) <= 0x5a)) || (((x) >= 0x61) && ((x) <= 0x7a)))
+/* ASCII decimal digit test: '0'-'9'. */
+#define DIGIT(x) (((x) >= 0x30) && ((x) <= 0x39))
+
+/* Channel on which temit() sends tokens; assigned in threadtokenize(). */
+Channel *outchannel;
+
+/* Byte reader for file descriptor 0, defined near the end of this file. */
+int gc(void);
+
+
+
+/* Allocate a zero-filled Token, mark it as TEOF and return it. */
+Token*
+eoftok(void)
+{
+	Token *tok = mallocz(sizeof(Token), 1);
+
+	tok->type = TEOF;
+	return tok;
+}
+
+/* Allocate a zero-filled TCHAR Token carrying the code point c. */
+Token*
+chartok(Rune c)
+{
+	Token *tok = mallocz(sizeof(Token), 1);
+
+	tok->type = TCHAR;
+	tok->c = c;
+	return tok;
+}
+
+/*
+ * Allocate a Token of the given type with an empty name String
+ * and no attribute array.
+ */
+Token*
+newtok(int type)
+{
+	Token *tok = mallocz(sizeof(Token), 1);
+
+	tok->attr = nil;
+	tok->name = s_new();
+	tok->type = type;
+	return tok;
+}
+
+/*
+ * Release a token: every attribute in its nil-terminated attr array,
+ * the array itself, the name String and finally the Token.
+ * A nil token is ignored.
+ */
+void
+t_free(Token *t)
+{
+	int i;
+
+	if (t == nil)
+		return;
+	if (t->attr != nil) {
+		for (i = 0; t->attr[i] != nil; i++) {
+			s_free(t->attr[i]->name);
+			s_free(t->attr[i]->value);
+			free(t->attr[i]);
+		}
+		free(t->attr);
+	}
+	s_free(t->name);
+	free(t);
+}
+
+
+/*
+ * Append a new attribute with empty name and value Strings to the
+ * nil-terminated attribute array of t and return it.
+ */
+Attr*
+tnewattr(Token *t)
+{
+	int n;
+	Attr *a;
+
+	n = 0;
+	if (t->attr != nil)
+		while (t->attr[n] != nil)
+			n++;
+	/* room for the n existing entries, the new one and the nil terminator */
+	t->attr = realloc(t->attr, (n + 2) * sizeof(Attr*));
+	a = mallocz(sizeof(Attr), 1);
+	a->name = s_new();
+	a->value = s_new();
+	t->attr[n] = a;
+	t->attr[n+1] = nil;
+	return a;
+}
+
+/* Free an attribute: its value and name Strings, then the Attr itself. */
+void
+attr_free(Attr *attr)
+{
+	s_free(attr->value);
+	s_free(attr->name);
+	free(attr);
+}
+
+/* Insertion mode, initialised to IMinitial; not otherwise used in the visible part of this file. */
+u32int insertion_mode = IMinitial;
+
+/* Tokenizer vars and funcs */
+
+/* Current input character, set by tconsume(); state handlers compare it with -1 for end of input. */
+Rune tc;
+/* When non-zero the next tconsume() keeps tc unchanged (the character is "reconsumed"). */
+int treconsume = 0;
+/* Set to 1 by state handlers at end of input; ends the loop in threadtokenize(). */
+int teof;
+
+/* Token and attribute currently being built by the state handlers. */
+Token *ctoken;
+Attr *cattr;
+/* Temporary buffer used by the end-tag-name and character-reference states. */
+String *ctempbuf;
+/* Pushed-back input; tconsume() drains it before calling gc(). */
+String *clookaheadbuf;
+
+/* Helpers defined near the end of this file. */
+void tconsume(void);
+void temit(Token*);
+void temitbuf(String*);
+int talpha(int);
+
+/*
+ * One handler per tokenizer state.  Each takes no arguments and works on
+ * the globals above; tstab[] below maps state constants to these functions.
+ */
+void tsdata(void);
+void tsrcdt(void);
+void tsrawt(void);
+void tsscript(void);
+void tsptxt(void);
+void tstagopen(void);
+void tsetagopen(void);
+void tstagname(void);
+void tsrcdtless(void);
+void tsrcdtendopen(void);
+void tsrcdtendname(void);
+void tsrawtless(void);
+void tsrawtendopen(void);
+void tsrawtendname(void);
+void tsscriptless(void);
+void tsscriptendopen(void);
+void tsscriptendname(void);
+
+void tsscriptescstart(void);
+void tsscriptescstartdash(void);
+void tsscriptesc(void);
+void tsscriptescdash(void);
+void tsscriptescddash(void);
+void tsscriptescless(void);
+void tsscriptescendopen(void);
+void tsscriptescendname(void);
+void tsscriptdescstart(void);
+void tsscriptdesc(void);
+void tsscriptdescdash(void);
+void tsscriptdescddash(void);
+void tsscriptdescless(void);
+void tsscriptdescend(void);
+
+void tsanamebefore(void);
+void tsaname(void);
+void tsanameafter(void);
+void tsavalbefore(void);
+void tsavaldq(void);
+void tsavalsq(void);
+void tsavaluq(void);
+void tsavalafter(void);
+void tsscstag(void);
+void tsboguscomment(void);
+void tsmkupopen(void);
+void tscommentstart(void);
+void tscommentstartdash(void);
+void tscomment(void);
+void tscommentless(void);
+void tscommentlessbang(void);
+void tscommentlessbangdash(void);
+void tscommentlessbangddash(void);
+void tscommentenddash(void);
+void tscommentend(void);
+void tscommentendbang(void);
+void tsdoct(void);
+void tsdoctbefore(void);
+void tsdoctname(void);
+void tsdoctnameafter(void);
+void tsdoctpubkafter(void);
+void tsdoctpubidbefore(void);
+void tsdoctpubiddq(void);
+void tsdoctpubidsq(void);
+void tsdoctpubidafter(void);
+void tsdoctbetween(void);
+void tsdoctsyskafter(void);
+void tsdoctsysidbefore(void);
+void tsdoctsysiddq(void);
+void tsdoctsysidsq(void);
+void tsdoctsysidafter(void);
+void tsdoctbogus(void);
+void tscdat(void);
+void tscdatbrk(void);
+void tscdatend(void);
+void tscref(void);
+void tsncref(void);
+void tsamam(void);
+void tsnumref(void);
+void tshexrefstart(void);
+void tsdecrefstart(void);
+void tshexref(void);
+void tsdecref(void);
+void tsnumrefend(void);
+
+
+/* Substituted for NUL characters by several state handlers. */
+#define REPCHAR Runeerror /* replacement character */
+
+/*
+ * Tokenizer states; values index tstab[] and are stored in tstate/treturn.
+ * TMAX is one past the last state.  The misspelt identifier
+ * TSSCIRPT_END_NAME is referenced elsewhere in this file, so it is kept.
+ */
+enum {
+ TSDATA, /* data */
+ TSRCDT, /* RCDATA */
+ TSRAWT, /* RAWTEXT */
+ TSSCRIPT, /* script data */
+ TSPTXT, /* PLAINTEXT */
+ TSTAG_OPEN, /* tag open */
+ TSETAG_OPEN, /* end tag open */
+ TSTAG_NAME, /* tag name */
+ TSRCDT_LESS, /* RCDATA less-than sign */
+ TSRCDT_END_OPEN, /* RCDATA end tag open */
+ TSRCDT_END_NAME, /* RCDATA end tag name */
+ TSRAWT_LESS, /* RAWTEXT less-than sign */
+ TSRAWT_END_OPEN, /* RAWTEXT end tag open */
+ TSRAWT_END_NAME, /* RAWTEXT end tag name */
+ TSSCRIPT_LESS, /* script data less-than sign */
+ TSSCRIPT_END_OPEN, /* script data end tag open */
+ TSSCIRPT_END_NAME, /* script data end tag name */
+ TSSCRIPT_ESC_START, /* script data escape start */
+ TSSCRIPT_ESC_START_DASH, /* script data escape start dash */
+ TSSCRIPT_ESC, /* script data escaped */
+ TSSCRIPT_ESC_DASH, /* script data escaped dash */
+
+ TSSCRIPT_ESC_DDASH, /* script data escaped dash dash */
+ TSSCRIPT_ESC_LESS, /* script data escaped less-than sign */
+ TSSCRIPT_ESC_END_OPEN, /* script data escaped end tag open */
+ TSSCRIPT_ESC_END_NAME, /* script data escaped end tag name */
+ TSSCRIPT_DESC_START, /* script data double escape start */
+ TSSCRIPT_DESC, /* script data double escaped */
+ TSSCRIPT_DESC_DASH, /* script data double escaped dash */
+ TSSCRIPT_DESC_DDASH, /* script data double escaped dash dash */
+ TSSCRIPT_DESC_LESS, /* script data double escaped less-than sign */
+ TSSCRIPT_DESC_END, /* script data double escape end */
+
+ TSANAME_BEFORE, /* Before attribute name */
+ TSANAME, /* Attribute name */
+ TSANAME_AFTER, /* After attribute name */
+ TSAVAL_BEFORE, /* Before attribute value */
+ TSAVAL_DQ, /* Attribute value (double-quoted) */
+ TSAVAL_SQ, /* Attribute value (single-quoted) */
+ TSAVAL_UQ, /* Attribute value (unquoted) */
+ TSAVAL_AFTER, /* After attribute value (quoted) */
+
+ TSSCSTAG, /* Self-closing start tag */
+ TSBOGUS_COMMENT, /* Bogus comment */
+ TSMKUP_OPEN, /* Markup declaration open */
+
+ TSCOMMENT_START, /* Comment start */
+ TSCOMMENT_START_DASH, /* Comment start dash */
+ TSCOMMENT, /* Comment */
+ TSCOMMENT_LESS, /* Comment less-than sign */
+ TSCOMMENT_LESS_BANG, /* Comment less-than sign bang */
+ TSCOMMENT_LESS_BANG_DASH, /* Comment less-than sign bang dash */
+ TSCOMMENT_LESS_BANG_DDASH, /* Comment less-than sign bang dash dash */
+ TSCOMMENT_END_DASH, /* Comment end dash */
+ TSCOMMENT_END, /* Comment end */
+ TSCOMMENT_END_BANG, /* Comment end bang */
+
+ TSDOCT, /* DOCTYPE */
+ TSDOCT_BEFORE, /* Before DOCTYPE name */
+ TSDOCT_NAME, /* DOCTYPE name */
+ TSDOCT_NAME_AFTER, /* After DOCTYPE name */
+ TSDOCT_PUBK_AFTER, /* After DOCTYPE public keyword */
+ TSDOCT_PUBID_BEFORE, /* Before DOCTYPE public identifier */
+ TSDOCT_PUBID_DQ, /* DOCTYPE public identifier (double-quoted) */
+ TSDOCT_PUBID_SQ, /* DOCTYPE public identifier (single-quoted) */
+ TSDOCT_PUBID_AFTER, /* After DOCTYPE public identifier */
+ TSDOCT_BETWEEN, /* Between DOCTYPE public and system identifiers */
+ TSDOCT_SYSK_AFTER, /* After DOCTYPE system keyword */
+ TSDOCT_SYSID_BEFORE, /* Before DOCTYPE system identifier */
+ TSDOCT_SYSID_DQ, /* DOCTYPE system identifier (double-quoted) */
+ TSDOCT_SYSID_SQ, /* DOCTYPE system identifier (single-quoted) */
+ TSDOCT_SYSID_AFTER, /* After DOCTYPE system identifier */
+ TSDOCT_BOGUS, /* Bogus DOCTYPE */
+
+ TSCDAT, /* CDATA section */
+ TSCDAT_BRK, /* CDATA section bracket */
+ TSCDAT_END, /* CDATA section end */
+
+ TSCREF, /* Character reference */
+ TSNCREF, /* Named character reference */
+ TSAMAM, /* Ambiguous ampersand */
+ TSNUMREF, /* Numeric character reference */
+ TSHEXREF_START, /* Hexadecimal character reference start */
+ TSDECREF_START, /* Decimal character reference start */
+ TSHEXREF, /* Hexadecimal character reference */
+ TSDECREF, /* Decimal character reference */
+ TSNUMREF_END, /* Numeric character reference end */
+
+ TMAX,
+};
+
+/*
+ * Dispatch table: tstab[state] is the handler threadtokenize() calls for
+ * that state.  Every state constant below TMAX has an entry.
+ */
+void (*tstab[])(void) = {
+	[TSDATA] = tsdata,
+	[TSRCDT] = tsrcdt,
+	[TSRAWT] = tsrawt,
+	[TSSCRIPT] = tsscript,
+	[TSPTXT] = tsptxt,
+	[TSTAG_OPEN] = tstagopen,
+	[TSETAG_OPEN] = tsetagopen,
+	[TSTAG_NAME] = tstagname,
+	[TSRCDT_LESS] = tsrcdtless,
+	[TSRCDT_END_OPEN] = tsrcdtendopen,
+	[TSRCDT_END_NAME] = tsrcdtendname,
+	[TSRAWT_LESS] = tsrawtless,
+	[TSRAWT_END_OPEN] = tsrawtendopen,
+	[TSRAWT_END_NAME] = tsrawtendname,	/* was missing: nil handler */
+	[TSSCRIPT_LESS] = tsscriptless,
+	[TSSCRIPT_END_OPEN] = tsscriptendopen,
+	[TSSCIRPT_END_NAME] = tsscriptendname,
+	[TSSCRIPT_ESC_START] = tsscriptescstart,	/* was tsscriptesc */
+	[TSSCRIPT_ESC_START_DASH] = tsscriptescstartdash,	/* was tsscriptesc */
+	[TSSCRIPT_ESC] = tsscriptesc,
+	[TSSCRIPT_ESC_DASH] = tsscriptescdash,
+	[TSSCRIPT_ESC_DDASH] = tsscriptescddash,
+	[TSSCRIPT_ESC_LESS] = tsscriptescless,
+	[TSSCRIPT_ESC_END_OPEN] = tsscriptescendopen,
+	[TSSCRIPT_ESC_END_NAME] = tsscriptescendname,
+	[TSSCRIPT_DESC_START] = tsscriptdescstart,
+	[TSSCRIPT_DESC] = tsscriptdesc,
+	[TSSCRIPT_DESC_DASH] = tsscriptdescdash,
+	[TSSCRIPT_DESC_DDASH] = tsscriptdescddash,
+	[TSSCRIPT_DESC_LESS] = tsscriptdescless,
+	[TSSCRIPT_DESC_END] = tsscriptdescend,
+
+	[TSANAME_BEFORE] = tsanamebefore,
+	[TSANAME] = tsaname,
+	[TSANAME_AFTER] = tsanameafter,
+	[TSAVAL_BEFORE] = tsavalbefore,
+	[TSAVAL_DQ] = tsavaldq,
+	[TSAVAL_SQ] = tsavalsq,
+	[TSAVAL_UQ] = tsavaluq,
+	[TSAVAL_AFTER] = tsavalafter,
+	[TSSCSTAG] = tsscstag,
+	[TSBOGUS_COMMENT] = tsboguscomment,
+	[TSMKUP_OPEN] = tsmkupopen,
+	[TSCOMMENT_START] = tscommentstart,
+	[TSCOMMENT_START_DASH] = tscommentstartdash,
+	[TSCOMMENT] = tscomment,
+	[TSCOMMENT_LESS] = tscommentless,
+	[TSCOMMENT_LESS_BANG] = tscommentlessbang,
+	[TSCOMMENT_LESS_BANG_DASH] = tscommentlessbangdash,
+	[TSCOMMENT_LESS_BANG_DDASH] = tscommentlessbangddash,
+	[TSCOMMENT_END_DASH] = tscommentenddash,
+	[TSCOMMENT_END] = tscommentend,
+	[TSCOMMENT_END_BANG] = tscommentendbang,
+	[TSDOCT] = tsdoct,
+	[TSDOCT_BEFORE] = tsdoctbefore,
+	[TSDOCT_NAME] = tsdoctname,
+	[TSDOCT_NAME_AFTER] = tsdoctnameafter,
+	[TSDOCT_PUBK_AFTER] = tsdoctpubkafter,
+	[TSDOCT_PUBID_BEFORE] = tsdoctpubidbefore,
+	[TSDOCT_PUBID_DQ] = tsdoctpubiddq,
+	[TSDOCT_PUBID_SQ] = tsdoctpubidsq,
+	[TSDOCT_PUBID_AFTER] = tsdoctpubidafter,
+	[TSDOCT_BETWEEN] = tsdoctbetween,
+	[TSDOCT_SYSK_AFTER] = tsdoctsyskafter,
+	[TSDOCT_SYSID_BEFORE] = tsdoctsysidbefore,
+	[TSDOCT_SYSID_DQ] = tsdoctsysiddq,
+	[TSDOCT_SYSID_SQ] = tsdoctsysidsq,
+	[TSDOCT_SYSID_AFTER] = tsdoctsysidafter,
+	[TSDOCT_BOGUS] = tsdoctbogus,
+	[TSCDAT] = tscdat,
+	[TSCDAT_BRK] = tscdatbrk,
+	[TSCDAT_END] = tscdatend,
+	[TSCREF] = tscref,
+	[TSNCREF] = tsncref,
+	[TSAMAM] = tsamam,
+	[TSNUMREF] = tsnumref,
+	[TSHEXREF_START] = tshexrefstart,
+	[TSDECREF_START] = tsdecrefstart,
+	[TSHEXREF] = tshexref,
+	[TSDECREF] = tsdecref,
+	[TSNUMREF_END] = tsnumrefend,
+};
+
+/* Current tokenizer state; indexes tstab[] in threadtokenize(). */
+int tstate = TSDATA;
+/* State to resume after a character reference; set before entering TSCREF. */
+int treturn = -1;
+
+/*
+ * Before attribute name state: whitespace is skipped; '/', '>' and end of
+ * input are reconsumed in the after-attribute-name state; anything else
+ * starts a new attribute on ctoken.
+ */
+void
+tsanamebefore(void)
+{
+	if (tc == '\t' || tc == '\n' || tc == '\r' || tc == ' ')
+		return;
+	if (tc == '/' || tc == '>' || tc == -1) {
+		tstate = TSANAME_AFTER;
+		treconsume = 1;
+		return;
+	}
+	if (tc == '=') {
+		fprint(2, "unexpected equals sign before attribute name parse error, tc='%c'\n", tc);
+		cattr = tnewattr(ctoken);
+		s_putc(cattr->name, tc);
+		tstate = TSANAME;
+		return;
+	}
+	cattr = tnewattr(ctoken);
+	tstate = TSANAME;
+	treconsume = 1;
+}
+
+/*
+ * Attribute name state: accumulates the lower-cased name in cattr->name.
+ * Whitespace, '/', '>' and end of input finish the name and are reconsumed
+ * in the after-attribute-name state; '=' finishes it and moves on to the
+ * value.  The name is NUL-terminated whenever the state is left.
+ */
+void
+tsaname(void)
+{
+	/* fold ASCII upper-case letters only; other characters below 'a' stay as they are */
+	if (tc >= 'A' && tc <= 'Z')
+		tc += 0x20;
+	switch (tc) {
+	case '\t':
+	case '\n':
+	case '\r':
+	case ' ':
+	case '/':
+	case '>':
+	case -1:
+		treconsume = 1;
+		s_terminate(cattr->name);
+		tstate = TSANAME_AFTER;
+		break;
+	case '=':
+		s_terminate(cattr->name);
+		tstate = TSAVAL_BEFORE;
+		break;
+	case '\0':
+		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
+		s_putc(cattr->name, REPCHAR);
+		break;
+	case '"':
+	case '\'':
+	case '<':
+		fprint(2, "unexpected character in attribute name parse error, tc='%c'\n", tc);
+		/* fall through: the character is still appended */
+	default:
+		s_putc(cattr->name, tc);
+	}
+	/* TODO check for duplicate attribute names on leaving or emitting */
+}
+
+/*
+ * After attribute name state: skips whitespace, dispatches on '/', '=' and
+ * '>', and otherwise starts another attribute.  End of input emits an EOF
+ * token and sets teof so the tokenizer loop stops.
+ */
+void
+tsanameafter(void)
+{
+	switch (tc) {
+	case '\t':
+	case '\n':
+	case '\r':
+	case ' ':
+		break;
+	case '/':
+		tstate = TSSCSTAG;
+		break;
+	case '=':
+		tstate = TSAVAL_BEFORE;
+		break;
+	case '>':
+		tstate = TSDATA;
+		s_terminate(ctoken->name);
+		temit(ctoken);
+		break;
+	case -1: /* EOF */
+		fprint(2, "eof in tag parse error\n");
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		cattr = tnewattr(ctoken);
+		treconsume = 1;
+		tstate = TSANAME;
+	}
+}
+
+/*
+ * Before attribute value state: skips whitespace, selects the quoted or
+ * unquoted value state, and emits the tag early when '>' arrives first.
+ */
+void
+tsavalbefore(void)
+{
+	if (tc == '\t' || tc == '\n' || tc == '\r' || tc == ' ')
+		return;
+	if (tc == '"') {
+		tstate = TSAVAL_DQ;
+	} else if (tc == '\'') {
+		tstate = TSAVAL_SQ;
+	} else if (tc == '>') {
+		fprint(2, "missing attribute value parse error\n");
+		s_terminate(ctoken->name);
+		temit(ctoken);
+		tstate = TSDATA;
+	} else {
+		treconsume = 1;
+		tstate = TSAVAL_UQ;
+	}
+}
+
+/*
+ * Attribute value (double-quoted) state: appends characters to
+ * cattr->value until the closing '"', which also NUL-terminates the value.
+ * '&' enters the character reference state with this state as the return
+ * state.  End of input emits an EOF token and sets teof.
+ */
+void
+tsavaldq(void)
+{
+	switch (tc) {
+	case '"':
+		s_terminate(cattr->value);
+		tstate = TSAVAL_AFTER;
+		break;
+	case '&':
+		treturn = TSAVAL_DQ;
+		tstate = TSCREF;
+		break;
+	case '\0':
+		fprint(2, "unexpected null character parse error\n");
+		s_putc(cattr->value, REPCHAR);
+		break;
+	case -1: /* EOF */
+		fprint(2, "oef in tag parse error\n");
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		s_putc(cattr->value, tc);
+	}
+}
+
+/*
+ * Attribute value (single-quoted) state: appends characters to
+ * cattr->value until the closing '\'', which also NUL-terminates the value.
+ * '&' enters the character reference state with this state as the return
+ * state.  End of input emits an EOF token and sets teof.
+ */
+void
+tsavalsq(void)
+{
+	switch (tc) {
+	case '\'':
+		s_terminate(cattr->value);
+		tstate = TSAVAL_AFTER;
+		break;
+	case '&':
+		treturn = TSAVAL_SQ;
+		tstate = TSCREF;
+		break;
+	case '\0':
+		fprint(2, "unexpected null character parse error\n");
+		s_putc(cattr->value, REPCHAR);
+		break;
+	case -1: /* EOF */
+		fprint(2, "oef in tag parse error\n");
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		s_putc(cattr->value, tc);
+	}
+}
+
+/*
+ * Attribute value (unquoted) state: whitespace ends the value, '>' ends the
+ * value and emits the tag, '&' enters the character reference state.
+ * End of input emits an EOF token and sets teof.
+ */
+void
+tsavaluq(void)
+{
+	switch (tc) {
+	case '\t':
+	case '\n':
+	case '\r':
+	case ' ':
+		s_terminate(cattr->value);
+		tstate = TSANAME_BEFORE;
+		break;
+	case '&':
+		treturn = TSAVAL_UQ;
+		tstate = TSCREF;
+		break;
+	case '>':
+		s_terminate(ctoken->name);
+		s_terminate(cattr->value);
+		tstate = TSDATA;
+		temit(ctoken);	/* the tag is complete here */
+		break;
+	case '\0':
+		fprint(2, "unexpected null character parse error\n");
+		s_putc(cattr->value, REPCHAR);
+		break;
+	case -1: /* EOF */
+		fprint(2, "oef in tag parse error\n");
+		teof = 1;
+		temit(eoftok());
+		break;
+	case '"':
+	case '\'':
+	case '<':
+	case '=':
+	case '`':
+		fprint(2, "unexpected character in unquoted attribute value parse error\n");
+		/* fall through: the character is still appended */
+	default:
+		s_putc(cattr->value, tc);
+	}
+}
+
+/*
+ * After attribute value (quoted) state: whitespace leads to the next
+ * attribute, '/' to the self-closing start tag state, '>' emits the tag.
+ * End of input emits an EOF token and sets teof.
+ */
+void
+tsavalafter(void)
+{
+	switch (tc) {
+	case '\t':
+	case '\n':
+	case '\r':
+	case ' ':
+		tstate = TSANAME_BEFORE;
+		break;
+	case '/':
+		/* TSSCSTAG is a state, not a flag bit */
+		tstate = TSSCSTAG;
+		break;
+	case '>':
+		s_terminate(ctoken->name);
+		s_terminate(cattr->value);
+		temit(ctoken);
+		tstate = TSDATA;
+		break;
+	case -1: /* EOF */
+		fprint(2, "eof in tag parse error\n");
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		fprint(2, "missing whitespace between attributes parse error\n");
+		treconsume = 1;
+		tstate = TSANAME_BEFORE;
+	}
+}
+
+/*
+ * Self-closing start tag state: '>' marks ctoken self-closing and emits it;
+ * anything else is reconsumed as the start of an attribute.
+ * End of input emits an EOF token and sets teof.
+ */
+void
+tsscstag(void)
+{
+	switch (tc) {
+	case '>':
+		ctoken->flags |= TF_SELF_CLOSING;
+		tstate = TSDATA;
+		temit(ctoken);
+		break;
+	case -1:
+		fprint(2, "eof in tag parse error\n");
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		fprint(2, "unxpected solidus in tag parse error\n");
+		treconsume = 1;
+		tstate = TSANAME_BEFORE;
+	}
+}
+
+/* Bogus comment state placeholder: logs to fd 2 and returns to the data state. */
+void
+tsboguscomment(void)
+{
+ fprint(2, "tsboguscomment not implemented\n");
+ tstate = TSDATA;
+}
+
+/*
+ * Markup declaration open state (entered after "<!").
+ * Reads ahead: "--" starts a comment, "[CDATA[" a CDATA section and a
+ * case-insensitive "doctype" a DOCTYPE.  Anything else is a parse error:
+ * the characters read are pushed back through clookaheadbuf and the bogus
+ * comment state takes over.
+ */
+void
+tsmkupopen(void)
+{
+	int i;
+	String *mbuf, *lowered;
+
+	mbuf = s_new();
+	s_putc(mbuf, tc);
+	tconsume();
+	s_putc(mbuf, tc);
+	if (strncmp(s_to_c(mbuf), "--", 2) == 0) {
+		ctoken = newtok(TCOMM);
+		tstate = TSCOMMENT_START;
+		s_free(mbuf);
+		return;
+	}
+	for (i = 0; i < 5; i++) {
+		tconsume();
+		s_putc(mbuf, tc);
+	}
+	/* s_putc leaves the buffer unterminated; s_copy and s_append below need a C string */
+	s_terminate(mbuf);
+	if (strncmp(s_to_c(mbuf), "[CDATA[", 7) == 0) {
+		/* TODO: check if adjusted current node */
+		tstate = TSCDAT;
+		s_free(mbuf);
+		return;
+	}
+	lowered = s_copy(s_to_c(mbuf));
+	s_tolower(lowered);
+	if (strncmp(s_to_c(lowered), "doctype", 7) == 0) {
+		tstate = TSDOCT;
+		s_free(mbuf);
+		s_free(lowered);
+		return;
+	}
+	fprint(2, "incorrectly opened comment parse error, tc='%c'\n", tc);
+	ctoken = newtok(TCOMM);
+	tstate = TSBOGUS_COMMENT;
+	s_append(clookaheadbuf, s_to_c(mbuf));
+	s_free(lowered);
+	s_free(mbuf);
+}
+
+/*
+ * Comment states (comment start through comment end bang): placeholders.
+ * Each handler logs "<name> not implemented" to fd 2 and sets tstate to TSDATA.
+ */
+void
+tscommentstart(void)
+{
+ fprint(2, "tscommentstart not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscommentstartdash(void)
+{
+ fprint(2, "tscommentstartdash not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscomment(void)
+{
+ fprint(2, "tscomment not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscommentless(void)
+{
+ fprint(2, "tscommentless not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscommentlessbang(void)
+{
+ fprint(2, "tscommentlessbang not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscommentlessbangdash(void)
+{
+ fprint(2, "tscommentlessbangdash not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscommentlessbangddash(void)
+{
+ fprint(2, "tscommentlessbangddash not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscommentenddash(void)
+{
+ fprint(2, "tscommentenddash not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscommentend(void)
+{
+ fprint(2, "tscommentend not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscommentendbang(void)
+{
+ fprint(2, "tscommentendbang not implemented\n");
+ tstate = TSDATA;
+}
+
+/*
+ * DOCTYPE state: whitespace moves to the before-DOCTYPE-name state and any
+ * other character is reconsumed there.  End of input emits a force-quirks
+ * DOCTYPE token followed by an EOF token and sets teof.
+ */
+void
+tsdoct(void)
+{
+	switch (tc) {
+	case '\t':
+	case '\n':
+	case '\r':
+	case ' ':
+		tstate = TSDOCT_BEFORE;
+		break;
+	case '>':
+		treconsume = 1;
+		tstate = TSDOCT_BEFORE;
+		break;
+	case -1: /* eof */
+		fprint(2, "eof in doctype parse error, tc='%c'\n", tc);
+		ctoken = newtok(TDOCT);
+		ctoken->flags |= TF_FORCE_QUIRKS;
+		s_terminate(ctoken->name);
+		temit(ctoken);
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		fprint(2, "missing whitespace before doctype name parse error, tc='%c'\n", tc);
+		treconsume = 1;
+		tstate = TSDOCT_BEFORE;
+	}
+}
+
+/*
+ * Before DOCTYPE name state: skips whitespace and creates the DOCTYPE
+ * token.  '>' emits an empty force-quirks token and returns to the data
+ * state; end of input additionally emits EOF and sets teof; any other
+ * character (ASCII upper case folded to lower) starts the name.
+ */
+void
+tsdoctbefore(void)
+{
+	switch (tc) {
+	case '\t':
+	case '\n':
+	case '\r':
+	case ' ':
+		break;
+	case '\0':
+		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
+		ctoken = newtok(TDOCT);
+		s_putc(ctoken->name, REPCHAR);
+		tstate = TSDOCT_NAME;
+		break;
+	case '>':
+		fprint(2, "missing doctype name parse error, tc='%c'\n", tc);
+		ctoken = newtok(TDOCT);
+		ctoken->flags |= TF_FORCE_QUIRKS;
+		s_terminate(ctoken->name);
+		tstate = TSDATA;	/* the DOCTYPE is finished */
+		temit(ctoken);
+		break;
+	case -1: /* EOF */
+		fprint(2, "eof in doctype parse error, tc='%c'\n", tc);
+		ctoken = newtok(TDOCT);
+		ctoken->flags |= TF_FORCE_QUIRKS;
+		s_terminate(ctoken->name);
+		temit(ctoken);
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		/* fold ASCII upper-case letters only */
+		if (tc >= 'A' && tc <= 'Z')
+			tc += 0x20;
+		ctoken = newtok(TDOCT);
+		s_putc(ctoken->name, tc);
+		tstate = TSDOCT_NAME;
+	}
+}
+
+/*
+ * DOCTYPE name state: appends characters to ctoken->name (letters are
+ * lower-cased by talpha) until whitespace or '>'.  End of input emits the
+ * token with force-quirks set, then EOF, and sets teof.
+ */
+void
+tsdoctname(void)
+{
+	switch (tc) {
+	case '\t':
+	case '\n':
+	case '\r':
+	case ' ':
+		tstate = TSDOCT_NAME_AFTER;
+		break;
+	case '>':
+		tstate = TSDATA;
+		s_terminate(ctoken->name);
+		temit(ctoken);
+		break;
+	case '\0':
+		fprint(2, "unexpected null character parse error\n");
+		s_putc(ctoken->name, REPCHAR);
+		break;
+	case -1: /* EOF */
+		fprint(2, "eof in doctype parse error\n");
+		ctoken->flags |= TF_FORCE_QUIRKS;
+		s_terminate(ctoken->name);
+		temit(ctoken);
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		/* talpha only handles letters; keep every other character too */
+		if (talpha(1) == 0)
+			s_putc(ctoken->name, tc);
+	}
+}
+
+/*
+ * Remaining DOCTYPE states and the CDATA section states: placeholders.
+ * Each handler logs "<name> not implemented" to fd 2 and sets tstate to TSDATA.
+ */
+void
+tsdoctnameafter(void)
+{
+ fprint(2, "tsdoctnameafter not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctpubkafter(void)
+{
+ fprint(2, "tsdoctpubkafter not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctpubidbefore(void)
+{
+ fprint(2, "tsdoctpubidbefore not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctpubiddq(void)
+{
+ fprint(2, "tsdoctpubiddq not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctpubidsq(void)
+{
+ fprint(2, "tsdoctpubidsq not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctpubidafter(void)
+{
+ fprint(2, "tsdoctpubidafter not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctbetween(void)
+{
+ fprint(2, "tsdoctbetween not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctsyskafter(void)
+{
+ fprint(2, "tsdoctsyskafter not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctsysidbefore(void)
+{
+ fprint(2, "tsdoctsysidbefore not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctsysiddq(void)
+{
+ fprint(2, "tsdoctsysiddq not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctsysidsq(void)
+{
+ fprint(2, "tsdoctsysidsq not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctsysidafter(void)
+{
+ fprint(2, "tsdoctsysidafter not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdoctbogus(void)
+{
+ fprint(2, "tsdoctbogus not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscdat(void)
+{
+ fprint(2, "tscdat not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscdatbrk(void)
+{
+ fprint(2, "tscdatbrk not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tscdatend(void)
+{
+ fprint(2, "tscdatend not implemented\n");
+ tstate = TSDATA;
+}
+
+/*
+ * Character reference state (entered after '&' with treturn set).
+ * The temporary buffer is reset to "&".  A letter or digit is reconsumed in
+ * the named reference state and '#' leads to the numeric reference state.
+ * Anything else is not a reference: the buffered '&' is flushed (into the
+ * attribute value when the return state is an attribute value state,
+ * otherwise as character tokens) and the character is reconsumed in the
+ * return state.
+ */
+void
+tscref(void)
+{
+	s_reset(ctempbuf);
+	s_putc(ctempbuf, '&');
+	if ((ALPHA(tc)) || (DIGIT(tc))) {
+		treconsume = 1;
+		tstate = TSNCREF;
+		return;
+	}
+	if (tc == '#') {
+		s_putc(ctempbuf, tc);
+		tstate = TSNUMREF;
+		return;
+	}
+	treconsume = 1;
+	s_terminate(ctempbuf);
+	if (treturn == TSAVAL_DQ || treturn == TSAVAL_SQ || treturn == TSAVAL_UQ)
+		s_append(cattr->value, s_to_c(ctempbuf));
+	else
+		temitbuf(ctempbuf);
+	s_reset(ctempbuf);
+	tstate = treturn;
+}
+
+/*
+ * Named and numeric character reference states: placeholders.
+ * Each handler logs "<name> not implemented" to fd 2.  tsncref resumes the
+ * return state (treturn); the others set tstate to TSDATA.
+ */
+void
+tsncref(void)
+{
+ fprint(2, "tsncref not implemented\n");
+ tstate = treturn;
+}
+
+void
+tsamam(void)
+{
+ fprint(2, "tsamam not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsnumref(void)
+{
+ fprint(2, "tsnumref not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tshexrefstart(void)
+{
+ fprint(2, "tshexrefstart not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdecrefstart(void)
+{
+ fprint(2, "tsdecrefstart not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tshexref(void)
+{
+ fprint(2, "tshexref not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsdecref(void)
+{
+ fprint(2, "tsdecref not implemented\n");
+ tstate = TSDATA;
+}
+
+void
+tsnumrefend(void)
+{
+ fprint(2, "tsnumrefend not implemented\n");
+ tstate = TSDATA;
+}
+
+/*
+ * Script data end tag name state: letters extend the end tag name (and the
+ * temporary buffer) via talpha.  Whitespace, '/' and '>' finish the tag;
+ * '>' also emits it.  Any other character means this was not an end tag:
+ * "</" plus the buffered letters are emitted as text and the character is
+ * reconsumed in the script data state.
+ * NOTE(review): the "appropriate end tag token" test is still assumed true.
+ */
+void
+tsscriptendname(void)
+{
+	if (talpha(1) != 0)
+		return;
+	switch (tc) {
+	case '\t':
+	case '\n':
+	case '\r':
+	case ' ':
+		s_terminate(ctoken->name);
+		tstate = TSANAME_BEFORE;
+		return;
+	case '/':
+		s_terminate(ctoken->name);
+		tstate = TSSCSTAG;
+		return;
+	case '>':
+		s_terminate(ctoken->name);
+		tstate = TSDATA;
+		temit(ctoken);
+		return;
+	}
+	temit(chartok('<'));
+	temit(chartok('/'));
+	s_terminate(ctempbuf);
+	temitbuf(ctempbuf);
+	treconsume = 1;
+	tstate = TSSCRIPT;
+}
+
+
+/*
+ * Script data escape start state: a '-' is emitted and leads to the
+ * escape-start-dash state; anything else is reconsumed as script data.
+ */
+void
+tsscriptescstart(void)
+{
+	if (tc != '-') {
+		treconsume = 1;
+		tstate = TSSCRIPT;
+		return;
+	}
+	tstate = TSSCRIPT_ESC_START_DASH;
+	temit(chartok('-'));
+}
+
+
+/*
+ * Script data escape start dash state: a second '-' is emitted and leads
+ * to the escaped-dash-dash state; anything else is reconsumed as script data.
+ */
+void
+tsscriptescstartdash(void)
+{
+	if (tc != '-') {
+		treconsume = 1;
+		tstate = TSSCRIPT;
+		return;
+	}
+	tstate = TSSCRIPT_ESC_DDASH;
+	temit(chartok('-'));
+}
+
+
+/*
+ * Script data escaped state: emits characters, tracking '-' and '<'.
+ * End of input emits only an EOF token and sets teof.
+ */
+void
+tsscriptesc(void)
+{
+	switch (tc) {
+	case '-':
+		tstate = TSSCRIPT_ESC_DASH;
+		temit(chartok('-'));
+		break;
+	case '<':
+		tstate = TSSCRIPT_ESC_LESS;
+		break;
+	case '\0':
+		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
+		temit(chartok(REPCHAR));
+		break;
+	case -1: /* EOF */
+		fprint(2, "eof in scipt html comment like text parse error, tc='%c'\n", tc);
+		teof = 1;
+		temit(eoftok());
+		break;	/* do not fall through and emit the EOF marker as a character */
+	default:
+		temit(chartok(tc));
+	}
+}
+
+
+/*
+ * Script data escaped dash state: a second '-' moves to the dash-dash
+ * state, '<' to the escaped less-than state; other characters are emitted
+ * and return to the escaped state.  End of input emits EOF and sets teof.
+ */
+void
+tsscriptescdash(void)
+{
+	switch (tc) {
+	case '-':
+		tstate = TSSCRIPT_ESC_DDASH;
+		temit(chartok('-'));
+		break;
+	case '<':
+		tstate = TSSCRIPT_ESC_LESS;
+		break;
+	case '\0':
+		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
+		tstate = TSSCRIPT_ESC;
+		temit(chartok(REPCHAR));
+		break;
+	case -1:
+		fprint(2, "eof in script html comment like text parse error, tc='%c'\n", tc);
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		tstate = TSSCRIPT_ESC;
+		temit(chartok(tc));
+	}
+}
+
+
+/*
+ * Remaining script data escaped / double escaped states: placeholders.
+ * Each handler logs "<name> not implemented" to fd 2 and sets tstate to TSDATA.
+ */
+void
+tsscriptescddash(void)
+{
+ fprint(2, "tsscriptescddash not implemented\n");
+ tstate = TSDATA;
+}
+
+
+void
+tsscriptescless(void)
+{
+ fprint(2, "tsscriptescless not implemented\n");
+ tstate = TSDATA;
+}
+
+
+void
+tsscriptescendopen(void)
+{
+ fprint(2, "tsscriptescendopen not implemented\n");
+ tstate = TSDATA;
+}
+
+
+void
+tsscriptescendname(void)
+{
+ fprint(2, "tsscriptescendname not implemented\n");
+ tstate = TSDATA;
+}
+
+
+void
+tsscriptdescstart(void)
+{
+ fprint(2, "tsscriptdescstart not implemented\n");
+ tstate = TSDATA;
+}
+
+
+void
+tsscriptdesc(void)
+{
+ fprint(2, "tsscriptdesc not implemented\n");
+ tstate = TSDATA;
+}
+
+
+void
+tsscriptdescdash(void)
+{
+ fprint(2, "tsscriptdescdash not implemented\n");
+ tstate = TSDATA;
+}
+
+
+void
+tsscriptdescddash(void)
+{
+ fprint(2, "tsscriptdescddash not implemented\n");
+ tstate = TSDATA;
+}
+
+
+void
+tsscriptdescless(void)
+{
+ fprint(2, "tsscriptdescless not implemented\n");
+ tstate = TSDATA;
+}
+
+
+void
+tsscriptdescend(void)
+{
+ fprint(2, "tsscriptdescend not implemented\n");
+ tstate = TSDATA;
+}
+
+
+
+/*
+ * Script data end tag open state (after "</" inside script data).
+ * A letter starts a new end tag token and is reconsumed in the end tag
+ * name state; otherwise "</" is emitted as text and the character is
+ * reconsumed as script data.
+ */
+void
+tsscriptendopen(void)
+{
+	if (ALPHA(tc) != 0) {
+		/* the name state appends to ctoken, so it must be a fresh token */
+		ctoken = newtok(TEND);
+		treconsume = 1;
+		tstate = TSSCIRPT_END_NAME;
+	} else {
+		temit(chartok('<'));
+		temit(chartok('/'));
+		treconsume = 1;
+		tstate = TSSCRIPT;
+	}
+}
+
+/*
+ * Script data less-than sign state: '/' begins a possible end tag, '!'
+ * begins an escape sequence (emitting "<!"), anything else emits '<' and
+ * is reconsumed as script data.
+ */
+void
+tsscriptless(void)
+{
+	if (tc == '/') {
+		s_reset(ctempbuf);
+		tstate = TSSCRIPT_END_OPEN;
+		return;
+	}
+	if (tc == '!') {
+		tstate = TSSCRIPT_ESC_START;
+		temit(chartok('<'));
+		temit(chartok('!'));
+		return;
+	}
+	temit(chartok('<'));
+	treconsume = 1;
+	tstate = TSSCRIPT;
+}
+
+/*
+ * RAWTEXT end tag name state: letters extend the end tag name (and the
+ * temporary buffer) via talpha, as the RCDATA counterpart does.
+ * Whitespace, '/' and '>' finish the tag; '>' also emits it.  Any other
+ * character means this was not an end tag: "</" plus the buffered letters
+ * are emitted as text and the character is reconsumed in RAWTEXT.
+ * NOTE(review): the "appropriate end tag token" test is still assumed true.
+ */
+void
+tsrawtendname(void)
+{
+	if (talpha(1) != 0)
+		return;
+	switch (tc) {
+	case '\t':
+	case '\n':
+	case '\r':
+	case ' ':
+		s_terminate(ctoken->name);
+		tstate = TSANAME_BEFORE;
+		return;
+	case '/':
+		s_terminate(ctoken->name);
+		tstate = TSSCSTAG;
+		return;
+	case '>':
+		s_terminate(ctoken->name);
+		tstate = TSDATA;
+		temit(ctoken);
+		return;
+	}
+	temit(chartok('<'));
+	temit(chartok('/'));
+	s_terminate(ctempbuf);
+	temitbuf(ctempbuf);
+	treconsume = 1;
+	tstate = TSRAWT;
+}
+
+/*
+ * RAWTEXT end tag open state (after "</" inside RAWTEXT).
+ * A letter starts an end tag token and is reconsumed in the end tag name
+ * state; otherwise "</" is emitted as text and the character is reconsumed
+ * in RAWTEXT.
+ */
+void
+tsrawtendopen(void)
+{
+	if (ALPHA(tc) != 0) {
+		ctoken = newtok(TEND);
+		treconsume = 1;
+		/* only enter the name state when a handler is registered for it */
+		if (tstab[TSRAWT_END_NAME] != nil)
+			tstate = TSRAWT_END_NAME;
+		else
+			tstate = TSRAWT;
+	} else {
+		temit(chartok('<'));
+		temit(chartok('/'));
+		treconsume = 1;
+		tstate = TSRAWT;
+	}
+}
+
+/*
+ * RAWTEXT less-than sign state: '/' begins a possible end tag; anything
+ * else emits '<' and is reconsumed in RAWTEXT.
+ */
+void
+tsrawtless(void)
+{
+	if (tc == '/') {
+		s_reset(ctempbuf);
+		tstate = TSRAWT_END_OPEN;
+	} else {
+		temit(chartok('<'));
+		treconsume = 1;
+		/* without leaving this state the reconsumed character loops here forever */
+		tstate = TSRAWT;
+	}
+}
+
+/*
+ * RCDATA end tag name state: letters extend the end tag name (and the
+ * temporary buffer) via talpha.  Whitespace, '/' and '>' finish the tag;
+ * '>' emits the tag token.  Any other character means this was not an end
+ * tag: "</" plus the buffered letters are emitted as text and the
+ * character is reconsumed in RCDATA.
+ * NOTE(review): the "appropriate end tag token" test is still assumed true.
+ */
+void
+tsrcdtendname(void)
+{
+	if (talpha(1) != 0)
+		return;
+	switch (tc) {
+	case '\t':
+	case '\n':
+	case '\r':
+	case ' ':
+		s_terminate(ctoken->name);
+		tstate = TSANAME_BEFORE;
+		return;
+	case '/':
+		s_terminate(ctoken->name);
+		tstate = TSSCSTAG;
+		return;
+	case '>':
+		s_terminate(ctoken->name);
+		tstate = TSDATA;
+		temit(ctoken);	/* the end tag, not a '>' character */
+		return;
+	}
+	temit(chartok('<'));
+	temit(chartok('/'));
+	s_terminate(ctempbuf);
+	temitbuf(ctempbuf);
+	treconsume = 1;
+	tstate = TSRCDT;
+}
+
+/*
+ * RCDATA end tag open state (after "</" inside RCDATA).
+ * A letter starts an end tag token and is reconsumed in the end tag name
+ * state; otherwise "</" is emitted as text and the character is reconsumed
+ * in RCDATA.
+ */
+void
+tsrcdtendopen(void)
+{
+	if (ALPHA(tc) != 0) {
+		ctoken = newtok(TEND);
+		treconsume = 1;
+		tstate = TSRCDT_END_NAME;
+	} else {
+		treconsume = 1;
+		temit(chartok('<'));
+		temit(chartok('/'));
+		/* without leaving this state the reconsumed character loops here forever */
+		tstate = TSRCDT;
+	}
+}
+
+/*
+ * RCDATA less-than sign state: '/' begins a possible end tag; anything
+ * else emits '<' and is reconsumed in RCDATA.
+ */
+void
+tsrcdtless(void)
+{
+	switch (tc) {
+	case '/':
+		s_reset(ctempbuf);
+		tstate = TSRCDT_END_OPEN;
+		break;
+	default:
+		treconsume = 1;
+		temit(chartok('<'));
+		/* without leaving this state the reconsumed character loops here forever */
+		tstate = TSRCDT;
+	}
+}
+
+/*
+ * Tag name state: appends characters to ctoken->name (letters lower-cased
+ * by talpha) until whitespace, '/' or '>'; '>' emits the tag.
+ * End of input emits an EOF token and sets teof.
+ */
+void
+tstagname(void)
+{
+	switch (tc) {
+	case '\t':
+	case '\n':
+	case '\r':
+	case ' ':
+		s_terminate(ctoken->name);
+		tstate = TSANAME_BEFORE;
+		break;
+	case '/':
+		s_terminate(ctoken->name);
+		tstate = TSSCSTAG;
+		break;
+	case '>':
+		s_terminate(ctoken->name);
+		temit(ctoken);
+		tstate = TSDATA;
+		break;
+	case '\0':
+		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
+		s_putc(ctoken->name, REPCHAR);
+		break;
+	case -1:
+		fprint(2, "eof in tag parse error, tc='%c'\n", tc);
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		/* talpha only handles letters; digits etc. (e.g. "h1") belong to the name too */
+		if (talpha(1) == 0)
+			s_putc(ctoken->name, tc);
+	}
+}
+
+/*
+ * End tag open state (after "</" in data).  A letter starts an end tag
+ * token; '>' is a parse error that drops the empty tag; end of input emits
+ * the pending "</" as text followed by EOF; anything else starts a bogus
+ * comment.
+ */
+void
+tsetagopen(void)
+{
+	if (ALPHA(tc) != 0) {
+		ctoken = newtok(TEND);
+		treconsume = 1;
+		tstate = TSTAG_NAME;
+	} else switch (tc) {
+	case '>':
+		fprint(2, "missing end tag name parse error, tc='%c'\n", tc);
+		tstate = TSDATA;
+		break;
+	case -1:
+		fprint(2, "eof before tag name parse error, tc='%c'\n", tc);
+		temit(chartok('<'));
+		temit(chartok('/'));	/* both consumed characters are text */
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		fprint(2, "invalid first character of tag name parse error, tc='%c'\n", tc);
+		ctoken = newtok(TCOMM);
+		treconsume = 1;
+		tstate = TSBOGUS_COMMENT;
+	}
+}
+
+/*
+ * Tag open state (after '<' in data).  A letter starts a start tag token;
+ * '!' and '/' select the markup declaration and end tag states; '?' starts
+ * a bogus comment; end of input emits '<' followed by EOF; anything else
+ * emits '<' and is reconsumed as data.
+ */
+void
+tstagopen(void)
+{
+	if (ALPHA(tc) != 0) {
+		ctoken = newtok(TSTART);
+		treconsume = 1;
+		tstate = TSTAG_NAME;
+	} else switch (tc) {
+	case '!':
+		tstate = TSMKUP_OPEN;
+		break;
+	case '/':
+		tstate = TSETAG_OPEN;
+		break;
+	case '?':
+		fprint(2, "unexpected question mark instead of tag name parse error, tc='%c'\n", tc);
+		ctoken = newtok(TCOMM);
+		treconsume = 1;
+		tstate = TSBOGUS_COMMENT;
+		break;
+	case -1:
+		/* newline added so the message does not run into the next log line */
+		fprint(2, "eof before tag name parse error\n");
+		temit(chartok('<'));
+		teof = 1;
+		temit(eoftok());
+		break;
+	default:
+		fprint(2, "invalid first character of tag name parse error, tc='%c'\n", tc);
+		temit(chartok('<'));
+		treconsume = 1;
+		tstate = TSDATA;
+	}
+}
+
+/*
+ * PLAINTEXT state: every character is emitted as a character token (NUL
+ * as REPCHAR); end of input sets teof and emits EOF.
+ */
+void
+tsptxt(void)
+{
+	if (tc == '\0') {
+		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
+		temit(chartok(REPCHAR));
+	} else if (tc == -1) {	/* EOF */
+		teof = 1;
+		temit(eoftok());
+	} else {
+		temit(chartok(tc));
+	}
+}
+
+/*
+ * Script data state: '<' enters the script less-than state, NUL is emitted
+ * as REPCHAR, end of input sets teof and emits EOF, and everything else is
+ * emitted as a character token.
+ */
+void
+tsscript(void)
+{
+	if (tc == '<') {
+		tstate = TSSCRIPT_LESS;
+	} else if (tc == '\0') {
+		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
+		temit(chartok(REPCHAR));
+	} else if (tc == -1) {	/* EOF */
+		teof = 1;
+		temit(eoftok());
+	} else {
+		temit(chartok(tc));
+	}
+}
+
+/*
+ * RAWTEXT state: '<' enters the RAWTEXT less-than state, NUL is emitted as
+ * REPCHAR, end of input sets teof and emits EOF, and everything else is
+ * emitted as a character token.
+ */
+void
+tsrawt(void)
+{
+	if (tc == '<') {
+		tstate = TSRAWT_LESS;
+	} else if (tc == '\0') {
+		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
+		temit(chartok(REPCHAR));
+	} else if (tc == -1) {	/* EOF */
+		teof = 1;
+		temit(eoftok());
+	} else {
+		temit(chartok(tc));
+	}
+}
+
+/*
+ * RCDATA state: '&' enters the character reference state (returning
+ * here), '<' enters the RCDATA less-than state, NUL is emitted as REPCHAR,
+ * end of input sets teof and emits EOF, and everything else is emitted as
+ * a character token.
+ */
+void
+tsrcdt(void)
+{
+	if (tc == '&') {
+		treturn = TSRCDT;
+		tstate = TSCREF;
+	} else if (tc == '<') {
+		tstate = TSRCDT_LESS;
+	} else if (tc == '\0') {
+		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
+		temit(chartok(REPCHAR));
+	} else if (tc == -1) {	/* EOF */
+		teof = 1;
+		temit(eoftok());
+	} else {
+		temit(chartok(tc));
+	}
+}
+
+/*
+ * Data state: '&' enters the character reference state (returning here),
+ * '<' enters the tag open state, NUL is reported and emitted unchanged,
+ * end of input sets teof and emits EOF, and everything else is emitted as
+ * a character token.
+ */
+void
+tsdata(void)
+{
+	if (tc == '&') {
+		treturn = TSDATA;
+		tstate = TSCREF;
+	} else if (tc == '<') {
+		tstate = TSTAG_OPEN;
+	} else if (tc == '\0') {
+		fprint(2, "unexpected null character parse error, tc='%c'\n", tc);
+		temit(chartok(tc));
+	} else if (tc == -1) {	/* EOF */
+		teof = 1;
+		temit(eoftok());
+	} else {
+		temit(chartok(tc));
+	}
+}
+
+/*
+ * If tc is an ASCII letter, append it unchanged to ctempbuf and append it
+ * to ctoken->name, lower-cased first when tolower is non-zero (tc itself
+ * is updated to the lower-case form).  Returns 1 when tc was a letter and
+ * 0 otherwise, in which case nothing is modified.
+ */
+int
+talpha(int tolower)
+{
+	int upper;
+
+	upper = (tc >= 'A' && tc <= 'Z');
+	if (!upper && !(tc >= 'a' && tc <= 'z'))
+		return 0;
+	s_putc(ctempbuf, tc);
+	/* only real upper-case letters are folded */
+	if ((tolower != 0) && upper)
+		tc += 0x20;
+	s_putc(ctoken->name, tc);
+	return 1;
+}
+
+/*
+ * Load the next input character into tc.  A pending reconsume request
+ * keeps the current tc; otherwise the first byte of clookaheadbuf is taken
+ * (and removed) if there is one, else a byte is read with gc().
+ */
+void
+tconsume(void)
+{
+	char *buf;
+	String *shift;
+
+	if (treconsume != 0) {
+		treconsume = 0;
+		return;
+	}
+	buf = s_to_c(clookaheadbuf);
+	if (buf[0] != '\0') {
+		/* go through uchar so bytes >= 0x80 are not sign-extended into tc */
+		tc = (uchar)buf[0];
+		print("tc = %uX\n", tc);
+		/* TODO make this code utf-aware */
+		shift = s_copy(buf+1);
+		s_free(clookaheadbuf);
+		clookaheadbuf = shift;
+	}
+	else tc = gc();
+}
+
+/*
+ * Emit the contents of str as character tokens, one per rune decoded with
+ * chartorune.  str must be NUL-terminated.
+ */
+void
+temitbuf(String *str)
+{
+	Rune r;
+	char *p, *end;
+
+	p = s_to_c(str);
+	end = p + strlen(p);
+	while (p < end) {
+		p += chartorune(&r, p);
+		temit(chartok(r));
+	}
+}
+
+/* Send the token pointer t on outchannel. */
+void
+temit(Token *t)
+{
+ send(outchannel, &t);
+}
+
+/*
+ * Return the next byte read from file descriptor 0 as a value in 0..255,
+ * or -1 when read() returns no data (end of input or error).  Input is
+ * buffered GCBUF bytes at a time.
+ */
+int
+gc(void) /* getchar func name is reserved by stdio.h */
+{
+	#define GCBUF 1024
+	static uchar buf[GCBUF];
+	static long n = 0, pos = 0;
+
+	if (pos >= n) {
+		n = read(0, buf, GCBUF);
+		pos = 0;
+		if (n <= 0) {
+			n = 0;
+			return -1;
+		}
+	}
+	/* unsigned bytes: 0xFF must not be mistaken for the -1 end marker */
+	return buf[pos++];
+}
+
+/*
+ * Tokenizer thread entry point.  v points to a Tokctl whose channel c
+ * receives the tokens.  Runs the state machine, one input character per
+ * iteration, until a state handler sets teof.  If tstate has no handler an
+ * EOF token is emitted before stopping, so the receiver is not left waiting.
+ */
+void
+threadtokenize(void *v)
+{
+	Tokctl *ctl;	/* not named tc: that would shadow the current-character global */
+
+	ctl = v;
+	outchannel = ctl->c;
+	teof = 0;
+	threadsetname("tokenizer");
+	ctempbuf = s_new();
+	clookaheadbuf = s_new();
+	while (teof == 0) {
+		if (tstate < 0 || tstate >= TMAX || tstab[tstate] == nil) {
+			fprint(2, "[TOKENIZER] unknown tstate %d\n", tstate);
+			temit(eoftok());
+			break;
+		}
+		tconsume();
+		tstab[tstate]();
+	}
+}
diff --git a/tree.c b/tree.c
@@ -0,0 +1,103 @@
+#include <u.h>
+#include <libc.h>
+#include <String.h>
+#include <thread.h>
+
+#include "html5dom.h"
+
+/* Control block passed to threadtreeconstr(); supplies the token channel (in) and treeroot path. */
+Treeconstrctl *tctl;
+/* Text accumulated by pushchar() for the pending text node. */
+String *tstr;
+/* Name of the pending text node, or nil when no text is pending. */
+char *tnode;
+
+/*
+ * Create <treeroot>/<strnode>/<strfile> and write n bytes of data followed
+ * by a newline to it.  Failure to create the file is fatal.
+ */
+void
+nwrite(char *strnode, char *strfile, char *data, long n)
+{
+	char *fname;
+	int out;
+
+	fname = smprint("%s/%s/%s", tctl->treeroot, strnode, strfile);
+	out = create(fname, OWRITE, 0);
+	if (out < 0)
+		sysfatal("failed to create %s, %r", fname);
+	free(fname);
+	write(out, data, n);
+	write(out, "\n", 1);
+	close(out);
+}
+
+/*
+ * Read a node name from <treeroot>/new and return it as a freshly
+ * allocated, NUL-terminated string with any trailing newline removed.
+ * The caller frees the result.  Failure to open or read the file is fatal.
+ */
+char*
+newnode(void)
+{
+	enum { Nodelen = 64 };
+	int fd;
+	long n;
+	char *strnew, *strnode;
+
+	strnode = mallocz(Nodelen, 1);
+	if (strnode == nil)
+		sysfatal("out of memory, %r");
+	strnew = smprint("%s/new", tctl->treeroot);
+	fd = open(strnew, OREAD);
+	if (fd < 0) sysfatal("failed to open %s, %r", strnew);
+	/* leave the last byte as the terminator */
+	n = read(fd, strnode, Nodelen - 1);
+	close(fd);
+	/* n <= 0 would index before the buffer below */
+	if (n <= 0) sysfatal("failed to read %s, %r", strnew);
+	free(strnew);
+	if (strnode[n-1] == '\n') strnode[n-1] = '\0';
+	return strnode;
+}
+
+/*
+ * Append c to the pending text.  When no text node is pending, one is
+ * created first: a new node is obtained, typed "text", and tstr is reset
+ * to an empty String.
+ */
+void
+pushchar(Rune c)
+{
+	if (tnode != nil) {
+		s_putc(tstr, c);
+		return;
+	}
+	tnode = newnode();
+	nwrite(tnode, "type", "text", 4);
+	tstr = s_new();
+	s_putc(tstr, c);
+}
+
+/*
+ * Write the accumulated text to the pending text node's "text" file and
+ * release the pending state (text buffer and node name).  Does nothing
+ * when no text node is pending.
+ */
+void
+pushtext(void)
+{
+	if (tnode == nil)
+		return;
+	s_terminate(tstr);
+	nwrite(tnode, "text", s_to_c(tstr), strlen(s_to_c(tstr)));
+	s_free(tstr);
+	tstr = nil;
+	/* the name was allocated by newnode() */
+	free(tnode);
+	tnode = nil;
+}
+
+/*
+ * Tree construction thread entry point.  v points to a Treeconstrctl.
+ * Receives tokens from tctl->in until TEOF and materialises them as nodes:
+ * DOCTYPE and start tags become nodes with "type" and "name" files,
+ * character tokens are collected into text nodes.  Pending text is flushed
+ * before any other node is created and at end of input.  Every received
+ * token is freed here.
+ */
+void
+threadtreeconstr(void *v)
+{
+	char *strnode;
+	int teof;
+	Token *tok;
+
+	teof = 0;
+	tctl = v;
+	tok = nil;
+	threadsetname("treeconstr");
+	while(teof == 0){
+		recv(tctl->in, &tok);
+		switch(tok->type){
+		case TDOCT:
+			if (tnode != nil) pushtext();
+			strnode = newnode();
+			nwrite(strnode, "type", "doctype", 7);
+			nwrite(strnode, "name", s_to_c(tok->name),
+				strlen(s_to_c(tok->name)));
+			free(strnode);
+			break;
+		case TSTART:
+			if (tnode != nil) pushtext();
+			strnode = newnode();
+			nwrite(strnode, "type", "element", 7);
+			nwrite(strnode, "name", s_to_c(tok->name),
+				strlen(s_to_c(tok->name)));
+			free(strnode);
+			break;
+		case TEND:
+			if (tnode != nil) pushtext();
+			break;
+		case TCHAR:
+			pushchar(tok->c);
+			break;
+		case TEOF:
+			/* trailing text would otherwise never be written */
+			if (tnode != nil) pushtext();
+			teof = 1;
+			break;
+		default:
+			break;
+		}
+		t_free(tok);
+	}
+}