diff --git a/documentation/tokenator.zig b/documentation/tokenator.zig
index 5c51af7..954b4a3 100644
--- a/documentation/tokenator.zig
+++ b/documentation/tokenator.zig
@@ -8,37 +8,83 @@ const mem = std.mem;
 const fs = std.fs;
 const print = std.debug.print;
 
-fn write_escaped(out: anytype, input: []const u8) !void {
-    var comment = false;
+inline fn escape_char(out: anytype, char: u8) !void {
+    return try switch (char) {
+        '&' => out.writeAll("&amp;"),
+        '<' => out.writeAll("&lt;"),
+        '>' => out.writeAll("&gt;"),
+        '"' => out.writeAll("&quot;"),
+        else => out.writeByte(char),
+    };
+}
 
-    for (input, 0..) |c, idx| {
-        switch (c) {
-            '&' => try out.writeAll("&amp;"),
-            '<' => try out.writeAll("&lt;"),
-            '>' => try out.writeAll("&gt;"),
-            '"' => try out.writeAll("&quot;"),
-            '\n' => {
-                if (comment) {
-                    try out.writeAll("</span>");
-                    comment = false;
-                }
-                try out.writeAll("\n");
+fn write_escaped(out: anytype, input: []const u8, class: TokenClass) !void {
+    if (class == .whitespace) {
+        try write_whitespace(out, input);
+    } else {
+        for (input) |c| try escape_char(out, c);
+    }
+}
+
+fn write_whitespace(out: anytype, input: []const u8) !void {
+    var state: enum { normal, maybe_comment, maybe_docstring, comment } = .normal;
+
+    for (input) |c| {
+        switch (state) {
+            .normal => switch (c) {
+                '/' => state = .maybe_comment,
+                '\n' => try out.writeAll("\n"),
+                else => try escape_char(out, c),
             },
-            '/' => {
-                if (input[idx + 1] == '/') {
-                    try out.writeAll("<span class=\"comment\">");
-                    comment = true;
-                }
-                try out.writeByte('/');
+            .maybe_comment => switch (c) {
+                '/' => {
+                    state = .maybe_docstring;
+                },
+                '\n' => {
+                    try out.writeAll("\n");
+                    state = .normal;
+                },
+                else => {
+                    try out.writeByte('/');
+                    try escape_char(out, c);
+                    state = .normal;
+                },
+            },
+            .maybe_docstring => switch (c) {
+                '\n' => {
+                    // actually it was an empty comment lol cool
+                    try out.writeAll("//\n");
+                    state = .normal;
+                },
+                '/', '!' => {
+                    // it is a docstring, so don't respan it
+                    try out.writeAll("//");
+                    try out.writeByte(c);
+                    state = .normal;
+                },
+                else => {
+                    // this is also a comment
+                    try out.writeAll("<span class=\"comment\">//");
+                    try escape_char(out, c);
+                    state = .comment;
+                },
+            },
+            .comment => switch (c) {
+                '\n' => {
+                    try out.writeAll("</span>\n");
+                    state = .normal;
+                },
+                else => {
+                    try escape_char(out, c);
+                },
             },
-            else => try out.writeByte(c),
         }
     }
 }
 
 // TODO: use more context to get better token resolution
 // identifier preceded by (break | continue) colon is a label
-// identifier followed by colon (inline | for | while | {) is a label
+// identifier followed by colon (inline | for | while | l_brace) is a label
 //
 // identifier preceded by dot, not preceded by name, and followed by (, | => | == | != | rbrace | rparen | and | or | ;) is an enum literal
 // identifier preceded by dot and followed by = is a struct field initializer
@@ -51,20 +97,253 @@ fn write_escaped(out: anytype, input: []const u8) !void {
 // identifier followed by { is a type
 // identifier after | is a bind
 
-pub fn write_tokenized_html(src: [:0]const u8, _: std.mem.Allocator, out: anytype) !void {
-    try out.writeAll("<pre>");
+const ContextToken = struct {
+    tag: std.zig.Token.Tag,
+    content: []const u8,
+    class: TokenClass = .needs_context,
+};
+
+const TokenClass = enum {
+    keyword,
+    string,
+    builtin,
+    type,
+    function,
+    label,
+    doc_comment,
+    literal_primitive,
+    literal_number,
+    symbology,
+    whitespace,
+    context_free,
+
+    needs_context,
+
+    pub fn name(self: @This()) []const u8 {
+        return switch (self) {
+            .doc_comment => "doc comment",
+            .literal_primitive => "literal primitive",
+            .literal_number => "literal number",
+            .symbology => "",
+            .context_free => "",
+            .whitespace => "",
+            .needs_context => @panic("too late"),
+            else => @tagName(self),
+        };
+    }
+};
+
+pub const ContextManager = struct {
+    // const Queue = std.TailQueue(ContextToken);
+
+    tokens: std.ArrayList(ContextToken),
+    allocator: std.mem.Allocator,
+
+    pub fn init(allocator: std.mem.Allocator) @This() {
+        return .{
+            .allocator = allocator,
+            .tokens = std.ArrayList(ContextToken).init(allocator),
+        };
+    }
+
+    pub fn deinit(self: *@This()) void {
+        self.tokens.deinit();
+    }
+
+    pub fn push_back(self: *@This(), token: ContextToken) !void {
+        try self.tokens.append(token);
+    }
+
+    fn print_span(content: []const u8, class: TokenClass, out: anytype) !void {
+        const classname = class.name();
+
+        if (classname.len > 0) {
+            try out.print("<span class=\"{s}\">", .{classname});
+            try write_escaped(out, content, class);
+            try out.writeAll("</span>");
+        } else {
+            try write_escaped(out, content, class);
+        }
+    }
+
+    fn print_fused_span(tokens: []ContextToken, start: usize, end: usize, out: anytype) !void {
+        const classname = tokens[start].class.name();
+
+        if (classname.len > 0) try out.print("<span class=\"{s}\">", .{classname});
+
+        for (tokens[start..end]) |*token| {
+            try write_escaped(out, token.content, tokens[start].class);
+        }
+
+        if (classname.len > 0) try out.writeAll("</span>");
+    }
+
+    pub fn process(self: *@This(), out: anytype) !void {
+        const tokens = self.tokens.items;
+        if (tokens.len == 0) return;
+
+        for (tokens, 0..) |*token, idx| {
+            if (token.class == .needs_context)
+                if (!contextualize_identifier(tokens, idx)) @panic("failed to context");
+        }
+
+        var idx: usize = 0;
+        while (idx < tokens.len) : (idx += 1) {
+            const span_start = idx;
+            const token = &tokens[idx];
+            // std.debug.print("tok {d}: {s} {}\n", .{ idx, token.content, token.class });
+
+            var lookahead = idx + 1;
+            while (lookahead < tokens.len) : (lookahead += 1) {
+                // std.debug.print("look {d}: {s} {}\n", .{ lookahead, tokens[lookahead].content, tokens[lookahead].class });
+                if (tokens[lookahead].class != .whitespace) {
+                    if (tokens[lookahead].class == token.class)
+                        idx = lookahead
+                    else
+                        break;
+                } else {
+                    if (std.mem.containsAtLeast(u8, tokens[lookahead].content, 1, "\n")) break;
+                }
+            }
+            if (idx > span_start) {
+                try print_fused_span(tokens, span_start, idx + 1, out);
+            } else {
+                try print_span(token.content, token.class, out);
+            }
+        }
+    }
+
+    fn contextualize_identifier(tokens: []ContextToken, current: usize) bool {
+        return (contextualize_function(tokens, current) or
+            contextualize_builtin_type(tokens, current) or
+            contextualize_label(tokens, current) or
+            contextualize_fallback(tokens, current));
+    }
+
+    fn contextualize_function(tokens: []ContextToken, current: usize) bool {
+        const prev = prev_valid(tokens, current) orelse return false;
+
+        if (tokens[prev].tag == .keyword_fn) {
+            tokens[current].class = .function;
+            return true;
+        }
+
+        return false;
+    }
+
+    fn contextualize_builtin_type(tokens: []ContextToken, current: usize) bool {
+        const content = tokens[current].content;
+
+        const is_int = blk: {
+            if ((content[0] != 'i' and content[0] != 'u') or content.len < 2 or content.len > 6)
+                break :blk false;
+
+            for (content[1..]) |char|
+                if (char < '0' or char > '9') break :blk false;
+
+            break :blk true;
+        };
+
+        if (is_int or is_type(content)) {
+            tokens[current].class = .type;
+            return true;
+        }
+
+        return false;
+    }
+
+    fn contextualize_label(tokens: []ContextToken, current: usize) bool {
+        blk: {
+            const prev = prev_valid(tokens, current) orelse break :blk;
+
+            if (tokens[prev].tag == .colon) {
+                const prev2 = prev_valid(tokens, prev) orelse break :blk;
+
+                switch (tokens[prev2].tag) {
+                    .keyword_break, .keyword_continue => {
+                        tokens[prev].class = .label;
+                        tokens[current].class = .label;
+                        return true;
+                    },
+                    else => break :blk,
+                }
+            }
+        }
+
+        blk: {
+            const next = next_valid(tokens, current) orelse break :blk;
+
+            if (tokens[next].tag == .colon) {
+                const next2 = next_valid(tokens, next) orelse break :blk;
+
+                switch (tokens[next2].tag) {
+                    .keyword_inline, .keyword_for, .keyword_while, .l_brace => {
+                        tokens[current].class = .label;
+                        tokens[next].class = .label;
+                        return true;
+                    },
+                    else => break :blk,
+                }
+            }
+        }
+
+        return false;
+    }
+
+    fn contextualize_fallback(tokens: []ContextToken, current: usize) bool {
+        tokens[current].class = .context_free;
+        return true;
+    }
+
+    fn next_valid(tokens: []ContextToken, current: usize) ?usize {
+        var check = current + 1;
+        while (check < tokens.len) : (check += 1) {
+            if (tokens[check].class != .whitespace) return check;
+        }
+        return null;
+    }
+
+    fn prev_valid(tokens: []ContextToken, current: usize) ?usize {
+        if (current == 0) return null;
+
+        var check = current - 1;
+        while (check > 0) : (check -= 1) {
+            if (tokens[check].class != .whitespace) return check;
+        }
+        if (tokens[check].class != .whitespace) return check;
+        return null;
+    }
+};
+
+pub fn trimZ(comptime T: type, input: [:0]T, trimmer: []const T) [:0]T {
+    var begin: usize = 0;
+    var end: usize = input.len;
+    while (begin < end and std.mem.indexOfScalar(T, trimmer, input[begin]) != null) : (begin += 1) {}
+    while (end > begin and std.mem.indexOfScalar(T, trimmer, input[end - 1]) != null) : (end -= 1) {}
+    input[end] = 0;
+    return input[begin..end :0];
+}
+
+pub fn write_tokenized_html(raw_src: [:0]u8, allocator: std.mem.Allocator, out: anytype, full: bool) !void {
+    const src = trimZ(u8, raw_src, "\n");
     var tokenizer = std.zig.Tokenizer.init(src);
-    var index: usize = 0;
-    var next_tok_is_fn = false;
+    var last_token_end: usize = 0;
+
+    if (full) try out.writeAll(html_preamble);
+    try out.writeAll("
");
+    var manager = ContextManager.init(allocator);
+    defer manager.deinit();
+
     while (true) {
-        const prev_tok_was_fn = next_tok_is_fn;
-        next_tok_is_fn = false;
-
         const token = tokenizer.next();
-        // short circuit on EOF to avoid
-        if (token.tag == .eof) break;
+        if (last_token_end < token.loc.start) {
+            try manager.push_back(.{
+                .tag = .invalid, // TODO: this is a big hack
+                .content = src[last_token_end..token.loc.start],
+                .class = .whitespace,
+            });
+        }
 
-        try write_escaped(out, src[index..token.loc.start]);
         switch (token.tag) {
             .eof => break,
 
@@ -116,83 +395,72 @@ pub fn write_tokenized_html(src: [:0]const u8, _: std.mem.Allocator, out: anytyp
             .keyword_allowzero,
             .keyword_while,
             .keyword_anytype,
-            => {
-                try out.writeAll("<span class=\"keyword\">");
-                try write_escaped(out, src[token.loc.start..token.loc.end]);
-                try out.writeAll("</span>");
-            },
+            .keyword_fn,
+            => try manager.push_back(.{
+                .tag = token.tag,
+                .content = src[token.loc.start..token.loc.end],
+                .class = .keyword,
+            }),
 
-            .keyword_fn => {
-                try out.writeAll("<span class=\"keyword\">");
-                try write_escaped(out, src[token.loc.start..token.loc.end]);
-                try out.writeAll("</span>");
-                next_tok_is_fn = true;
-            },
-
-            .string_literal, .char_literal => {
-                try out.writeAll("<span class=\"string\">");
-                try write_escaped(out, src[token.loc.start..token.loc.end]);
-                try out.writeAll("</span>");
-            },
+            .string_literal,
+            .char_literal,
+            => try manager.push_back(.{
+                .tag = token.tag,
+                .content = src[token.loc.start..token.loc.end],
+                .class = .string,
+            }),
 
             .multiline_string_literal_line => {
-                // multiline string literals contain a newline
-                try out.writeAll("<span class=\"string\">");
-                try write_escaped(out, src[token.loc.start .. token.loc.end - 1]);
-                try out.writeAll("</span>\n");
+                try manager.push_back(.{
+                    .tag = token.tag,
+                    .content = src[token.loc.start .. token.loc.end - 1],
+                    .class = .string,
+                });
+                // multiline string literals contain a newline, but we don't want to
+                // tokenize it like that.
+                try manager.push_back(.{
+                    .tag = .invalid,
+                    .content = src[token.loc.end - 1 .. token.loc.end],
+                    .class = .whitespace,
+                });
             },
 
-            .builtin => {
-                try out.writeAll("<span class=\"builtin\">");
-                try write_escaped(out, src[token.loc.start..token.loc.end]);
-                try out.writeAll("</span>");
-            },
+            .builtin => try manager.push_back(.{
+                .tag = token.tag,
+                .content = src[token.loc.start..token.loc.end],
+                .class = .builtin,
+            }),
 
             .doc_comment,
             .container_doc_comment,
             => {
-                try out.writeAll("<span class=\"comment\">");
-                try write_escaped(out, src[token.loc.start..token.loc.end]);
-                try out.writeAll("</span>");
+                try manager.push_back(.{
+                    .tag = token.tag,
+                    .content = src[token.loc.start..token.loc.end],
+                    .class = .doc_comment,
+                });
             },
 
             .identifier => {
-                if (prev_tok_was_fn) {
-                    try out.writeAll("<span class=\"function\">");
-                    try write_escaped(out, src[token.loc.start..token.loc.end]);
-                    try out.writeAll("</span>");
-                    print("function: {s}\n", .{src[token.loc.start..token.loc.end]});
-                } else {
-                    print("identifier: {s}\n", .{src[token.loc.start..token.loc.end]});
-                    const is_int = blk: {
-                        if (src[token.loc.start] != 'i' and src[token.loc.start] != 'u')
-                            break :blk false;
-                        var i = token.loc.start + 1;
-                        if (i == token.loc.end)
-                            break :blk false;
-                        while (i != token.loc.end) : (i += 1) {
-                            if (src[i] < '0' or src[i] > '9')
-                                break :blk false;
-                        }
-                        break :blk true;
-                    };
-                    if (is_int or is_type(src[token.loc.start..token.loc.end])) {
-                        try out.writeAll("<span class=\"type\">");
-                        try write_escaped(out, src[token.loc.start..token.loc.end]);
-                        try out.writeAll("</span>");
-                    } else {
-                        try out.writeAll("<span class=\"identifier\">");
-                        try write_escaped(out, src[token.loc.start..token.loc.end]);
-                        try out.writeAll("</span>");
-                    }
-                }
+                const content = src[token.loc.start..token.loc.end];
+                try manager.push_back(.{
+                    .tag = token.tag,
+                    .content = content,
+                    .class = if (mem.eql(u8, content, "undefined") or
+                        mem.eql(u8, content, "null") or
+                        mem.eql(u8, content, "true") or
+                        mem.eql(u8, content, "false"))
+                        .literal_primitive
+                    else
+                        .needs_context,
+                });
             },
 
-            .number_literal => {
-                try out.writeAll("<span class=\"number\">");
-                try write_escaped(out, src[token.loc.start..token.loc.end]);
-                try out.writeAll("</span>");
-            },
+            .number_literal => try manager.push_back(.{
+                .tag = token.tag,
+                .content = src[token.loc.start..token.loc.end],
+                .class = .literal_number,
+            }),
 
             .bang,
             .pipe,
@@ -256,21 +524,24 @@ pub fn write_tokenized_html(src: [:0]const u8, _: std.mem.Allocator, out: anytyp
             .asterisk_pipe_equal,
             .angle_bracket_angle_bracket_left_pipe,
             .angle_bracket_angle_bracket_left_pipe_equal,
-            => {
-                // try out.writeAll("");
-                try write_escaped(out, src[token.loc.start..token.loc.end]);
-                // try out.writeAll("");
-            },
-            .invalid, .invalid_periodasterisks => return parseError(
-                src,
-                token,
-                "syntax error",
-                .{},
-            ),
+            => try manager.push_back(.{
+                .tag = token.tag,
+                .content = src[token.loc.start..token.loc.end],
+                .class = .symbology,
+            }),
+
+            .invalid,
+            .invalid_periodasterisks,
+            => return parseError(src, token, "syntax error", .{}),
         }
-        index = token.loc.end;
+
+        last_token_end = token.loc.end;
     }
+
+    try manager.process(out);
+
     try out.writeAll("
"); + if (full) try out.writeAll(html_epilogue); } // TODO: this function returns anyerror, interesting @@ -322,6 +593,7 @@ const Location = struct { line_start: usize, line_end: usize, }; + fn getTokenLocation(src: []const u8, token: std.zig.Token) Location { var loc = Location{ .line = 0, @@ -346,6 +618,53 @@ fn getTokenLocation(src: []const u8, token: std.zig.Token) Location { return loc; } +const html_preamble = + \\ + \\ + \\ + \\ + \\ + \\ +; + +const html_epilogue = + \\ + \\ + \\ +; + const tokenator = cmd: { var cmd = noclip.CommandBuilder(TokCtx){ .description = @@ -353,7 +672,10 @@ const tokenator = cmd: { \\ \\Each file provided on the command line will be tokenized and the output will \\be written to [filename].html. For example, 'tokenator foo.zig bar.zig' will - \\write foo.zig.html and bar.zig.html + \\write foo.zig.html and bar.zig.html. Files are written directly, and if an + \\error occurs while processing a file, partial output will occur. When + \\processing multiple files, a failure will exit without processing any + \\successive files. Inputs should be less than 1MB in size. \\ \\If the --stdout flag is provided, output will be written to the standard \\output instead of to named files. Each file written to stdout will be @@ -366,6 +688,12 @@ const tokenator = cmd: { .default = false, .description = "write output to stdout instead of to files", }); + cmd.simple_flag(.{ + .name = "full", + .truthy = .{ .short_tag = "-f", .long_tag = "--full" }, + .default = false, + .description = "write full HTML files rather than just the pre fragment", + }); cmd.add_argument(.{ .OutputType = []const u8, .multi = true }, .{ .name = "files" }); break :cmd cmd; }; @@ -390,14 +718,10 @@ fn tokenize_files(context: *TokCtx, parameters: tokenator.Output()) !void { 0, ); }; - defer context.allocator.free(srcbuf); - var writebuf = std.ArrayList(u8).init(context.allocator); - defer writebuf.deinit(); - if (parameters.write_stdout) { - try write_tokenized_html(srcbuf, context.allocator, stdout); + try write_tokenized_html(srcbuf, context.allocator, stdout, parameters.full); try stdout.writeByte(0); } else { const outname = try std.mem.join(context.allocator, ".", &[_][]const u8{ file_name, "html" }); @@ -407,7 +731,7 @@ fn tokenize_files(context: *TokCtx, parameters: tokenator.Output()) !void { const output = try fs.cwd().createFile(outname, .{}); defer output.close(); - try write_tokenized_html(srcbuf, context.allocator, output.writer()); + try write_tokenized_html(srcbuf, context.allocator, output.writer(), parameters.full); } } }