// this borrows code from zig-doctest
// zig-doctest is distributed under the MIT license Copyright (c) 2020 Loris Cro
// see: https://github.com/kristoff-it/zig-doctest/blob/db507d803dd23e2585166f5b7e479ffc96d8b5c9/LICENSE

const noclip = @import("noclip");
const std = @import("std");
const mem = std.mem;
const fs = std.fs;
const print = std.debug.print;

// escape a single byte for inclusion in HTML text
inline fn escape_char(out: anytype, char: u8) !void {
    return try switch (char) {
        '&' => out.writeAll("&amp;"),
        '<' => out.writeAll("&lt;"),
        '>' => out.writeAll("&gt;"),
        '"' => out.writeAll("&quot;"),
        else => out.writeByte(char),
    };
}

fn write_escaped(out: anytype, input: []const u8, class: TokenClass) !void {
    if (class == .whitespace) {
        try write_whitespace(out, input);
    } else {
        for (input) |c| try escape_char(out, c);
    }
}

// whitespace tokens are synthesized from the gaps between real tokens, so they
// may contain ordinary comments; scan for those here and wrap them in spans
fn write_whitespace(out: anytype, input: []const u8) !void {
    var state: enum { normal, maybe_comment, maybe_docstring, comment } = .normal;

    for (input) |c| {
        switch (state) {
            .normal => switch (c) {
                '/' => state = .maybe_comment,
                '\n' => try out.writeAll("\n"),
                else => try escape_char(out, c),
            },
            .maybe_comment => switch (c) {
                '/' => state = .maybe_docstring,
                '\n' => {
                    // emit the pending '/' that turned out not to start a comment
                    try out.writeAll("/\n");
                    state = .normal;
                },
                else => {
                    try out.writeByte('/');
                    try escape_char(out, c);
                    state = .normal;
                },
            },
            .maybe_docstring => switch (c) {
                '\n' => {
                    // actually it was an empty comment lol cool
                    try out.writeAll("<span class=\"comment\">//</span>\n");
                    state = .normal;
                },
                '/', '!' => {
                    // it is a docstring, so don't respan it
                    try out.writeAll("//");
                    try out.writeByte(c);
                    state = .normal;
                },
                else => {
                    // this is also a comment
                    try out.writeAll("<span class=\"comment\">//");
                    try escape_char(out, c);
                    state = .comment;
                },
            },
            .comment => switch (c) {
                '\n' => {
                    try out.writeAll("</span>\n");
                    state = .normal;
                },
                else => try escape_char(out, c),
            },
        }
    }

    // if the input ends mid-comment (e.g. a trailing comment whose newline was
    // trimmed), flush the leftover state so no span is left unclosed
    switch (state) {
        .normal => {},
        .maybe_comment => try out.writeByte('/'),
        .maybe_docstring => try out.writeAll("<span class=\"comment\">//</span>"),
        .comment => try out.writeAll("</span>"),
    }
}

// TODO: use more context to get better token resolution
//
// identifier preceded by dot, not preceded by name, and followed by
// (, | => | == | != | rbrace | rparen | and | or | ;) is an enum literal
//
// identifier followed by ( is always a function call
//
// identifier preceded by : is a type until = or , or ) (except after [, where it's the terminator)
// identifier followed by { is a type
// identifier after | is a bind

const ContextToken = struct {
    tag: std.zig.Token.Tag,
    content: []const u8,
    class: TokenClass = .needs_context,
};

const TokenClass = enum {
    keyword,
    string,
    builtin,
    type,
    function,
    label,
    doc_comment,
    literal_primitive,
    literal_number,
    literal_enum,
    field_name,
    symbology,
    whitespace,
    context_free,
    needs_context,

    // classes that map to "" are emitted without a wrapping span
    pub fn name(self: @This()) []const u8 {
        return switch (self) {
            .doc_comment => "doc comment",
            .literal_primitive => "literal primitive",
            .literal_number => "literal number",
            .literal_enum => "literal enum",
            .field_name => "field-name",
            .symbology => "",
            .context_free => "",
            .whitespace => "",
            .needs_context => @panic("too late"),
            else => @tagName(self),
        };
    }
};

pub const ContextManager = struct {
    // const Queue = std.TailQueue(ContextToken);

    tokens: std.ArrayList(ContextToken),
    allocator: std.mem.Allocator,

    pub fn init(allocator: std.mem.Allocator) @This() {
        return .{
            .allocator = allocator,
            .tokens = std.ArrayList(ContextToken).init(allocator),
        };
    }

    pub fn deinit(self: *@This()) void {
        self.tokens.deinit();
    }

    pub fn push_back(self: *@This(), token: ContextToken) !void {
        try self.tokens.append(token);
    }

    fn print_span(content: []const u8, class: TokenClass, out: anytype) !void {
        const classname = class.name();
        if (classname.len > 0) {
            try out.print("<span class=\"{s}\">", .{classname});
            try write_escaped(out, content, class);
            try out.writeAll("</span>");
        } else {
            try write_escaped(out, content, class);
        }
    }

    fn print_fused_span(tokens: []ContextToken, start: usize, end: usize, out: anytype) !void {
        const classname = tokens[start].class.name();
        if (classname.len > 0)
            try out.print("<span class=\"{s}\">", .{classname});

        for (tokens[start..end]) |*token| {
            try write_escaped(out, token.content, tokens[start].class);
        }

        if (classname.len > 0)
            try out.writeAll("</span>");
    }

    pub fn process(self: *@This(), out: anytype) !void {
        const tokens = self.tokens.items;
        if (tokens.len == 0) return;

        // first pass: resolve every identifier that still needs context
        for (tokens, 0..) |*token, idx| {
            if (token.class == .needs_context)
                if (!contextualize_identifier(tokens, idx)) @panic("failed to context");
        }

        // second pass: emit spans, fusing adjacent tokens of the same class
        // (whitespace between them is absorbed as long as it has no newline)
        var idx: usize = 0;
        while (idx < tokens.len) : (idx += 1) {
            const span_start = idx;
            const token = &tokens[idx];

            var lookahead = idx + 1;
            while (lookahead < tokens.len) : (lookahead += 1) {
                if (tokens[lookahead].class != .whitespace) {
                    if (tokens[lookahead].class == token.class)
                        idx = lookahead
                    else
                        break;
                } else {
                    if (std.mem.containsAtLeast(u8, tokens[lookahead].content, 1, "\n")) break;
                }
            }

            if (idx > span_start) {
                try print_fused_span(tokens, span_start, idx + 1, out);
            } else {
                try print_span(token.content, token.class, out);
            }
        }
    }

    fn contextualize_identifier(tokens: []ContextToken, current: usize) bool {
        return (contextualize_function(tokens, current) or
            contextualize_builtin_type(tokens, current) or
            contextualize_label(tokens, current) or
            contextualize_struct_field(tokens, current) or
            contextualize_fallback(tokens, current));
    }

    fn contextualize_function(tokens: []ContextToken, current: usize) bool {
        const prev = prev_valid(tokens, current) orelse return false;

        // `fn name`
        if (tokens[prev].tag == .keyword_fn) {
            tokens[current].class = .function;
            return true;
        }

        // `name(` is treated as a call
        if (current < tokens.len - 1 and tokens[current + 1].tag == .l_paren) {
            tokens[current].class = .function;
            return true;
        }

        return false;
    }

    fn contextualize_builtin_type(tokens: []ContextToken, current: usize) bool {
        const content = tokens[current].content;

        // arbitrary-width integer type names: i7, u32, etc.
        const is_int = blk: {
            if ((content[0] != 'i' and content[0] != 'u') or content.len < 2 or content.len > 6)
                break :blk false;

            for (content[1..]) |char|
                if (char < '0' or char > '9') break :blk false;

            break :blk true;
        };

        if (is_int or is_type(content)) {
            tokens[current].class = .type;
            return true;
        }

        return false;
    }

    fn contextualize_label(tokens: []ContextToken, current: usize) bool {
        // `break :label` / `continue :label`
        blk: {
            const prev = prev_valid(tokens, current) orelse break :blk;
            if (tokens[prev].tag == .colon) {
                const prev2 = prev_valid(tokens, prev) orelse break :blk;
                switch (tokens[prev2].tag) {
                    .keyword_break, .keyword_continue => {
                        tokens[prev].class = .label;
                        tokens[current].class = .label;
                        return true;
                    },
                    else => break :blk,
                }
            }
        }

        // `label: {` / `label: while` / `label: for` / `label: inline ...`
        blk: {
            const next = next_valid(tokens, current) orelse break :blk;
            if (tokens[next].tag == .colon) {
                const next2 = next_valid(tokens, next) orelse break :blk;
                switch (tokens[next2].tag) {
                    .keyword_inline, .keyword_for, .keyword_while, .l_brace => {
                        tokens[current].class = .label;
                        tokens[next].class = .label;
                        return true;
                    },
                    else => break :blk,
                }
            }
        }

        return false;
    }

    fn contextualize_struct_field(tokens: []ContextToken, current: usize) bool {
        if (current == 0) return false;
        if (tokens[current - 1].tag != .period) return false;

        const precursor = prev_valid(tokens, current - 1) orelse return false;
        const successor = next_valid(tokens, current) orelse return false;
        // `.name =` following `{` or `,` is a struct field initializer
        if ((tokens[precursor].tag == .l_brace or tokens[precursor].tag == .comma) and
            tokens[successor].tag == .equal)
        {
            tokens[current - 1].class = .field_name;
            tokens[current].class = .field_name;
            return true;
        }

        return false;
    }

    fn contextualize_fallback(tokens: []ContextToken, current: usize) bool {
        tokens[current].class = .context_free;
        return true;
    }

    fn next_valid(tokens: []ContextToken, current: usize) ?usize {
        var check = current + 1;
        while (check < tokens.len) : (check += 1) {
            if (tokens[check].class != .whitespace) return check;
        }
        return null;
    }

    fn prev_valid(tokens: []ContextToken, current: usize) ?usize {
        if (current == 0) return null;

        var check = current - 1;
        while (check > 0) : (check -= 1) {
            if (tokens[check].class != .whitespace) return check;
        }
        // the loop above stops before testing index 0
        if (tokens[check].class != .whitespace) return check;
        return null;
    }
};

pub fn trimZ(comptime T: type, input: [:0]T, trimmer: []const T) [:0]T {
    var begin: usize = 0;
    var end: usize = input.len;

    while (begin < end and std.mem.indexOfScalar(T, trimmer, input[begin]) != null) : (begin += 1) {}
    while (end > begin and std.mem.indexOfScalar(T, trimmer, input[end - 1]) != null) : (end -= 1) {}

    input[end] = 0;
    return input[begin..end :0];
}

pub fn write_tokenized_html(raw_src: [:0]u8, allocator: std.mem.Allocator, out: anytype, full: bool) !void {
    const src = trimZ(u8, raw_src, "\n");

    var tokenizer = std.zig.Tokenizer.init(src);
    var last_token_end: usize = 0;

    if (full) try out.writeAll(html_preamble);
    // open the pre fragment (--full additionally wraps it in a document)
    try out.writeAll("<pre>");

    var manager = ContextManager.init(allocator);
    defer manager.deinit();

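    // the tokenizer skips over whitespace and comments; each gap between the
    // end of the previous token and the start of the next is pushed as a
    // synthetic whitespace token, and write_whitespace rediscovers comments
    // inside those gaps so they can still be styled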
    while (true) {
        const token = tokenizer.next();
        if (last_token_end < token.loc.start) {
            try manager.push_back(.{
                .tag = .invalid, // TODO: this is a big hack
                .content = src[last_token_end..token.loc.start],
                .class = .whitespace,
            });
        }

        switch (token.tag) {
            .eof => break,

            .keyword_addrspace,
            .keyword_align,
            .keyword_and,
            .keyword_asm,
            .keyword_async,
            .keyword_await,
            .keyword_break,
            .keyword_catch,
            .keyword_comptime,
            .keyword_const,
            .keyword_continue,
            .keyword_defer,
            .keyword_else,
            .keyword_enum,
            .keyword_errdefer,
            .keyword_error,
            .keyword_export,
            .keyword_extern,
            .keyword_for,
            .keyword_if,
            .keyword_inline,
            .keyword_noalias,
            .keyword_noinline,
            .keyword_nosuspend,
            .keyword_opaque,
            .keyword_or,
            .keyword_orelse,
            .keyword_packed,
            .keyword_anyframe,
            .keyword_pub,
            .keyword_resume,
            .keyword_return,
            .keyword_linksection,
            .keyword_callconv,
            .keyword_struct,
            .keyword_suspend,
            .keyword_switch,
            .keyword_test,
            .keyword_threadlocal,
            .keyword_try,
            .keyword_union,
            .keyword_unreachable,
            .keyword_usingnamespace,
            .keyword_var,
            .keyword_volatile,
            .keyword_allowzero,
            .keyword_while,
            .keyword_anytype,
            .keyword_fn,
            => try manager.push_back(.{
                .tag = token.tag,
                .content = src[token.loc.start..token.loc.end],
                .class = .keyword,
            }),

            .string_literal,
            .char_literal,
            => try manager.push_back(.{
                .tag = token.tag,
                .content = src[token.loc.start..token.loc.end],
                .class = .string,
            }),

            .multiline_string_literal_line => {
                try manager.push_back(.{
                    .tag = token.tag,
                    .content = src[token.loc.start .. token.loc.end - 1],
                    .class = .string,
                });
                // multiline string literals contain a newline, but we don't want to
                // tokenize it like that.
                try manager.push_back(.{
                    .tag = .invalid,
                    .content = src[token.loc.end - 1 .. token.loc.end],
                    .class = .whitespace,
                });
            },

            .builtin => try manager.push_back(.{
                .tag = token.tag,
                .content = src[token.loc.start..token.loc.end],
                .class = .builtin,
            }),

            .doc_comment,
            .container_doc_comment,
            => {
                try manager.push_back(.{
                    .tag = token.tag,
                    .content = src[token.loc.start..token.loc.end],
                    .class = .doc_comment,
                });
            },

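            // undefined/null/true/false are plain identifiers as far as the
            // zig tokenizer is concerned, so they are special-cased here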
            .identifier => {
                const content = src[token.loc.start..token.loc.end];
                try manager.push_back(.{
                    .tag = token.tag,
                    .content = content,
                    .class = if (mem.eql(u8, content, "undefined") or
                        mem.eql(u8, content, "null") or
                        mem.eql(u8, content, "true") or
                        mem.eql(u8, content, "false"))
                        .literal_primitive
                    else
                        .needs_context,
                });
            },

            .number_literal => try manager.push_back(.{
                .tag = token.tag,
                .content = src[token.loc.start..token.loc.end],
                .class = .literal_number,
            }),

            .bang,
            .pipe,
            .pipe_pipe,
            .pipe_equal,
            .equal,
            .equal_equal,
            .equal_angle_bracket_right,
            .bang_equal,
            .l_paren,
            .r_paren,
            .semicolon,
            .percent,
            .percent_equal,
            .l_brace,
            .r_brace,
            .l_bracket,
            .r_bracket,
            .period,
            .period_asterisk,
            .ellipsis2,
            .ellipsis3,
            .caret,
            .caret_equal,
            .plus,
            .plus_plus,
            .plus_equal,
            .plus_percent,
            .plus_percent_equal,
            .minus,
            .minus_equal,
            .minus_percent,
            .minus_percent_equal,
            .asterisk,
            .asterisk_equal,
            .asterisk_asterisk,
            .asterisk_percent,
            .asterisk_percent_equal,
            .arrow,
            .colon,
            .slash,
            .slash_equal,
            .comma,
            .ampersand,
            .ampersand_equal,
            .question_mark,
            .angle_bracket_left,
            .angle_bracket_left_equal,
            .angle_bracket_angle_bracket_left,
            .angle_bracket_angle_bracket_left_equal,
            .angle_bracket_right,
            .angle_bracket_right_equal,
            .angle_bracket_angle_bracket_right,
            .angle_bracket_angle_bracket_right_equal,
            .tilde,
            .plus_pipe,
            .plus_pipe_equal,
            .minus_pipe,
            .minus_pipe_equal,
            .asterisk_pipe,
            .asterisk_pipe_equal,
            .angle_bracket_angle_bracket_left_pipe,
            .angle_bracket_angle_bracket_left_pipe_equal,
            => try manager.push_back(.{
                .tag = token.tag,
                .content = src[token.loc.start..token.loc.end],
                .class = .symbology,
            }),

            .invalid,
            .invalid_periodasterisks,
            => return parseError(src, token, "syntax error", .{}),
        }

        last_token_end = token.loc.end;
    }

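    // tokens were buffered rather than printed as they were scanned so that
    // identifier classification can look both backward and forward; process()
    // resolves those classes and then emits all of the spans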
    try manager.process(out);

    try out.writeAll("
"); if (full) try out.writeAll(html_epilogue); } // TODO: this function returns anyerror, interesting fn parseError(src: []const u8, token: std.zig.Token, comptime fmt: []const u8, args: anytype) anyerror { const loc = getTokenLocation(src, token); // const args_prefix = .{ tokenizer.source_file_name, loc.line + 1, loc.column + 1 }; // print("{s}:{d}:{d}: error: " ++ fmt ++ "\n", args_prefix ++ args); const args_prefix = .{ loc.line + 1, loc.column + 1 }; print("{d}:{d}: error: " ++ fmt ++ "\n", args_prefix ++ args); if (loc.line_start <= loc.line_end) { print("{s}\n", .{src[loc.line_start..loc.line_end]}); { var i: usize = 0; while (i < loc.column) : (i += 1) { print(" ", .{}); } } { const caret_count = token.loc.end - token.loc.start; var i: usize = 0; while (i < caret_count) : (i += 1) { print("~", .{}); } } print("\n", .{}); } return error.ParseError; } const builtin_types = [_][]const u8{ "f16", "f32", "f64", "f128", "c_longdouble", "c_short", "c_ushort", "c_int", "c_uint", "c_long", "c_ulong", "c_longlong", "c_ulonglong", "c_char", "c_void", "void", "bool", "isize", "usize", "noreturn", "type", "anyerror", "comptime_int", "comptime_float", }; fn is_type(name: []const u8) bool { for (builtin_types) |t| { if (mem.eql(u8, t, name)) return true; } return false; } const Location = struct { line: usize, column: usize, line_start: usize, line_end: usize, }; fn getTokenLocation(src: []const u8, token: std.zig.Token) Location { var loc = Location{ .line = 0, .column = 0, .line_start = 0, .line_end = 0, }; for (src, 0..) |c, i| { if (i == token.loc.start) { loc.line_end = i; while (loc.line_end < src.len and src[loc.line_end] != '\n') : (loc.line_end += 1) {} return loc; } if (c == '\n') { loc.line += 1; loc.column = 0; loc.line_start = i + 1; } else { loc.column += 1; } } return loc; } pub fn tokenize_buffer( buffer: []const u8, allocator: std.mem.Allocator, writer: anytype, full_html: bool, ) !void { const intermediate = try allocator.dupeZ(u8, buffer); defer allocator.free(intermediate); try write_tokenized_html(intermediate, allocator, writer, full_html); } pub fn tokenize_file( file_name: []const u8, allocator: std.mem.Allocator, writer: anytype, full_html: bool, ) !void { const srcbuf = blk: { const file = fs.cwd().openFile(file_name, .{ .mode = .read_only }) catch |err| { std.debug.print("couldnt open {s}\n", .{file_name}); return err; }; defer file.close(); break :blk try file.readToEndAllocOptions( allocator, 1_000_000, null, @alignOf(u8), 0, ); }; defer allocator.free(srcbuf); try write_tokenized_html(srcbuf, allocator, writer, full_html); } const html_preamble = \\ \\ \\ \\ \\ \\ ; const html_epilogue = \\ \\ \\ ; const tokenator = cmd: { var cmd = noclip.CommandBuilder(*TokCtx){ .description = \\Tokenize one or more zig files into HTML. \\ \\Each file provided on the command line will be tokenized and the output will \\be written to [filename].html. For example, 'tokenator foo.zig bar.zig' will \\write foo.zig.html and bar.zig.html. Files are written directly, and if an \\error occurs while processing a file, partial output will occur. When \\processing multiple files, a failure will exit without processing any \\successive files. Inputs should be less than 1MB in size. \\ \\If the --stdout flag is provided, output will be written to the standard \\output instead of to named files. Each file written to stdout will be \\followed by a NUL character which acts as a separator for piping purposes. 
        ,
    };

    cmd.simple_flag(.{
        .name = "write_stdout",
        .truthy = .{ .long_tag = "--stdout" },
        .default = false,
        .description = "write output to stdout instead of to files",
    });

    cmd.simple_flag(.{
        .name = "full",
        .truthy = .{ .short_tag = "-f", .long_tag = "--full" },
        .default = false,
        .description = "write full HTML files rather than just the pre fragment",
    });

    cmd.add_argument(.{ .OutputType = []const u8, .multi = true }, .{ .name = "files" });

    break :cmd cmd;
};

const TokCtx = struct {
    allocator: std.mem.Allocator,
};

fn tokenize_files_cli(context: *TokCtx, parameters: tokenator.Output()) !void {
    const stdout = std.io.getStdOut().writer();

    for (parameters.files.items) |file_name| {
        if (parameters.write_stdout) {
            try tokenize_file(file_name, context.allocator, stdout, parameters.full);
            // NUL separator between files, as documented above
            try stdout.writeByte(0);
        } else {
            const outname = try std.mem.join(context.allocator, ".", &[_][]const u8{ file_name, "html" });
            defer context.allocator.free(outname);

            const output = try fs.cwd().createFile(outname, .{});
            defer output.close();

            print("writing: {s}", .{outname});
            errdefer print(" failed!\n", .{});
            try tokenize_file(file_name, context.allocator, output.writer(), parameters.full);
            print(" done\n", .{});
        }
    }
}

pub fn cli() !u8 {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    var ctx = TokCtx{ .allocator = allocator };

    var arena = std.heap.ArenaAllocator.init(gpa.allocator());
    defer arena.deinit();

    var cli_parser = tokenator.create_parser(tokenize_files_cli, arena.allocator());
    try cli_parser.execute(&ctx);

    return 0;
}
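
// a minimal usage sketch, not part of the original file: it drives
// tokenize_buffer in fragment mode and assumes the span markup emitted by
// print_span above ("<span class=\"keyword\">...</span>")
test "tokenize_buffer wraps keywords in spans" {
    var buffer = std.ArrayList(u8).init(std.testing.allocator);
    defer buffer.deinit();

    try tokenize_buffer("const x = 1;\n", std.testing.allocator, buffer.writer(), false);

    // `const` is classed as a keyword, so it should come out span-wrapped
    try std.testing.expect(
        std.mem.indexOf(u8, buffer.items, "<span class=\"keyword\">const</span>") != null,
    );
}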