");
+ state = .normal;
+ },
+ else => {
+ try escape_char(out, c);
+ },
},
- else => try out.writeByte(c),
}
}
}
// TODO: use more context to get better token resolution
// identifier preceded by (break | continue) colon is a label
-// identifier followed by colon (inline | for | while | {) is a label
+// identifier followed by colon (inline | for | while | l_brace) is a label
//
// identifier preceded by dot, not preceded by a name, and followed by (, | => | == | != | r_brace | r_paren | and | or | ;) is an enum literal
// identifier preceded by dot and followed by = is a struct field initializer
@@ -51,20 +97,253 @@ fn write_escaped(out: anytype, input: []const u8) !void {
// identifier followed by { is a type
// identifier after | is a bind
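+//
+// e.g. in `outer: for (items) |item| { break :outer; }`, both occurrences of
+// `outer` are labels and `item` is a bind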
-pub fn write_tokenized_html(src: [:0]const u8, _: std.mem.Allocator, out: anytype) !void {
- try out.writeAll("<pre>");
+const ContextToken = struct {
+ tag: std.zig.Token.Tag,
+ content: []const u8,
+ class: TokenClass = .needs_context,
+};
+
+const TokenClass = enum {
+ keyword,
+ string,
+ builtin,
+ type,
+ function,
+ label,
+ doc_comment,
+ literal_primitive,
+ literal_number,
+ symbology,
+ whitespace,
+ context_free,
+
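+ // not a real output class: every needs_context token must be resolved to
+ // one of the classes above before rendering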
+ needs_context,
+
+ pub fn name(self: @This()) []const u8 {
+ return switch (self) {
+ .doc_comment => "doc comment",
+ .literal_primitive => "literal primitive",
+ .literal_number => "literal number",
+ .symbology => "",
+ .context_free => "",
+ .whitespace => "",
+ .needs_context => @panic("needs_context token should have been resolved before rendering"),
+ else => @tagName(self),
+ };
+ }
+};
+
+pub const ContextManager = struct {
+ // const Queue = std.TailQueue(ContextToken);
+
+ tokens: std.ArrayList(ContextToken),
+ allocator: std.mem.Allocator,
+
+ pub fn init(allocator: std.mem.Allocator) @This() {
+ return .{
+ .allocator = allocator,
+ .tokens = std.ArrayList(ContextToken).init(allocator),
+ };
+ }
+
+ pub fn deinit(self: *@This()) void {
+ self.tokens.deinit();
+ }
+
+ pub fn push_back(self: *@This(), token: ContextToken) !void {
+ try self.tokens.append(token);
+ }
+
+ fn print_span(content: []const u8, class: TokenClass, out: anytype) !void {
+ const classname = class.name();
+
+ if (classname.len > 0) {
+ try out.print("<span class=\"{s}\">", .{classname});
+ try write_escaped(out, content, class);
+ try out.writeAll("</span>");
+ } else {
+ try write_escaped(out, content, class);
+ }
+ }
+
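+ // Emits a run of tokens that share a class as a single span, folding any
+ // intervening whitespace into it, so that fewer tags are generated.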
+ fn print_fused_span(tokens: []ContextToken, start: usize, end: usize, out: anytype) !void {
+ const classname = tokens[start].class.name();
+
+ if (classname.len > 0) try out.print("<span class=\"{s}\">", .{classname});
+
+ for (tokens[start..end]) |*token| {
+ try write_escaped(out, token.content, tokens[start].class);
+ }
+
+ if (classname.len > 0) try out.writeAll("</span>");
+ }
+
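+ // Two passes: resolve every needs_context identifier to a concrete class,
+ // then emit spans, fusing adjacent tokens of the same class.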
+ pub fn process(self: *@This(), out: anytype) !void {
+ const tokens = self.tokens.items;
+ if (tokens.len == 0) return;
+
+ for (tokens, 0..) |*token, idx| {
+ if (token.class == .needs_context)
+ if (!contextualize_identifier(tokens, idx)) @panic("failed to contextualize identifier");
+ }
+
+ var idx: usize = 0;
+ while (idx < tokens.len) : (idx += 1) {
+ const span_start = idx;
+ const token = &tokens[idx];
+ // std.debug.print("tok {d}: {s} {}\n", .{ idx, token.content, token.class });
+
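+ // look ahead for more tokens of the same class; whitespace is fused into
+ // the span unless it contains a newline, which ends the run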
+ var lookahead = idx + 1;
+ while (lookahead < tokens.len) : (lookahead += 1) {
+ // std.debug.print("look {d}: {s} {}\n", .{ lookahead, tokens[lookahead].content, tokens[lookahead].class });
+ if (tokens[lookahead].class != .whitespace) {
+ if (tokens[lookahead].class == token.class)
+ idx = lookahead
+ else
+ break;
+ } else {
+ if (std.mem.containsAtLeast(u8, tokens[lookahead].content, 1, "\n")) break;
+ }
+ }
+ if (idx > span_start) {
+ try print_fused_span(tokens, span_start, idx + 1, out);
+ } else {
+ try print_span(token.content, token.class, out);
+ }
+ }
+ }
+
+ fn contextualize_identifier(tokens: []ContextToken, current: usize) bool {
+ return (contextualize_function(tokens, current) or
+ contextualize_builtin_type(tokens, current) or
+ contextualize_label(tokens, current) or
+ contextualize_fallback(tokens, current));
+ }
+
+ fn contextualize_function(tokens: []ContextToken, current: usize) bool {
+ const prev = prev_valid(tokens, current) orelse return false;
+
+ if (tokens[prev].tag == .keyword_fn) {
+ tokens[current].class = .function;
+ return true;
+ }
+
+ return false;
+ }
+
+ fn contextualize_builtin_type(tokens: []ContextToken, current: usize) bool {
+ const content = tokens[current].content;
+
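+ // arbitrary-width integer types: 'i' or 'u' followed by 1-5 decimal digits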
+ const is_int = blk: {
+ if ((content[0] != 'i' and content[0] != 'u') or content.len < 2 or content.len > 6)
+ break :blk false;
+
+ for (content[1..]) |char|
+ if (char < '0' or char > '9') break :blk false;
+
+ break :blk true;
+ };
+
+ if (is_int or is_type(content)) {
+ tokens[current].class = .type;
+ return true;
+ }
+
+ return false;
+ }
+
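+ // a label is either referenced (`break :blk`, `continue :outer`) or
+ // declared (`blk: {`, `outer: while`), per the heuristics described above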
+ fn contextualize_label(tokens: []ContextToken, current: usize) bool {
+ blk: {
+ const prev = prev_valid(tokens, current) orelse break :blk;
+
+ if (tokens[prev].tag == .colon) {
+ const prev2 = prev_valid(tokens, prev) orelse break :blk;
+
+ switch (tokens[prev2].tag) {
+ .keyword_break, .keyword_continue => {
+ tokens[prev].class = .label;
+ tokens[current].class = .label;
+ return true;
+ },
+ else => break :blk,
+ }
+ }
+ }
+
+ blk: {
+ const next = next_valid(tokens, current) orelse break :blk;
+
+ if (tokens[next].tag == .colon) {
+ const next2 = next_valid(tokens, next) orelse break :blk;
+
+ switch (tokens[next2].tag) {
+ .keyword_inline, .keyword_for, .keyword_while, .l_brace => {
+ tokens[current].class = .label;
+ tokens[next].class = .label;
+ return true;
+ },
+ else => break :blk,
+ }
+ }
+ }
+
+ return false;
+ }
+
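+ // always succeeds, so contextualize_identifier cannot fail and the panic
+ // in process() is unreachable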
+ fn contextualize_fallback(tokens: []ContextToken, current: usize) bool {
+ tokens[current].class = .context_free;
+ return true;
+ }
+
+ fn next_valid(tokens: []ContextToken, current: usize) ?usize {
+ var check = current + 1;
+ while (check < tokens.len) : (check += 1) {
+ if (tokens[check].class != .whitespace) return check;
+ }
+ return null;
+ }
+
+ fn prev_valid(tokens: []ContextToken, current: usize) ?usize {
+ if (current == 0) return null;
+
+ var check = current - 1;
+ while (check > 0) : (check -= 1) {
+ if (tokens[check].class != .whitespace) return check;
+ }
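+ // the loop above never tests index 0, so check it explicitly here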
+ if (tokens[check].class != .whitespace) return check;
+ return null;
+ }
+};
+
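+/// Trims leading and trailing `trimmer` bytes in place. Note that this
+/// mutates `input`, writing a new 0 sentinel at the trimmed end.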
+pub fn trimZ(comptime T: type, input: [:0]T, trimmer: []const T) [:0]T {
+ var begin: usize = 0;
+ var end: usize = input.len;
+ while (begin < end and std.mem.indexOfScalar(T, trimmer, input[begin]) != null) : (begin += 1) {}
+ while (end > begin and std.mem.indexOfScalar(T, trimmer, input[end - 1]) != null) : (end -= 1) {}
+ input[end] = 0;
+ return input[begin..end :0];
+}
+
+pub fn write_tokenized_html(raw_src: [:0]u8, allocator: std.mem.Allocator, out: anytype, full: bool) !void {
+ const src = trimZ(u8, raw_src, "\n");
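+ // trim newlines so they don't render as blank lines at the edges of the
+ // <pre> block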
var tokenizer = std.zig.Tokenizer.init(src);
- var index: usize = 0;
- var next_tok_is_fn = false;
+ var last_token_end: usize = 0;
+
+ if (full) try out.writeAll(html_preamble);
+ try out.writeAll("<pre>");
+ var manager = ContextManager.init(allocator);
+ defer manager.deinit();
+
while (true) {
- const prev_tok_was_fn = next_tok_is_fn;
- next_tok_is_fn = false;
-
const token = tokenizer.next();
- // short circuit on EOF to avoid
- if (token.tag == .eof) break;
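+ // the tokenizer never emits whitespace tokens, so recover the gap between
+ // the end of the previous token and the start of this one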
+ if (last_token_end < token.loc.start) {
+ try manager.push_back(.{
+ .tag = .invalid, // TODO: this is a big hack
+ .content = src[last_token_end..token.loc.start],
+ .class = .whitespace,
+ });
+ }
- try write_escaped(out, src[index..token.loc.start]);
switch (token.tag) {
.eof => break,
@@ -116,83 +395,72 @@ pub fn write_tokenized_html(src: [:0]const u8, _: std.mem.Allocator, out: anytyp
.keyword_allowzero,
.keyword_while,
.keyword_anytype,
- => {
- try out.writeAll("<span class=\"keyword\">");
- try write_escaped(out, src[token.loc.start..token.loc.end]);
- try out.writeAll("</span>");
- },
+ .keyword_fn,
+ => try manager.push_back(.{
+ .tag = token.tag,
+ .content = src[token.loc.start..token.loc.end],
+ .class = .keyword,
+ }),
- .keyword_fn => {
- try out.writeAll("<span class=\"keyword\">");
- try write_escaped(out, src[token.loc.start..token.loc.end]);
- try out.writeAll("</span>");
- next_tok_is_fn = true;
- },
-
- .string_literal, .char_literal => {
- try out.writeAll("<span class=\"string\">");
- try write_escaped(out, src[token.loc.start..token.loc.end]);
- try out.writeAll("</span>");
- },
+ .string_literal,
+ .char_literal,
+ => try manager.push_back(.{
+ .tag = token.tag,
+ .content = src[token.loc.start..token.loc.end],
+ .class = .string,
+ }),
.multiline_string_literal_line => {
- // multiline string literals contain a newline
- try out.writeAll("<span class=\"string\">");
- try write_escaped(out, src[token.loc.start .. token.loc.end - 1]);
- try out.writeAll("</span>\n");
+ try manager.push_back(.{
+ .tag = token.tag,
+ .content = src[token.loc.start .. token.loc.end - 1],
+ .class = .string,
+ });
+ // multiline string literals contain a newline, but we don't want to
+ // tokenize it like that.
+ try manager.push_back(.{
+ .tag = .invalid,
+ .content = src[token.loc.end - 1 .. token.loc.end],
+ .class = .whitespace,
+ });
},
- .builtin => {
- try out.writeAll("<span class=\"builtin\">");
- try write_escaped(out, src[token.loc.start..token.loc.end]);
- try out.writeAll("</span>");
- },
+ .builtin => try manager.push_back(.{
+ .tag = token.tag,
+ .content = src[token.loc.start..token.loc.end],
+ .class = .builtin,
+ }),
.doc_comment,
.container_doc_comment,
=> {
- try out.writeAll("<span class=\"doc comment\">");
+ try manager.push_back(.{
+ .tag = token.tag,
+ .content = src[token.loc.start..token.loc.end],
+ .class = .doc_comment,
+ });
},
.identifier => {
- if (prev_tok_was_fn) {
- try out.writeAll("<span class=\"function\">");
- try write_escaped(out, src[token.loc.start..token.loc.end]);
- try out.writeAll("</span>");
- print("function: {s}\n", .{src[token.loc.start..token.loc.end]});
- } else {
- print("identifier: {s}\n", .{src[token.loc.start..token.loc.end]});
- const is_int = blk: {
- if (src[token.loc.start] != 'i' and src[token.loc.start] != 'u')
- break :blk false;
- var i = token.loc.start + 1;
- if (i == token.loc.end)
- break :blk false;
- while (i != token.loc.end) : (i += 1) {
- if (src[i] < '0' or src[i] > '9')
- break :blk false;
- }
- break :blk true;
- };
- if (is_int or is_type(src[token.loc.start..token.loc.end])) {
- try out.writeAll("<span class=\"type\">");
- try write_escaped(out, src[token.loc.start..token.loc.end]);
- try out.writeAll("</span>");
- } else {
- try out.writeAll("<span>");
- try write_escaped(out, src[token.loc.start..token.loc.end]);
- try out.writeAll("</span>");
- }
- }
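+ // undefined/null/true/false tokenize as plain identifiers; classify them
+ // immediately and defer everything else until the whole file has been seen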
+ const content = src[token.loc.start..token.loc.end];
+ try manager.push_back(.{
+ .tag = token.tag,
+ .content = content,
+ .class = if (mem.eql(u8, content, "undefined") or
+ mem.eql(u8, content, "null") or
+ mem.eql(u8, content, "true") or
+ mem.eql(u8, content, "false"))
+ .literal_primitive
+ else
+ .needs_context,
+ });
},
- .number_literal => {
- try out.writeAll("<span class=\"number\">");
- try write_escaped(out, src[token.loc.start..token.loc.end]);
- try out.writeAll("</span>");
- },
+ .number_literal => try manager.push_back(.{
+ .tag = token.tag,
+ .content = src[token.loc.start..token.loc.end],
+ .class = .literal_number,
+ }),
.bang,
.pipe,
@@ -256,21 +524,24 @@ pub fn write_tokenized_html(src: [:0]const u8, _: std.mem.Allocator, out: anytyp
.asterisk_pipe_equal,
.angle_bracket_angle_bracket_left_pipe,
.angle_bracket_angle_bracket_left_pipe_equal,
- => {
- // try out.writeAll("<span class=\"symbol\">");
- try write_escaped(out, src[token.loc.start..token.loc.end]);
- // try out.writeAll("</span>");
- },
- .invalid, .invalid_periodasterisks => return parseError(
- src,
- token,
- "syntax error",
- .{},
- ),
+ => try manager.push_back(.{
+ .tag = token.tag,
+ .content = src[token.loc.start..token.loc.end],
+ .class = .symbology,
+ }),
+
+ .invalid,
+ .invalid_periodasterisks,
+ => return parseError(src, token, "syntax error", .{}),
}
- index = token.loc.end;
+
+ last_token_end = token.loc.end;
}
+
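+ // all tokens are buffered before being processed so that the
+ // contextualize_* heuristics can look in both directions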
+ try manager.process(out);
+
try out.writeAll("</pre>\n");
+ if (full) try out.writeAll(html_epilogue);
}
// TODO: this function returns anyerror, interesting
@@ -322,6 +593,7 @@ const Location = struct {
line_start: usize,
line_end: usize,
};
+
fn getTokenLocation(src: []const u8, token: std.zig.Token) Location {
var loc = Location{
.line = 0,
@@ -346,6 +618,53 @@ fn getTokenLocation(src: []const u8, token: std.zig.Token) Location {
return loc;
}
+const html_preamble =
+ \\<!DOCTYPE html>
+ \\<html>
+ \\<head>
+ \\<style>
+ \\</style>
+ \\</head>
+ \\<body>
+;
+
+const html_epilogue =
+ \\</body>
+ \\</html>
+;
+
const tokenator = cmd: {
var cmd = noclip.CommandBuilder(TokCtx){
.description =
@@ -353,7 +672,10 @@ const tokenator = cmd: {
\\
\\Each file provided on the command line will be tokenized and the output will
\\be written to [filename].html. For example, 'tokenator foo.zig bar.zig' will
- \\write foo.zig.html and bar.zig.html
+ \\write foo.zig.html and bar.zig.html. Files are written directly, so if an
+ \\error occurs while processing a file, partial output may be left behind.
+ \\When processing multiple files, a failure exits without processing any
+ \\subsequent files. Inputs must be smaller than 1MB.
\\
\\If the --stdout flag is provided, output will be written to the standard
\\output instead of to named files. Each file written to stdout will be
@@ -366,6 +688,12 @@ const tokenator = cmd: {
.default = false,
.description = "write output to stdout instead of to files",
});
+ cmd.simple_flag(.{
+ .name = "full",
+ .truthy = .{ .short_tag = "-f", .long_tag = "--full" },
+ .default = false,
+ .description = "write full HTML files rather than just the pre fragment",
+ });
cmd.add_argument(.{ .OutputType = []const u8, .multi = true }, .{ .name = "files" });
break :cmd cmd;
};
@@ -390,14 +718,10 @@ fn tokenize_files(context: *TokCtx, parameters: tokenator.Output()) !void {
0,
);
};
-
defer context.allocator.free(srcbuf);
- var writebuf = std.ArrayList(u8).init(context.allocator);
- defer writebuf.deinit();
-
if (parameters.write_stdout) {
- try write_tokenized_html(srcbuf, context.allocator, stdout);
+ try write_tokenized_html(srcbuf, context.allocator, stdout, parameters.full);
try stdout.writeByte(0);
} else {
const outname = try std.mem.join(context.allocator, ".", &[_][]const u8{ file_name, "html" });
@@ -407,7 +731,7 @@ fn tokenize_files(context: *TokCtx, parameters: tokenator.Output()) !void {
const output = try fs.cwd().createFile(outname, .{});
defer output.close();
- try write_tokenized_html(srcbuf, context.allocator, output.writer());
+ try write_tokenized_html(srcbuf, context.allocator, output.writer(), parameters.full);
}
}
}