docs: functioning contextualization of more tokens

This recognizes block labels. Actually implementing this successfully
took more attempts than I'd like to admit. I originally had a
streaming version using a tail queue, but it had problems that seemed
intractable. So instead, everything just gets jammed into an arraylist
and processed as a whole once tokenizing is complete. This raises peak
memory usage, since all the intermediate tokens for the whole file are
held at once, but it does work, and frankly I'm okay with it using a
few MB of memory. It can tokenize itself.
torque 2023-04-06 03:26:11 -07:00
parent 3fda934808
commit e142fb5676
Signed by: torque
SSH Key Fingerprint: SHA256:nCrXefBNo6EbjNSQhv0nXmEg/VuNq3sMF5b8zETw3Tk
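
The shape of the change, distilled into a minimal sketch (names and rules abbreviated; the real implementation is the ContextManager in the diff below, which this only approximates):

    const std = @import("std");

    const Class = enum { keyword, whitespace, context_free, needs_context };
    const Tok = struct { content: []const u8, class: Class };

    pub fn main() !void {
        var gpa = std.heap.GeneralPurposeAllocator(.{}){};
        defer _ = gpa.deinit();

        // pass 1: the tokenizer loop appends every token, whitespace included,
        // instead of writing HTML spans as it goes
        var tokens = std.ArrayList(Tok).init(gpa.allocator());
        defer tokens.deinit();
        try tokens.append(.{ .content = "fn", .class = .keyword });
        try tokens.append(.{ .content = " ", .class = .whitespace });
        try tokens.append(.{ .content = "main", .class = .needs_context });

        // pass 2: with the whole file buffered, an ambiguous identifier can
        // inspect both neighbors (skipping whitespace) before being classified
        for (tokens.items, 0..) |*tok, idx| {
            if (tok.class != .needs_context) continue;
            const prev: ?Tok = if (idx > 0) tokens.items[idx - 1] else null;
            _ = prev; // the real rules live in the contextualize_* functions
            tok.class = .context_free; // fallback, as in contextualize_fallback
        }
    }

This is what the streaming tail-queue version couldn't do cleanly: the rules below need arbitrary lookahead and lookbehind, which is trivial once everything sits in one slice.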


@@ -8,37 +8,83 @@ const mem = std.mem;
const fs = std.fs;
const print = std.debug.print;
fn write_escaped(out: anytype, input: []const u8) !void {
var comment = false;
inline fn escape_char(out: anytype, char: u8) !void {
return try switch (char) {
'&' => out.writeAll("&"),
'<' => out.writeAll("&lt;"),
'>' => out.writeAll("&gt;"),
'"' => out.writeAll("&quot;"),
else => out.writeByte(char),
};
}
for (input, 0..) |c, idx| {
switch (c) {
'&' => try out.writeAll("&amp;"),
'<' => try out.writeAll("&lt;"),
'>' => try out.writeAll("&gt;"),
'"' => try out.writeAll("&quot;"),
'\n' => {
if (comment) {
try out.writeAll("</span>");
comment = false;
}
try out.writeAll("</span>\n<span class=\"line\">");
fn write_escaped(out: anytype, input: []const u8, class: TokenClass) !void {
if (class == .whitespace) {
try write_whitespace(out, input);
} else {
for (input) |c| try escape_char(out, c);
}
}
fn write_whitespace(out: anytype, input: []const u8) !void {
var state: enum { normal, maybe_comment, maybe_docstring, comment } = .normal;
for (input) |c| {
switch (state) {
.normal => switch (c) {
'/' => state = .maybe_comment,
'\n' => try out.writeAll("</span>\n<span class=\"line\">"),
else => try escape_char(out, c),
},
'/' => {
if (input[idx + 1] == '/') {
try out.writeAll("<span class=\"comment\">");
comment = true;
}
try out.writeByte('/');
.maybe_comment => switch (c) {
'/' => {
state = .maybe_docstring;
},
'\n' => {
try out.writeAll("</span>\n<span class=\"line\">");
state = .normal;
},
else => {
try out.writeByte('/');
try escape_char(out, c);
state = .normal;
},
},
.maybe_docstring => switch (c) {
'\n' => {
// actually it was an empty comment lol cool
try out.writeAll("<span class=\"comment\">//</span></span>\n<span class=\"line\">");
state = .normal;
},
'/', '!' => {
// it is a docstring, so don't respan it
try out.writeAll("//");
try out.writeByte(c);
state = .normal;
},
else => {
// this is also a comment
try out.writeAll("<span class=\"comment\">//");
try escape_char(out, c);
state = .comment;
},
},
.comment => switch (c) {
'\n' => {
try out.writeAll("</span></span>\n<span class=\"line\">");
state = .normal;
},
else => {
try escape_char(out, c);
},
},
else => try out.writeByte(c),
}
}
}
// TODO: use more context to get better token resolution
// identifier preceded by (break | continue) colon is a label
// identifier followed by colon (inline | for | while | {) is a label
// identifier followed by colon (inline | for | while | l_brace) is a label
//
// identifier preceded by dot, not preceded by name, and followed by (, | => | == | != | rbrace | rparen | and | or | ;) is an enum literal
// identifier preceded by dot and followed by = is a struct field initializer
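// worked example (illustrative): in `outer: while (true) { break :outer; }`,
// the first `outer` is followed by a colon and then `while`, so the second
// rule tags it as a label, and the `outer` after `break` plus colon is
// caught by the first rule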
@@ -51,20 +97,253 @@ fn write_escaped(out: anytype, input: []const u8) !void {
// identifier followed by { is a type
// identifier after | is a bind
pub fn write_tokenized_html(src: [:0]const u8, _: std.mem.Allocator, out: anytype) !void {
try out.writeAll("<pre class=\"code-markup\"><code class=\"lang-zig\"><span class=\"line\">");
const ContextToken = struct {
tag: std.zig.Token.Tag,
content: []const u8,
class: TokenClass = .needs_context,
};
const TokenClass = enum {
keyword,
string,
builtin,
type,
function,
label,
doc_comment,
literal_primitive,
literal_number,
symbology,
whitespace,
context_free,
needs_context,
pub fn name(self: @This()) []const u8 {
return switch (self) {
.doc_comment => "doc comment",
.literal_primitive => "literal primitive",
.literal_number => "literal number",
.symbology => "",
.context_free => "",
.whitespace => "",
.needs_context => @panic("too late"),
else => @tagName(self),
};
}
};
pub const ContextManager = struct {
// const Queue = std.TailQueue(ContextToken);
tokens: std.ArrayList(ContextToken),
allocator: std.mem.Allocator,
pub fn init(allocator: std.mem.Allocator) @This() {
return .{
.allocator = allocator,
.tokens = std.ArrayList(ContextToken).init(allocator),
};
}
pub fn deinit(self: *@This()) void {
self.tokens.deinit();
}
pub fn push_back(self: *@This(), token: ContextToken) !void {
try self.tokens.append(token);
}
fn print_span(content: []const u8, class: TokenClass, out: anytype) !void {
const classname = class.name();
if (classname.len > 0) {
try out.print("<span class=\"{s}\">", .{classname});
try write_escaped(out, content, class);
try out.writeAll("</span>");
} else {
try write_escaped(out, content, class);
}
}
fn print_fused_span(tokens: []ContextToken, start: usize, end: usize, out: anytype) !void {
const classname = tokens[start].class.name();
if (classname.len > 0) try out.print("<span class=\"{s}\">", .{classname});
for (tokens[start..end]) |*token| {
try write_escaped(out, token.content, tokens[start].class);
}
if (classname.len > 0) try out.writeAll("</span>");
}
pub fn process(self: *@This(), out: anytype) !void {
const tokens = self.tokens.items;
if (tokens.len == 0) return;
for (tokens, 0..) |*token, idx| {
if (token.class == .needs_context)
if (!contextualize_identifier(tokens, idx)) @panic("failed to context");
}
var idx: usize = 0;
while (idx < tokens.len) : (idx += 1) {
const span_start = idx;
const token = &tokens[idx];
// std.debug.print("tok {d}: {s} {}\n", .{ idx, token.content, token.class });
var lookahead = idx + 1;
while (lookahead < tokens.len) : (lookahead += 1) {
// std.debug.print("look {d}: {s} {}\n", .{ lookahead, tokens[lookahead].content, tokens[lookahead].class });
if (tokens[lookahead].class != .whitespace) {
if (tokens[lookahead].class == token.class)
idx = lookahead
else
break;
} else {
if (std.mem.containsAtLeast(u8, tokens[lookahead].content, 1, "\n")) break;
}
}
if (idx > span_start) {
try print_fused_span(tokens, span_start, idx + 1, out);
} else {
try print_span(token.content, token.class, out);
}
}
}
fn contextualize_identifier(tokens: []ContextToken, current: usize) bool {
return (contextualize_function(tokens, current) or
contextualize_builtin_type(tokens, current) or
contextualize_label(tokens, current) or
contextualize_fallback(tokens, current));
}
fn contextualize_function(tokens: []ContextToken, current: usize) bool {
const prev = prev_valid(tokens, current) orelse return false;
if (tokens[prev].tag == .keyword_fn) {
tokens[current].class = .function;
return true;
}
return false;
}
fn contextualize_builtin_type(tokens: []ContextToken, current: usize) bool {
const content = tokens[current].content;
const is_int = blk: {
if ((content[0] != 'i' and content[0] != 'u') or content.len < 2 or content.len > 6)
break :blk false;
for (content[1..]) |char|
if (char < '0' or char > '9') break :blk false;
break :blk true;
};
if (is_int or is_type(content)) {
tokens[current].class = .type;
return true;
}
return false;
}
fn contextualize_label(tokens: []ContextToken, current: usize) bool {
blk: {
const prev = prev_valid(tokens, current) orelse break :blk;
if (tokens[prev].tag == .colon) {
const prev2 = prev_valid(tokens, prev) orelse break :blk;
switch (tokens[prev2].tag) {
.keyword_break, .keyword_continue => {
tokens[prev].class = .label;
tokens[current].class = .label;
return true;
},
else => break :blk,
}
}
}
blk: {
const next = next_valid(tokens, current) orelse break :blk;
if (tokens[next].tag == .colon) {
const next2 = next_valid(tokens, next) orelse break :blk;
switch (tokens[next2].tag) {
.keyword_inline, .keyword_for, .keyword_while, .l_brace => {
tokens[current].class = .label;
tokens[next].class = .label;
return true;
},
else => break :blk,
}
}
}
return false;
}
fn contextualize_fallback(tokens: []ContextToken, current: usize) bool {
tokens[current].class = .context_free;
return true;
}
fn next_valid(tokens: []ContextToken, current: usize) ?usize {
var check = current + 1;
while (check < tokens.len) : (check += 1) {
if (tokens[check].class != .whitespace) return check;
}
return null;
}
fn prev_valid(tokens: []ContextToken, current: usize) ?usize {
if (current == 0) return null;
var check = current - 1;
while (check > 0) : (check -= 1) {
if (tokens[check].class != .whitespace) return check;
}
if (tokens[check].class != .whitespace) return check;
return null;
}
};
pub fn trimZ(comptime T: type, input: [:0]T, trimmer: []const T) [:0]T {
var begin: usize = 0;
var end: usize = input.len;
while (begin < end and std.mem.indexOfScalar(T, trimmer, input[begin]) != null) : (begin += 1) {}
while (end > begin and std.mem.indexOfScalar(T, trimmer, input[end - 1]) != null) : (end -= 1) {}
input[end] = 0;
return input[begin..end :0];
}
pub fn write_tokenized_html(raw_src: [:0]u8, allocator: std.mem.Allocator, out: anytype, full: bool) !void {
const src = trimZ(u8, raw_src, "\n");
var tokenizer = std.zig.Tokenizer.init(src);
var index: usize = 0;
var next_tok_is_fn = false;
var last_token_end: usize = 0;
if (full) try out.writeAll(html_preamble);
try out.writeAll("<pre class=\"code-markup\"><code class=\"lang-zig\"><span class=\"line\">");
var manager = ContextManager.init(allocator);
defer manager.deinit();
while (true) {
const prev_tok_was_fn = next_tok_is_fn;
next_tok_is_fn = false;
const token = tokenizer.next();
// short circuit on EOF to avoid pushing a spurious trailing whitespace token
if (token.tag == .eof) break;
if (last_token_end < token.loc.start) {
try manager.push_back(.{
.tag = .invalid, // TODO: this is a big hack
.content = src[last_token_end..token.loc.start],
.class = .whitespace,
});
}
try write_escaped(out, src[index..token.loc.start]);
switch (token.tag) {
.eof => break,
@@ -116,83 +395,72 @@ pub fn write_tokenized_html(src: [:0]const u8, _: std.mem.Allocator, out: anytyp
.keyword_allowzero,
.keyword_while,
.keyword_anytype,
=> {
try out.writeAll("<span class=\"keyword\">");
try write_escaped(out, src[token.loc.start..token.loc.end]);
try out.writeAll("</span>");
},
.keyword_fn,
=> try manager.push_back(.{
.tag = token.tag,
.content = src[token.loc.start..token.loc.end],
.class = .keyword,
}),
.keyword_fn => {
try out.writeAll("<span class=\"keyword\">");
try write_escaped(out, src[token.loc.start..token.loc.end]);
try out.writeAll("</span>");
next_tok_is_fn = true;
},
.string_literal, .char_literal => {
try out.writeAll("<span class=\"string\">");
try write_escaped(out, src[token.loc.start..token.loc.end]);
try out.writeAll("</span>");
},
.string_literal,
.char_literal,
=> try manager.push_back(.{
.tag = token.tag,
.content = src[token.loc.start..token.loc.end],
.class = .string,
}),
.multiline_string_literal_line => {
// multiline string literals contain a newline
try out.writeAll("<span class=\"string\">");
try write_escaped(out, src[token.loc.start .. token.loc.end - 1]);
try out.writeAll("</span></span>\n<span class=\"line\">");
try manager.push_back(.{
.tag = token.tag,
.content = src[token.loc.start .. token.loc.end - 1],
.class = .string,
});
// multiline string literals contain a newline, but we don't want to
// tokenize it like that.
try manager.push_back(.{
.tag = .invalid,
.content = src[token.loc.end - 1 .. token.loc.end],
.class = .whitespace,
});
},
.builtin => {
try out.writeAll("<span class=\"builtin\">");
try write_escaped(out, src[token.loc.start..token.loc.end]);
try out.writeAll("</span>");
},
.builtin => try manager.push_back(.{
.tag = token.tag,
.content = src[token.loc.start..token.loc.end],
.class = .builtin,
}),
.doc_comment,
.container_doc_comment,
=> {
try out.writeAll("<span class=\"comment\">");
try write_escaped(out, src[token.loc.start..token.loc.end]);
try out.writeAll("</span>");
try manager.push_back(.{
.tag = token.tag,
.content = src[token.loc.start..token.loc.end],
.class = .doc_comment,
});
},
.identifier => {
if (prev_tok_was_fn) {
try out.writeAll("<span class=\"function\">");
try write_escaped(out, src[token.loc.start..token.loc.end]);
try out.writeAll("</span>");
print("function: {s}\n", .{src[token.loc.start..token.loc.end]});
} else {
print("identifier: {s}\n", .{src[token.loc.start..token.loc.end]});
const is_int = blk: {
if (src[token.loc.start] != 'i' and src[token.loc.start] != 'u')
break :blk false;
var i = token.loc.start + 1;
if (i == token.loc.end)
break :blk false;
while (i != token.loc.end) : (i += 1) {
if (src[i] < '0' or src[i] > '9')
break :blk false;
}
break :blk true;
};
if (is_int or is_type(src[token.loc.start..token.loc.end])) {
try out.writeAll("<span class=\"type\">");
try write_escaped(out, src[token.loc.start..token.loc.end]);
try out.writeAll("</span>");
} else {
try out.writeAll("<span class=\"name\">");
try write_escaped(out, src[token.loc.start..token.loc.end]);
try out.writeAll("</span>");
}
}
const content = src[token.loc.start..token.loc.end];
try manager.push_back(.{
.tag = token.tag,
.content = content,
.class = if (mem.eql(u8, content, "undefined") or
mem.eql(u8, content, "null") or
mem.eql(u8, content, "true") or
mem.eql(u8, content, "false"))
.literal_primitive
else
.needs_context,
});
},
.number_literal => {
try out.writeAll("<span class=\"literal number\">");
try write_escaped(out, src[token.loc.start..token.loc.end]);
try out.writeAll("</span>");
},
.number_literal => try manager.push_back(.{
.tag = token.tag,
.content = src[token.loc.start..token.loc.end],
.class = .literal_number,
}),
.bang,
.pipe,
@@ -256,21 +524,24 @@ pub fn write_tokenized_html(src: [:0]const u8, _: std.mem.Allocator, out: anytyp
.asterisk_pipe_equal,
.angle_bracket_angle_bracket_left_pipe,
.angle_bracket_angle_bracket_left_pipe_equal,
=> {
// try out.writeAll("<span class=\"symbol\">");
try write_escaped(out, src[token.loc.start..token.loc.end]);
// try out.writeAll("</span>");
},
.invalid, .invalid_periodasterisks => return parseError(
src,
token,
"syntax error",
.{},
),
=> try manager.push_back(.{
.tag = token.tag,
.content = src[token.loc.start..token.loc.end],
.class = .symbology,
}),
.invalid,
.invalid_periodasterisks,
=> return parseError(src, token, "syntax error", .{}),
}
index = token.loc.end;
last_token_end = token.loc.end;
}
try manager.process(out);
try out.writeAll("</span></code></pre>");
if (full) try out.writeAll(html_epilogue);
}
// TODO: this function returns anyerror, interesting
@@ -322,6 +593,7 @@ const Location = struct {
line_start: usize,
line_end: usize,
};
fn getTokenLocation(src: []const u8, token: std.zig.Token) Location {
var loc = Location{
.line = 0,
@@ -346,6 +618,53 @@ fn getTokenLocation(src: []const u8, token: std.zig.Token) Location {
return loc;
}
const html_preamble =
\\<!DOCTYPE html>
\\<html>
\\ <head>
\\ <style>
\\:root {
\\ --background: #2D2D2D;
\\ --foreground: #D3D0C8;
\\ --red: #F2777A;
\\ --orange: #F99157;
\\ --yellow: #FFCC66;
\\ --green: #99CC99;
\\ --aqua: #66CCCC;
\\ --blue: #6699CC;
\\ --purple: #CC99CC;
\\ --pink: #FFCCFF;
\\ --gray: #747369;
\\}
\\body {
\\ background: var(--background);
\\ color: var(--foreground);
\\}
\\.code-markup {
\\ padding: 0;
\\ font-size: 16pt;
\\ line-height: 1.1;
\\}
\\.code-markup .keyword { color: var(--purple); }
\\.code-markup .type { color: var(--purple); }
\\.code-markup .builtin { color: var(--aqua); }
\\.code-markup .string { color: var(--green); }
\\.code-markup .comment { color: var(--gray); }
\\.code-markup .literal { color: var(--orange); }
\\.code-markup .name { color: var(--red); }
\\.code-markup .function { color: var(--blue); }
\\.code-markup .label { color: var(--yellow); }
\\ </style>
\\ </head>
\\ <body>
;
const html_epilogue =
\\
\\ </body>
\\</html>
;
const tokenator = cmd: {
var cmd = noclip.CommandBuilder(TokCtx){
.description =
@@ -353,7 +672,10 @@ const tokenator = cmd: {
\\
\\Each file provided on the command line will be tokenized and the output will
\\be written to [filename].html. For example, 'tokenator foo.zig bar.zig' will
\\write foo.zig.html and bar.zig.html
\\write foo.zig.html and bar.zig.html. Files are written directly, and if an
\\error occurs while processing a file, partial output will occur. When
\\processing multiple files, a failure will exit without processing any
\\successive files. Inputs should be less than 1MB in size.
\\
\\If the --stdout flag is provided, output will be written to the standard
\\output instead of to named files. Each file written to stdout will be
@@ -366,6 +688,12 @@ const tokenator = cmd: {
.default = false,
.description = "write output to stdout instead of to files",
});
cmd.simple_flag(.{
.name = "full",
.truthy = .{ .short_tag = "-f", .long_tag = "--full" },
.default = false,
.description = "write full HTML files rather than just the pre fragment",
});
cmd.add_argument(.{ .OutputType = []const u8, .multi = true }, .{ .name = "files" });
break :cmd cmd;
};
@@ -390,14 +718,10 @@ fn tokenize_files(context: *TokCtx, parameters: tokenator.Output()) !void {
0,
);
};
defer context.allocator.free(srcbuf);
var writebuf = std.ArrayList(u8).init(context.allocator);
defer writebuf.deinit();
if (parameters.write_stdout) {
try write_tokenized_html(srcbuf, context.allocator, stdout);
try write_tokenized_html(srcbuf, context.allocator, stdout, parameters.full);
try stdout.writeByte(0);
} else {
const outname = try std.mem.join(context.allocator, ".", &[_][]const u8{ file_name, "html" });
@@ -407,7 +731,7 @@ fn tokenize_files(context: *TokCtx, parameters: tokenator.Output()) !void {
const output = try fs.cwd().createFile(outname, .{});
defer output.close();
try write_tokenized_html(srcbuf, context.allocator, output.writer());
try write_tokenized_html(srcbuf, context.allocator, output.writer(), parameters.full);
}
}
}
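
For reference, a minimal sketch of driving write_tokenized_html directly, assuming it lives in the same file; the sentinel-terminated read and the 1MB cap mirror the tokenize_files flow above, and the input path is hypothetical:

    const std = @import("std");

    pub fn main() !void {
        var gpa = std.heap.GeneralPurposeAllocator(.{}){};
        defer _ = gpa.deinit();
        const allocator = gpa.allocator();

        // the zig tokenizer wants a null-terminated buffer, hence the 0 sentinel
        const srcbuf = try std.fs.cwd().readFileAllocOptions(
            allocator,
            "foo.zig", // hypothetical input path
            1_000_000,
            null,
            @alignOf(u8),
            0,
        );
        defer allocator.free(srcbuf);

        // full = true wraps the <pre> fragment in html_preamble/html_epilogue
        try write_tokenized_html(srcbuf, allocator, std.io.getStdOut().writer(), true);
    }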