nice-data/src/tokenizer.zig
torque 258cf2ae83
parser: reintroduce space strings and change token parsing strategy
I don't think I have the wherewithal to write this full commit message
right now. Since it should be a long one.

Basically. `+ ` now is the string space concatenation operator because
that is a very common use case. It's essentially the soft-wrap
character.

Also, lines starting with -, +, >, and | will now try to tokenize as
map keys if they do not contain the following space. The motivation
here is numeric map keys. Specifically, +/- are numeric leaders.

To facilitate this change, own-line scalars are now prohibited. So, for
example:

    key: -1000

is still fine, but

    key:
        -1000

is no longer accepted.
2023-10-18 00:20:19 -07:00

355 lines
15 KiB
Zig

const std = @import("std");
const Diagnostics = @import("./parser.zig").Diagnostics;
pub const Error = error{
BadToken,
ExtraContent,
MixedIndentation,
TooMuchIndentation,
UnquantizedIndentation,
TrailingWhitespace,
IllegalTabWhitespaceInLine,
Impossible,
};
pub const DetectedIndentation = union(enum) {
unknown: void,
spaces: usize,
tabs: void,
};
pub const InlineItem = union(enum) {
empty: void,
scalar: []const u8,
line_string: []const u8,
space_string: []const u8,
concat_string: []const u8,
inline_list: []const u8,
inline_map: []const u8,
};
pub const LineContents = union(enum) {
comment: []const u8,
in_line: InlineItem,
list_item: InlineItem,
map_item: struct { key: []const u8, val: InlineItem },
};
pub const ShiftDirection = enum { indent, dedent, none };
pub const LineShift = union(ShiftDirection) {
indent: void,
// we can dedent multiple levels at once.
dedent: usize,
none: void,
};
pub const Line = struct {
shift: LineShift,
contents: LineContents,
raw: []const u8,
};
// buffer is expected to be either LineBuffer or FixedLineBuffer, but can
// technically be anything with a conformant interface.
pub fn LineTokenizer(comptime Buffer: type) type {
return struct {
buffer: Buffer,
index: usize = 0,
indentation: DetectedIndentation = .unknown,
last_indent: usize = 0,
pub fn finish(self: @This()) !void {
if (!self.buffer.empty()) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document has extra content or is missing the final LF character";
return error.ExtraContent;
}
}
pub fn next(self: *@This()) !?Line {
lineloop: while (try self.buffer.nextLine()) |raw_line| {
var indent: usize = 0;
for (raw_line, 0..) |char, idx| {
switch (char) {
' ' => {
switch (self.indentation) {
// There's a weird coupling here because we can't set this until
// all spaces have been consumed. I also thought about ignoring
// spaces on comment lines since those don't affect the
// relative indent/dedent, but then we would allow comments
// to ignore our indent quantum, which I dislike due to it making
// ugly documents.
.unknown => self.indentation = .{ .spaces = 0 },
.spaces => {},
.tabs => {
self.buffer.diag().line_offset = idx;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document contains mixed tab/space indentation";
return error.MixedIndentation;
},
}
},
'\t' => {
switch (self.indentation) {
.unknown => self.indentation = .tabs,
.spaces => {
self.buffer.diag().line_offset = idx;
self.buffer.diag().length = 1;
self.buffer.diag().message = "the document contains mixed tab/space indentation";
return error.MixedIndentation;
},
.tabs => {},
}
},
'\r' => {
return error.BadToken;
},
else => {
indent = idx;
break;
},
}
} else {
if (raw_line.len > 0) {
self.buffer.diag().line_offset = raw_line.len - 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
}
continue :lineloop;
}
var quantized: usize = if (self.indentation == .spaces) quant: {
if (self.indentation.spaces == 0) {
self.indentation.spaces = indent;
}
if (@rem(indent, self.indentation.spaces) != 0) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = indent;
self.buffer.diag().message = "this line contains incorrectly quantized indentation";
return error.UnquantizedIndentation;
}
break :quant @divExact(indent, self.indentation.spaces);
} else indent;
const shift: LineShift = if (quantized > self.last_indent) rel: {
if ((quantized - self.last_indent) > 1) {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = indent;
self.buffer.diag().message = "this line contains too much indentation";
return error.TooMuchIndentation;
}
break :rel .indent;
} else if (quantized < self.last_indent)
.{ .dedent = self.last_indent - quantized }
else
.none;
defer {
self.last_indent = quantized;
}
// update the diagnostics so that the parser can use them without
// knowing about the whitespace.
self.buffer.diag().line_offset = indent;
const line = raw_line[indent..];
// this should not be possible, as empty lines are caught earlier.
if (line.len == 0) return error.Impossible;
sigil: {
switch (line[0]) {
'#' => {
// Force comments to be followed by a space. We could
// allow #: to be interpreted as a map key, but I'm going
// to specifically forbid it instead.
if (line.len > 1 and line[1] != ' ') {
self.buffer.diag().line_offset += 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
return error.BadToken;
}
// simply lie about indentation when the line is a comment.
quantized = self.last_indent;
return .{
.shift = .none,
.contents = .{ .comment = line[1..] },
.raw = line,
};
},
'|', '>', '+' => {
if (line.len > 1 and line[1] != ' ') {
// we want to try parsing this as a map key
break :sigil;
}
return .{
.shift = shift,
.contents = .{ .in_line = try self.detectInlineItem(line) },
.raw = line,
};
},
'[', '{' => {
// these don't require being followed by a space, so they
// cannot be interpreted as starting a map key in any way.
return .{
.shift = shift,
.contents = .{ .in_line = try self.detectInlineItem(line) },
.raw = line,
};
},
'-' => {
if (line.len > 1 and line[1] != ' ') {
// we want to try parsing this as a map key
break :sigil;
}
// blindly add 2 here because an empty item cannot fail in
// the value, only if a bogus dedent has occurred
self.buffer.diag().line_offset += 2;
return if (line.len == 1) .{
.shift = shift,
.contents = .{ .list_item = .empty },
.raw = line,
} else .{
.shift = shift,
.contents = .{ .list_item = try self.detectInlineItem(line[2..]) },
.raw = line,
};
},
else => break :sigil,
}
}
for (line, 0..) |char, idx| {
if (char == ':') {
if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) {
self.buffer.diag().line_offset += idx - 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains space before the map key-value separator character ':'";
return error.TrailingWhitespace;
}
if (idx + 1 == line.len) {
self.buffer.diag().line_offset += idx + 1;
return .{
.shift = shift,
.contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
.raw = line,
};
}
if (line[idx + 1] != ' ') {
self.buffer.diag().line_offset += idx + 1;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
return error.BadToken;
}
return .{
.shift = shift,
.contents = .{ .map_item = .{
.key = line[0..idx],
.val = try self.detectInlineItem(line[idx + 2 ..]),
} },
.raw = line,
};
}
}
return .{
.shift = shift,
.contents = .{ .in_line = .{ .scalar = line } },
.raw = line,
};
}
return null;
}
// TODO: it's impossible to get the right diagnostic offset in this function at the moment
fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem {
if (buf.len == 0) return .empty;
const start = start: {
for (buf, 0..) |chr, idx|
if (chr == ' ')
continue
else if (chr == '\t')
return error.IllegalTabWhitespaceInLine
else
break :start idx;
return error.TrailingWhitespace;
};
switch (buf[start]) {
'>', '|', '+' => |char| {
if (buf.len - start > 1 and buf[start + 1] != ' ') {
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line is missing a space after the string start character";
return error.BadToken;
}
const slice: []const u8 = switch (buf[buf.len - 1]) {
' ', '\t' => {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
},
'|' => buf[start + @min(2, buf.len - start) .. buf.len - @intFromBool(buf.len - start > 1)],
else => buf[start + @min(2, buf.len - start) .. buf.len],
};
return switch (char) {
'>' => .{ .line_string = slice },
'+' => .{ .space_string = slice },
'|' => .{ .concat_string = slice },
else => unreachable,
};
},
'[' => {
if (buf.len - start < 2 or buf[buf.len - 1] != ']') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a inline list but does not end with the closing character ']'";
return error.BadToken;
}
// keep the closing ] for the inline parser
return .{ .inline_list = buf[start + 1 ..] };
},
'{' => {
if (buf.len - start < 2 or buf[buf.len - 1] != '}') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains a inline map but does not end with the closing character '}'";
return error.BadToken;
}
// keep the closing } for the inline parser
return .{ .inline_map = buf[start + 1 ..] };
},
else => {
if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') {
self.buffer.diag().line_offset = 0;
self.buffer.diag().length = 1;
self.buffer.diag().message = "this line contains trailing whitespace";
return error.TrailingWhitespace;
}
return .{ .scalar = buf[start..] };
},
}
}
};
}