Once again I have entangled two conceptually distinct changes into a single commit because demuxing them from the diff is too much work. Alas. Let's break it down. The simpler part of this change is to reintroduce "space strings" with a slightly fresh coat of paint. We now have 3 different types of string leaders that can be used together. So we now have: `|` directly concatenates this line with the previous line; `>` prepends an LF character before concatenation; `+` (NEW) prepends a single space character before concatenation. The `+` leader enables more æsthetic soft line wrapping than `|` because it doesn't require the use of leading or trailing whitespace to separate words, as long as lines are broken at word boundaries. Perhaps this is not as common a use case as I am making it, but I do like to hard wrap paragraphs in documents, so if anything, it's a feature for me. As I was considering what character to use for this leader, I realized that I wanted to be able to support numeric map keys, a la: `-1: negative one`, `0: zero`, `+1: positive one`. But previously this would not parse correctly, as the tokenizer would find `-` and expect it to be followed by a space to indicate a list item (and the additional string leader would cause the same problem with `+`). I wanted to support this use case, so the parser was changed to take a second pass on lines starting with the string leaders (`|`, `+`, and `>`) and the list item leader (`-`) if the leader has a non-space character following it. Note that this does not apply to the comment leader (`#` not followed by a space or a newline is a tokenization error) or to the inline list/map leaders (since those do not respect internal whitespace, there is no way to treat them unambiguously). To reduce the likelihood of confusing documents, scalars are no longer allowed to occupy their own line (the exception to this is if the document consists only of a scalar value).
Inline lists and maps can still occupy their own line, though I am considering changing this as well to force them to truly be inline. I think this change makes sense, as scalars are generally intended to represent an unbroken single-item serialization of some non-string value. In other words, # these two lines used to parse the same way key: 9001 # but now the following line is a parse error due to the scalar # occupying its own line key: 9001 # also, this still works, but it may be changed to be an error in # the future key: [ 9, 0, 0, 1 ] Inline maps have also been changed so that their keys can start with the now-unforbidden string leaders and list item leader characters.
355 lines
15 KiB
Zig
355 lines
15 KiB
Zig
const std = @import("std");
|
|
|
|
const Diagnostics = @import("./parser.zig").Diagnostics;
|
|
|
|
/// Errors the line tokenizer can report while splitting a document into lines.
pub const Error = error{
    /// A character appeared where the grammar does not allow it, e.g. a
    /// carriage return in the leading whitespace, a leader or `:` that is not
    /// followed by a space, or an inline list/map missing its closing
    /// character.
    BadToken,
    /// The buffer was not empty when `finish` was called.
    ExtraContent,
    /// Leading whitespace uses tabs where spaces were detected, or vice versa.
    MixedIndentation,
    /// A line is indented more than one level deeper than the previous line.
    TooMuchIndentation,
    /// Space indentation is not a multiple of the detected indent width.
    UnquantizedIndentation,
    /// A line ends in whitespace, or whitespace precedes the `:` separator.
    TrailingWhitespace,
    /// A tab appears in the whitespace preceding an inline value.
    IllegalTabWhitespaceInLine,
    /// An internal invariant was violated (an empty line reached tokenization).
    Impossible,
};
|
|
|
|
/// The indentation style observed so far in the document.
pub const DetectedIndentation = union(enum) {
    /// No indented line has been seen yet.
    unknown,
    /// Space indentation; the payload is the number of spaces per level, or 0
    /// while the width has not yet been measured.
    spaces: usize,
    /// Tab indentation.
    tabs,
};
|
|
|
|
/// The value portion of a line, classified by its leading character.
pub const InlineItem = union(enum) {
    /// No value is present.
    empty,
    /// A plain value with no leader.
    scalar: []const u8,
    /// String fragment introduced by the `>` leader.
    line_string: []const u8,
    /// String fragment introduced by the `+` leader.
    space_string: []const u8,
    /// String fragment introduced by the `|` leader.
    concat_string: []const u8,

    /// Text following the opening `[`; the closing `]` is retained.
    inline_list: []const u8,
    /// Text following the opening `{`; the closing `}` is retained.
    inline_map: []const u8,
};
|
|
|
|
/// What a single tokenized line holds.
pub const LineContents = union(enum) {
    /// Everything after the `#` comment character.
    comment: []const u8,

    /// A value occupying the line by itself.
    in_line: InlineItem,
    /// A `-` list entry and its (possibly empty) value.
    list_item: InlineItem,
    /// A `key: value` entry; `key` is the text before the `:` separator.
    map_item: struct {
        key: []const u8,
        val: InlineItem,
    },
};
|
|
|
|
/// Direction in which a line's indentation moved relative to the previous line.
pub const ShiftDirection = enum {
    indent,
    dedent,
    none,
};
|
|
|
|
/// Indentation change of a line relative to the previous one.
pub const LineShift = union(ShiftDirection) {
    /// The line is exactly one level deeper.
    indent,
    /// Number of levels dropped; we can dedent multiple levels at once.
    dedent: usize,
    /// The indentation level is unchanged.
    none,
};
|
|
|
|
/// One tokenized line of the document.
pub const Line = struct {
    /// How the indentation changed relative to the previous line.
    shift: LineShift,
    /// The classified contents of the line.
    contents: LineContents,
    /// The line text with its leading indentation removed.
    raw: []const u8,
};
|
|
|
|
/// Build a tokenizer type that splits a document into classified lines.
///
/// `Buffer` is expected to be either LineBuffer or FixedLineBuffer, but can
/// technically be anything with a conformant interface. The interface used
/// here consists of:
///   - `nextLine()`: yields the next line as an optional `[]const u8` (may fail)
///   - `empty()`: usable as a boolean condition
///   - `diag()`: yields an object with assignable `line_offset`, `length`, and
///     `message` fields, used to describe the location of an error
pub fn LineTokenizer(comptime Buffer: type) type {
    return struct {
        /// Source of lines and owner of the diagnostics object.
        buffer: Buffer,
        /// Not referenced by the tokenizer code in this struct.
        index: usize = 0,
        /// Indentation style detected so far.
        indentation: DetectedIndentation = .unknown,
        /// Indentation level of the most recent non-comment line.
        last_indent: usize = 0,

        /// Verify that the buffer has been fully consumed.
        ///
        /// Returns `error.ExtraContent` (with diagnostics filled in) when the
        /// buffer still reports content.
        pub fn finish(self: @This()) !void {
            if (!self.buffer.empty()) {
                self.buffer.diag().line_offset = 0;
                self.buffer.diag().length = 1;
                self.buffer.diag().message = "the document has extra content or is missing the final LF character";
                return error.ExtraContent;
            }
        }

        /// Tokenize and return the next non-empty line, or `null` once the
        /// buffer has no more lines.
        ///
        /// Empty lines are skipped. Lines consisting only of whitespace are an
        /// error. On every error path the diagnostics object of the buffer is
        /// updated to describe the offending span.
        pub fn next(self: *@This()) !?Line {
            lineloop: while (try self.buffer.nextLine()) |raw_line| {
                var indent: usize = 0;
                for (raw_line, 0..) |char, idx| {
                    switch (char) {
                        ' ' => {
                            switch (self.indentation) {
                                // There's a weird coupling here because we can't set this until
                                // all spaces have been consumed. I also thought about ignoring
                                // spaces on comment lines since those don't affect the
                                // relative indent/dedent, but then we would allow comments
                                // to ignore our indent quantum, which I dislike due to it making
                                // ugly documents.
                                .unknown => self.indentation = .{ .spaces = 0 },
                                .spaces => {},
                                .tabs => {
                                    self.buffer.diag().line_offset = idx;
                                    self.buffer.diag().length = 1;
                                    self.buffer.diag().message = "the document contains mixed tab/space indentation";
                                    return error.MixedIndentation;
                                },
                            }
                        },
                        '\t' => {
                            switch (self.indentation) {
                                .unknown => self.indentation = .tabs,
                                .spaces => {
                                    self.buffer.diag().line_offset = idx;
                                    self.buffer.diag().length = 1;
                                    self.buffer.diag().message = "the document contains mixed tab/space indentation";
                                    return error.MixedIndentation;
                                },
                                .tabs => {},
                            }
                        },
                        '\r' => {
                            // point the diagnostics at the offending character,
                            // like every other error path in this function.
                            self.buffer.diag().line_offset = idx;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line contains a carriage return character, which is not allowed";
                            return error.BadToken;
                        },
                        else => {
                            indent = idx;
                            break;
                        },
                    }
                } else {
                    // the loop ran to completion: the line is either empty or
                    // made up entirely of whitespace.
                    if (raw_line.len > 0) {
                        self.buffer.diag().line_offset = raw_line.len - 1;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains trailing whitespace";
                        return error.TrailingWhitespace;
                    }
                    continue :lineloop;
                }

                // Only quantize lines that actually have indentation. An
                // unindented line is level 0 regardless of the indent width,
                // and skipping it avoids dividing by a still-unmeasured width
                // of zero.
                var quantized: usize = if (self.indentation == .spaces and indent > 0) quant: {
                    if (self.indentation.spaces == 0) {
                        // the first indented line defines the indent width.
                        self.indentation.spaces = indent;
                    }
                    if (@rem(indent, self.indentation.spaces) != 0) {
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = indent;
                        self.buffer.diag().message = "this line contains incorrectly quantized indentation";
                        return error.UnquantizedIndentation;
                    }

                    break :quant @divExact(indent, self.indentation.spaces);
                } else indent;

                const shift: LineShift = if (quantized > self.last_indent) rel: {
                    if ((quantized - self.last_indent) > 1) {
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = indent;
                        self.buffer.diag().message = "this line contains too much indentation";
                        return error.TooMuchIndentation;
                    }
                    break :rel .indent;
                } else if (quantized < self.last_indent)
                    .{ .dedent = self.last_indent - quantized }
                else
                    .none;

                // runs on every exit from this iteration below this point,
                // using whatever value `quantized` holds at that time.
                defer {
                    self.last_indent = quantized;
                }

                // update the diagnostics so that the parser can use them without
                // knowing about the whitespace.
                self.buffer.diag().line_offset = indent;
                const line = raw_line[indent..];

                // this should not be possible, as empty lines are caught earlier.
                if (line.len == 0) return error.Impossible;

                sigil: {
                    switch (line[0]) {
                        '#' => {
                            // Force comments to be followed by a space. We could
                            // allow #: to be interpreted as a map key, but I'm going
                            // to specifically forbid it instead.
                            if (line.len > 1 and line[1] != ' ') {
                                self.buffer.diag().line_offset += 1;
                                self.buffer.diag().length = 1;
                                self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
                                return error.BadToken;
                            }

                            // simply lie about indentation when the line is a comment.
                            quantized = self.last_indent;
                            return .{
                                .shift = .none,
                                .contents = .{ .comment = line[1..] },
                                .raw = line,
                            };
                        },
                        '|', '>', '+' => {
                            if (line.len > 1 and line[1] != ' ') {
                                // we want to try parsing this as a map key
                                break :sigil;
                            }

                            return .{
                                .shift = shift,
                                .contents = .{ .in_line = try self.detectInlineItem(line) },
                                .raw = line,
                            };
                        },
                        '[', '{' => {
                            // these don't require being followed by a space, so they
                            // cannot be interpreted as starting a map key in any way.
                            return .{
                                .shift = shift,
                                .contents = .{ .in_line = try self.detectInlineItem(line) },
                                .raw = line,
                            };
                        },
                        '-' => {
                            if (line.len > 1 and line[1] != ' ') {
                                // we want to try parsing this as a map key
                                break :sigil;
                            }

                            // blindly add 2 here because an empty item cannot fail in
                            // the value, only if a bogus dedent has occurred
                            self.buffer.diag().line_offset += 2;

                            return if (line.len == 1) .{
                                .shift = shift,
                                .contents = .{ .list_item = .empty },
                                .raw = line,
                            } else .{
                                .shift = shift,
                                .contents = .{ .list_item = try self.detectInlineItem(line[2..]) },
                                .raw = line,
                            };
                        },
                        else => break :sigil,
                    }
                }

                // No leader matched: look for the first ':' to split the line
                // into a map key and value.
                for (line, 0..) |char, idx| {
                    if (char == ':') {
                        if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) {
                            self.buffer.diag().line_offset += idx - 1;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line contains space before the map key-value separator character ':'";
                            return error.TrailingWhitespace;
                        }

                        if (idx + 1 == line.len) {
                            // the ':' is the last character: a key with no value.
                            self.buffer.diag().line_offset += idx + 1;
                            return .{
                                .shift = shift,
                                .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
                                .raw = line,
                            };
                        }

                        if (line[idx + 1] != ' ') {
                            self.buffer.diag().line_offset += idx + 1;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
                            return error.BadToken;
                        }

                        return .{
                            .shift = shift,
                            .contents = .{ .map_item = .{
                                .key = line[0..idx],
                                .val = try self.detectInlineItem(line[idx + 2 ..]),
                            } },
                            .raw = line,
                        };
                    }
                }

                // no ':' anywhere in the line: the whole line is a scalar.
                return .{
                    .shift = shift,
                    .contents = .{ .in_line = .{ .scalar = line } },
                    .raw = line,
                };
            }
            return null;
        }

        /// Classify `buf` (the value portion of a line) as an `InlineItem`.
        ///
        /// An empty `buf` yields `.empty`. Leading spaces are skipped; a tab
        /// among them, or a `buf` consisting solely of spaces, is an error.
        // TODO: it's impossible to get the right diagnostic offset in this function at the moment
        fn detectInlineItem(self: @This(), buf: []const u8) Error!InlineItem {
            if (buf.len == 0) return .empty;

            const start = start: {
                for (buf, 0..) |chr, idx| {
                    switch (chr) {
                        ' ' => {},
                        '\t' => {
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line contains a tab character in the whitespace before a value";
                            return error.IllegalTabWhitespaceInLine;
                        },
                        else => break :start idx,
                    }
                }

                // nothing but spaces: report it like the other trailing
                // whitespace errors in this function.
                self.buffer.diag().line_offset = 0;
                self.buffer.diag().length = 1;
                self.buffer.diag().message = "this line contains trailing whitespace";
                return error.TrailingWhitespace;
            };

            switch (buf[start]) {
                '>', '|', '+' => |char| {
                    if (buf.len - start > 1 and buf[start + 1] != ' ') {
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line is missing a space after the string start character";
                        return error.BadToken;
                    }

                    const slice: []const u8 = switch (buf[buf.len - 1]) {
                        ' ', '\t' => {
                            self.buffer.diag().line_offset = 0;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line contains trailing whitespace";
                            return error.TrailingWhitespace;
                        },
                        // a final '|' is dropped from the string contents,
                        // unless it is the leader character itself.
                        '|' => buf[start + @min(2, buf.len - start) .. buf.len - @intFromBool(buf.len - start > 1)],
                        else => buf[start + @min(2, buf.len - start) .. buf.len],
                    };

                    return switch (char) {
                        '>' => .{ .line_string = slice },
                        '+' => .{ .space_string = slice },
                        '|' => .{ .concat_string = slice },
                        else => unreachable,
                    };
                },
                '[' => {
                    if (buf.len - start < 2 or buf[buf.len - 1] != ']') {
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains a inline list but does not end with the closing character ']'";
                        return error.BadToken;
                    }

                    // keep the closing ] for the inline parser
                    return .{ .inline_list = buf[start + 1 ..] };
                },
                '{' => {
                    if (buf.len - start < 2 or buf[buf.len - 1] != '}') {
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains a inline map but does not end with the closing character '}'";
                        return error.BadToken;
                    }

                    // keep the closing } for the inline parser
                    return .{ .inline_map = buf[start + 1 ..] };
                },
                else => {
                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') {
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains trailing whitespace";
                        return error.TrailingWhitespace;
                    }

                    return .{ .scalar = buf[start..] };
                },
            }
        }
    };
}
|