// nice-data/src/tokenizer.zig

const std = @import("std");

const Diagnostics = @import("./parser.zig").Diagnostics;

/// Errors raised by the tokenizer itself while splitting a document into lines.
pub const Error = error{
    /// A character appeared where the grammar does not allow it: a carriage
    /// return in the indentation, a missing space after `#`, `:` or a string
    /// start character, or an inline list/map without its closing bracket.
    BadToken,
    /// The buffer still held unconsumed data when `finish` was called.
    ExtraContent,
    /// Both tabs and spaces were used for indentation within one document.
    MixedIndentation,
    /// A line is indented more than one level deeper than the previous line.
    TooMuchIndentation,
    /// A space-indented line is not a whole multiple of the detected
    /// indentation quantum.
    UnquantizedIndentation,
    /// A line consists only of whitespace, ends in whitespace, or has
    /// whitespace directly before the `:` map separator.
    TrailingWhitespace,
    /// A tab was found in the whitespace that precedes an inline value.
    IllegalTabWhitespaceInLine,
    /// A line was empty after its indentation was stripped, which the earlier
    /// checks are expected to rule out.
    Impossible,
};

/// The indentation style that the tokenizer has observed so far.
pub const DetectedIndentation = union(enum) {
    /// No indented line has been seen yet.
    unknown,
    /// Space indentation; the payload is the number of spaces that make up one
    /// level. The tokenizer uses 0 as a placeholder until the first indented
    /// line has been measured.
    spaces: usize,
    /// Tab indentation.
    tabs,
};

/// The value portion of a line, classified by its leading character. Every
/// slice payload points into the line it was taken from.
pub const InlineItem = union(enum) {
    /// There is no value on the line.
    empty,
    /// A plain value with no recognized leading sigil.
    scalar: []const u8,
    /// The body of a string introduced by `>`.
    line_string: []const u8,
    /// The body of a string introduced by `+`.
    space_string: []const u8,
    /// The body of a string introduced by `|`.
    concat_string: []const u8,

    /// Everything after the opening `[`, including the closing `]`.
    inline_list: []const u8,
    /// Everything after the opening `{`, including the closing `}`.
    inline_map: []const u8,
};

/// What a single tokenized line holds once its indentation has been removed.
pub const LineContents = union(enum) {
    /// A line starting with `#`; the payload is everything after the `#`.
    comment: []const u8,

    /// A line that is only a value (string, inline list/map, or scalar).
    in_line: InlineItem,
    /// A line starting with `-`; the payload is the value that follows it.
    list_item: InlineItem,
    /// A `key: value` line; `key` is the text before the `:` and `val` is the
    /// value after it (`.empty` when the line ends at the `:`).
    map_item: struct { key: []const u8, val: InlineItem },
};

/// How a line's indentation level relates to the level of the line before it.
pub const ShiftDirection = enum { indent, dedent, none };

/// The change in indentation level between a line and the one before it.
pub const LineShift = union(ShiftDirection) {
    /// One level deeper; the tokenizer rejects deeper jumps.
    indent,
    /// Shallower by the given number of levels, since several levels can be
    /// closed at once.
    dedent: usize,
    /// Same level as before.
    none,
};

/// One tokenized line of the document.
pub const Line = struct {
    /// Indentation change relative to the previous line.
    shift: LineShift,
    /// The classified contents of the line.
    contents: LineContents,
    /// The text of the line with its leading indentation removed.
    raw: []const u8,
};

/// Builds a line tokenizer type that pulls raw lines from `Buffer`.
///
/// `Buffer` is expected to be either LineBuffer or FixedLineBuffer, but can
/// technically be anything with a conformant interface. The tokenizer only
/// calls `nextLine()`, `empty()` and `diag()` on it.
pub fn LineTokenizer(comptime Buffer: type) type {
    return struct {
        /// Source of raw lines; also provides the diagnostics record via `diag()`.
        buffer: Buffer,
        /// Not referenced by any method of this struct.
        index: usize = 0,
        /// Indentation style detected so far in the document.
        indentation: DetectedIndentation = .unknown,
        /// Indentation level of the most recently processed non-comment line.
        last_indent: usize = 0,

        /// Checks that the buffer has been fully consumed. Fills in the
        /// diagnostics and returns `error.ExtraContent` if it has not.
        pub fn finish(self: @This()) !void {
            if (!self.buffer.empty()) {
                self.buffer.diag().line_offset = 0;
                self.buffer.diag().length = 1;
                self.buffer.diag().message = "the document has extra content or is missing the final LF character";
                return error.ExtraContent;
            }
        }

        /// Returns the next tokenized line, or `null` once the buffer yields no
        /// more lines. Completely empty lines are skipped. Errors from
        /// `Buffer.nextLine` are propagated; tokenizer errors fill in the
        /// buffer's diagnostics before returning.
        pub fn next(self: *@This()) !?Line {
            lineloop: while (try self.buffer.nextLine()) |raw_line| {
                // Measure the leading whitespace and validate the indentation style.
                var indent: usize = 0;
                for (raw_line, 0..) |char, idx| {
                    switch (char) {
                        ' ' => {
                            switch (self.indentation) {
                                // There's a weird coupling here because we can't set this until
                                // all spaces have been consumed. I also thought about ignoring
                                // spaces on comment lines since those don't affect the
                                // relative indent/dedent, but then we would allow comments
                                // to ignore our indent quantum, which I dislike due to it making
                                // ugly documents.
                                .unknown => self.indentation = .{ .spaces = 0 },
                                .spaces => {},
                                .tabs => {
                                    self.buffer.diag().line_offset = idx;
                                    self.buffer.diag().length = 1;
                                    self.buffer.diag().message = "the document contains mixed tab/space indentation";
                                    return error.MixedIndentation;
                                },
                            }
                        },
                        '\t' => {
                            switch (self.indentation) {
                                .unknown => self.indentation = .tabs,
                                .spaces => {
                                    self.buffer.diag().line_offset = idx;
                                    self.buffer.diag().length = 1;
                                    self.buffer.diag().message = "the document contains mixed tab/space indentation";
                                    return error.MixedIndentation;
                                },
                                .tabs => {},
                            }
                        },
                        '\r' => {
                            // Point the diagnostics at the offending character so the
                            // caller does not report stale values from an earlier line.
                            self.buffer.diag().line_offset = idx;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line contains a carriage return character in its indentation";
                            return error.BadToken;
                        },
                        else => {
                            indent = idx;
                            break;
                        },
                    }
                } else {
                    // The loop ran off the end: the line is empty or whitespace-only.
                    if (raw_line.len > 0) {
                        self.buffer.diag().line_offset = raw_line.len - 1;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains trailing whitespace";
                        return error.TrailingWhitespace;
                    }
                    continue :lineloop;
                }

                // Convert the raw character count into an indentation level.
                // `var` because comment lines overwrite it further down.
                var quantized: usize = if (self.indentation == .spaces) quant: {
                    if (self.indentation.spaces == 0) {
                        // The quantum is still the placeholder. That state can
                        // outlive the line that set it if that line was rejected,
                        // so an unindented line must not be used as the quantum:
                        // it would make the division below divide by zero.
                        if (indent == 0) break :quant 0;
                        self.indentation.spaces = indent;
                    }
                    if (@rem(indent, self.indentation.spaces) != 0) {
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = indent;
                        self.buffer.diag().message = "this line contains incorrectly quantized indentation";
                        return error.UnquantizedIndentation;
                    }

                    break :quant @divExact(indent, self.indentation.spaces);
                } else indent;

                const shift: LineShift = if (quantized > self.last_indent) rel: {
                    if ((quantized - self.last_indent) > 1) {
                        self.buffer.diag().line_offset = 0;
                        self.buffer.diag().length = indent;
                        self.buffer.diag().message = "this line contains too much indentation";
                        return error.TooMuchIndentation;
                    }
                    break :rel .indent;
                } else if (quantized < self.last_indent)
                    .{ .dedent = self.last_indent - quantized }
                else
                    .none;

                // Runs on every exit from this iteration below, and reads
                // `quantized` at that point, so the comment branch can cancel
                // the level change by resetting it.
                defer {
                    self.last_indent = quantized;
                }

                // update the diagnostics so that the parser can use them without
                // knowing about the whitespace.
                self.buffer.diag().line_offset = indent;
                const line = raw_line[indent..];

                // this should not be possible, as empty lines are caught earlier.
                if (line.len == 0) return error.Impossible;

                // Lines that begin with a sigil are handled here. Breaking out of
                // this block means "treat the line as a map item or scalar".
                sigil: {
                    switch (line[0]) {
                        '#' => {
                            // Force comments to be followed by a space. We could
                            // allow #: to be interpreted as a map key, but I'm going
                            // to specifically forbid it instead.
                            if (line.len > 1 and line[1] != ' ') {
                                self.buffer.diag().line_offset += 1;
                                self.buffer.diag().length = 1;
                                self.buffer.diag().message = "this line is missing a space after the start of comment character '#'";
                                return error.BadToken;
                            }

                            // simply lie about indentation when the line is a comment.
                            quantized = self.last_indent;
                            return .{
                                .shift = .none,
                                .contents = .{ .comment = line[1..] },
                                .raw = line,
                            };
                        },
                        '|', '>', '+' => {
                            if (line.len > 1 and line[1] != ' ') {
                                // we want to try parsing this as a map key
                                break :sigil;
                            }

                            return .{
                                .shift = shift,
                                .contents = .{ .in_line = try self.detectInlineItem(line, indent) },
                                .raw = line,
                            };
                        },
                        '[', '{' => {
                            // these don't require being followed by a space, so they
                            // cannot be interpreted as starting a map key in any way.
                            return .{
                                .shift = shift,
                                .contents = .{ .in_line = try self.detectInlineItem(line, indent) },
                                .raw = line,
                            };
                        },
                        '-' => {
                            if (line.len > 1 and line[1] != ' ') {
                                // we want to try parsing this as a map key
                                break :sigil;
                            }

                            // blindly add 2 here because an empty item cannot fail in
                            // the value, only if a bogus dedent has occurred
                            self.buffer.diag().line_offset += 2;

                            return if (line.len == 1) .{
                                .shift = shift,
                                .contents = .{ .list_item = .empty },
                                .raw = line,
                            } else .{
                                .shift = shift,
                                .contents = .{ .list_item = try self.detectInlineItem(line[2..], indent + 2) },
                                .raw = line,
                            };
                        },
                        else => break :sigil,
                    }
                }

                // Look for the first ':' to split the line into a map key and value.
                for (line, 0..) |char, idx| {
                    if (char == ':') {
                        if (idx > 0 and (line[idx - 1] == ' ' or line[idx - 1] == '\t')) {
                            self.buffer.diag().line_offset += idx - 1;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line contains space before the map key-value separator character ':'";
                            return error.TrailingWhitespace;
                        }

                        if (idx + 1 == line.len) {
                            self.buffer.diag().line_offset += idx + 1;
                            return .{
                                .shift = shift,
                                .contents = .{ .map_item = .{ .key = line[0..idx], .val = .empty } },
                                .raw = line,
                            };
                        }

                        if (line[idx + 1] != ' ') {
                            self.buffer.diag().line_offset += idx + 1;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line is missing a space after the map key-value separator character ':'";
                            return error.BadToken;
                        }

                        // The value starts after ": ", i.e. `idx + 2` bytes into
                        // `line`, which itself starts `indent` bytes into the raw line.
                        return .{
                            .shift = shift,
                            .contents = .{ .map_item = .{
                                .key = line[0..idx],
                                .val = try self.detectInlineItem(line[idx + 2 ..], indent + idx + 2),
                            } },
                            .raw = line,
                        };
                    }
                }

                // NOTE(review): unlike values that go through detectInlineItem,
                // this fallback does not reject trailing whitespace. Confirm
                // whether that is intended.
                return .{
                    .shift = shift,
                    .contents = .{ .in_line = .{ .scalar = line } },
                    .raw = line,
                };
            }
            return null;
        }

        /// Classifies the value text `buf` by its first non-space character.
        ///
        /// `offset` is the position of `buf[0]` within the raw line. It is used
        /// only to point the diagnostics at the offending character when an
        /// error is returned; the diagnostics are left untouched on success.
        /// An empty `buf` yields `.empty`.
        fn detectInlineItem(self: @This(), buf: []const u8, offset: usize) Error!InlineItem {
            if (buf.len == 0) return .empty;

            // Skip leading spaces; a tab here is an error, as is a value made
            // up of nothing but spaces.
            const start = start: {
                for (buf, 0..) |chr, idx| {
                    switch (chr) {
                        ' ' => continue,
                        '\t' => {
                            self.buffer.diag().line_offset = offset + idx;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line contains a tab character where only spaces are permitted";
                            return error.IllegalTabWhitespaceInLine;
                        },
                        else => break :start idx,
                    }
                }

                self.buffer.diag().line_offset = offset + buf.len - 1;
                self.buffer.diag().length = 1;
                self.buffer.diag().message = "this line contains trailing whitespace";
                return error.TrailingWhitespace;
            };

            switch (buf[start]) {
                '>', '|', '+' => |char| {
                    if (buf.len - start > 1 and buf[start + 1] != ' ') {
                        self.buffer.diag().line_offset = offset + start + 1;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line is missing a space after the string start character";
                        return error.BadToken;
                    }

                    // The body starts after the sigil and its space. A trailing
                    // '|' is dropped from the body, unless that '|' is the
                    // sigil itself.
                    const slice: []const u8 = switch (buf[buf.len - 1]) {
                        ' ', '\t' => {
                            self.buffer.diag().line_offset = offset + buf.len - 1;
                            self.buffer.diag().length = 1;
                            self.buffer.diag().message = "this line contains trailing whitespace";
                            return error.TrailingWhitespace;
                        },
                        '|' => buf[start + @min(2, buf.len - start) .. buf.len - @intFromBool(buf.len - start > 1)],
                        else => buf[start + @min(2, buf.len - start) .. buf.len],
                    };

                    return switch (char) {
                        '>' => .{ .line_string = slice },
                        '+' => .{ .space_string = slice },
                        '|' => .{ .concat_string = slice },
                        else => unreachable,
                    };
                },
                '[' => {
                    if (buf.len - start < 2 or buf[buf.len - 1] != ']') {
                        // Point at the last character, where ']' was expected.
                        self.buffer.diag().line_offset = offset + buf.len - 1;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains a inline list but does not end with the closing character ']'";
                        return error.BadToken;
                    }

                    // keep the closing ] for the inline parser
                    return .{ .inline_list = buf[start + 1 ..] };
                },
                '{' => {
                    if (buf.len - start < 2 or buf[buf.len - 1] != '}') {
                        // Point at the last character, where '}' was expected.
                        self.buffer.diag().line_offset = offset + buf.len - 1;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains a inline map but does not end with the closing character '}'";
                        return error.BadToken;
                    }

                    // keep the closing } for the inline parser
                    return .{ .inline_map = buf[start + 1 ..] };
                },
                else => {
                    if (buf[buf.len - 1] == ' ' or buf[buf.len - 1] == '\t') {
                        self.buffer.diag().line_offset = offset + buf.len - 1;
                        self.buffer.diag().length = 1;
                        self.buffer.diag().message = "this line contains trailing whitespace";
                        return error.TrailingWhitespace;
                    }

                    return .{ .scalar = buf[start..] };
                },
            }
        }
    };
}