From 2d099d931ed256805ec5380511ac306c4e53651c Mon Sep 17 00:00:00 2001
From: torque
Date: Mon, 4 Sep 2023 15:22:02 -0700
Subject: [PATCH] start doodling some wrapper ideas

The main point of interest here is the allocator wrapper, which was
kind of annoying to implement. Since the Zig allocator interface expects
to know the size of all allocations, we have to secretly allocate space
for that in front of the pointer that we return. This ends up causing a
little bit of slightly nasty pointer manipulation. I took longer to
debug this than I would like to admit from a bug that ended up being
caused by storing a pointer to a stack copy of an Allocator object.
Whoops.

The other interesting thing I think (though much simpler) is
representing the options bitfield as a packed struct. I think this makes
initializing the parser options a lot nicer than the C variant.
---
 src/cmark.zig | 437 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 437 insertions(+)

diff --git a/src/cmark.zig b/src/cmark.zig
index e69de29..241d0d2 100644
--- a/src/cmark.zig
+++ b/src/cmark.zig
@@ -0,0 +1,437 @@
+const std = @import("std");
+
+pub const cmark = @cImport({
+    @cInclude("cmark.h");
+});
+
+const Failed = error.Failed;
+
+/// Node kinds, mirroring the `CMARK_NODE_*` constants from cmark.h.
+pub const NodeType = enum(c_int) {
+    none = cmark.CMARK_NODE_NONE,
+
+    // Block
+    document = cmark.CMARK_NODE_DOCUMENT,
+    block_quote = cmark.CMARK_NODE_BLOCK_QUOTE,
+    list = cmark.CMARK_NODE_LIST,
+    item = cmark.CMARK_NODE_ITEM,
+    code_block = cmark.CMARK_NODE_CODE_BLOCK,
+    html_block = cmark.CMARK_NODE_HTML_BLOCK,
+    custom_block = cmark.CMARK_NODE_CUSTOM_BLOCK,
+    paragraph = cmark.CMARK_NODE_PARAGRAPH,
+    heading = cmark.CMARK_NODE_HEADING,
+    thematic_break = cmark.CMARK_NODE_THEMATIC_BREAK,
+
+    // Inline
+    text = cmark.CMARK_NODE_TEXT,
+    softbreak = cmark.CMARK_NODE_SOFTBREAK,
+    linebreak = cmark.CMARK_NODE_LINEBREAK,
+    code = cmark.CMARK_NODE_CODE,
+    html_inline = cmark.CMARK_NODE_HTML_INLINE,
+    custom_inline = cmark.CMARK_NODE_CUSTOM_INLINE,
+    emph = cmark.CMARK_NODE_EMPH,
+    strong = cmark.CMARK_NODE_STRONG,
+    link = cmark.CMARK_NODE_LINK,
+    image = cmark.CMARK_NODE_IMAGE,
+};
+
+/// List kinds, mirroring the `CMARK_*_LIST` constants from cmark.h.
+pub const ListType = enum(c_int) {
+    no_list = cmark.CMARK_NO_LIST,
+    bullet_list = cmark.CMARK_BULLET_LIST,
+    ordered_list = cmark.CMARK_ORDERED_LIST,
+};
+
+/// Delimiter kinds, mirroring `CMARK_*_DELIM`. Only for ordered lists.
+pub const DelimType = enum(c_int) {
+    no_delim = cmark.CMARK_NO_DELIM,
+    period_delim = cmark.CMARK_PERIOD_DELIM,
+    paren_delim = cmark.CMARK_PAREN_DELIM,
+};
+
+/// Iterator events, mirroring the `CMARK_EVENT_*` constants from cmark.h.
+pub const NodeIteratorEvent = enum(c_int) {
+    none = cmark.CMARK_EVENT_NONE,
+    done = cmark.CMARK_EVENT_DONE,
+    enter = cmark.CMARK_EVENT_ENTER,
+    exit = cmark.CMARK_EVENT_EXIT,
+};
+
+/// cmark option flags as a packed struct whose bit layout matches the
+/// `CMARK_OPT_*` constants, so a value can be `@bitCast` to the integer the
+/// C API takes. Fields starting with `_` are unused or deprecated bits.
+pub const CmarkOptions = packed struct(u32) {
+    _skip_0: bool = false, // for some reason 1 << 0 is skipped (oversight?)
+
+    include_sourcepos: bool = false, // index 1
+    softbreaks_as_hardbreaks: bool = false, // index 2
+
+    _skip_safe: bool = false, // index 3; deprecated, no effect
+
+    softbreaks_as_spaces: bool = false, // index 4
+
+    _skip_1: u3 = 0, // skip indices 5, 6 and 7
+    _skip_normalize: bool = false, // index 8; deprecated, no effect
+
+    validate_utf8: bool = false, // index 9
+    smart_quotes_and_dashes: bool = false, // index 10
+
+    _skip_2: u6 = 0, // skip indices 11, 12, 13, 14, 15, 16
+
+    allow_unsafe_html: bool = false, // index 17
+
+    _padding: u14 = 0, // no other options
+};
+
+// Check at compile time that each named option sits on the bit cmark.h expects.
+comptime {
+    std.debug.assert(@as(u32, @bitCast(CmarkOptions{ .include_sourcepos = true })) == cmark.CMARK_OPT_SOURCEPOS);
+    std.debug.assert(@as(u32, @bitCast(CmarkOptions{ .softbreaks_as_hardbreaks = true })) == cmark.CMARK_OPT_HARDBREAKS);
+    std.debug.assert(@as(u32, @bitCast(CmarkOptions{ .softbreaks_as_spaces = true })) == cmark.CMARK_OPT_NOBREAKS);
+    std.debug.assert(@as(u32, @bitCast(CmarkOptions{ .validate_utf8 = true })) == cmark.CMARK_OPT_VALIDATE_UTF8);
+    std.debug.assert(@as(u32, @bitCast(CmarkOptions{ .smart_quotes_and_dashes = true })) == cmark.CMARK_OPT_SMART);
+    std.debug.assert(@as(u32, @bitCast(CmarkOptions{ .allow_unsafe_html = true })) == cmark.CMARK_OPT_UNSAFE);
+}
+
+/// Bookkeeping stored directly in front of every pointer handed to cmark.
+///
+/// The C free/realloc callbacks do not receive the allocation size, but the
+/// Zig allocator interface needs it. Each allocation therefore starts with
+/// `size` (the number of payload bytes), and cmark is only given the address
+/// of `tip`, the first payload byte.
+const AllocHeader = extern struct {
+    size: usize,
+    tip: u8,
+
+    /// Alignment every backing allocation is requested with.
+    const alignment = @alignOf(AllocHeader);
+    /// Number of bytes in front of the payload.
+    const prefix_len = @offsetOf(AllocHeader, "tip");
+    /// A whole backing allocation: size prefix followed by the payload.
+    const Block = []align(alignment) u8;
+
+    /// View the start of a backing allocation as its header.
+    inline fn fromAllocatedSlice(slice: Block) *AllocHeader {
+        return @ptrCast(slice.ptr);
+    }
+
+    /// Recover the header from the payload pointer cmark was given.
+    inline fn fromTipPointer(tip: *u8) *AllocHeader {
+        return @fieldParentPtr(AllocHeader, "tip", tip);
+    }
+
+    /// Rebuild the backing slice from a payload pointer. The alignment is part
+    /// of the slice type because `Allocator.free`/`realloc` use it, and it has
+    /// to match what `alignedAlloc` was asked for.
+    inline fn fullAllocFromTip(tip: *anyopaque) Block {
+        const hdr = fromTipPointer(@ptrCast(tip));
+        const mem: [*]align(alignment) u8 = @ptrCast(hdr);
+        return mem[0..(prefix_len + hdr.size)];
+    }
+};
+
+/// `calloc` callback for cmark: returns `size * count` zeroed bytes, or null
+/// when `ctx` is null, the request overflows `usize`, or the allocator fails.
+fn cmarkCalloc(ctx: ?*anyopaque, size: usize, count: usize) callconv(.C) ?*anyopaque {
+    const allocator: *const std.mem.Allocator = @ptrCast(@alignCast(ctx orelse return null));
+    // Fail the request instead of wrapping around on overflow.
+    const mem_size = std.math.mul(usize, size, count) catch return null;
+    const total = std.math.add(usize, AllocHeader.prefix_len, mem_size) catch return null;
+    const raw_mem = allocator.alignedAlloc(u8, AllocHeader.alignment, total) catch return null;
+
+    // cmark does rely on the allocated memory being zeroed, unfortunately. The Zig
+    // allocator interface always fills with undefined, and this is not configurable.
+    @memset(raw_mem, 0);
+
+    const header = AllocHeader.fromAllocatedSlice(raw_mem);
+    header.size = mem_size;
+    return &header.tip;
+}
+
+/// `realloc` callback for cmark: resizes the payload behind `mem` to
+/// `new_size` bytes. A null `mem` is forwarded to `cmarkCalloc`. Returns null
+/// when `ctx` is null, the size overflows `usize`, or the allocator fails.
+fn cmarkRealloc(ctx: ?*anyopaque, mem: ?*anyopaque, new_size: usize) callconv(.C) ?*anyopaque {
+    const allocator: *const std.mem.Allocator = @ptrCast(@alignCast(ctx orelse return null));
+    const tip = mem orelse return cmarkCalloc(ctx, 1, new_size);
+
+    // The backing allocation holds the size prefix as well as the payload, so
+    // it must be resized to prefix + new_size, while the header records only
+    // the payload size (the same convention cmarkCalloc uses).
+    const total = std.math.add(usize, AllocHeader.prefix_len, new_size) catch return null;
+    const raw_mem = allocator.realloc(AllocHeader.fullAllocFromTip(tip), total) catch
+        return null;
+
+    const header = AllocHeader.fromAllocatedSlice(raw_mem);
+    header.size = new_size;
+    return &header.tip;
+}
+
+/// `free` callback for cmark: releases a block returned by `cmarkCalloc` or
+/// `cmarkRealloc`. Does nothing when `ctx` or `mem` is null.
+fn cmarkFree(ctx: ?*anyopaque, mem: ?*anyopaque) callconv(.C) void {
+    const allocator: *const std.mem.Allocator = @ptrCast(@alignCast(ctx orelse return));
+    const tip = mem orelse return;
+    allocator.free(AllocHeader.fullAllocFromTip(tip));
+}
+
+/// Build the `cmark_mem` callback table on top of a Zig allocator. The table
+/// stores `allocator` itself as the context pointer, so the pointee must stay
+/// at the same address for as long as cmark may call the callbacks; never
+/// pass a pointer to a short-lived stack copy.
+fn wrapCmarkAllocator(allocator: *const std.mem.Allocator) cmark.cmark_mem {
+    return .{
+        .ctx = @constCast(allocator),
+        .calloc = cmarkCalloc,
+        .realloc = cmarkRealloc,
+        .free = cmarkFree,
+    };
+}
+
+/// Convert a C string returned by cmark into a slice, mapping a null
+/// pointer to `error.Failed`.
+fn spanOrFail(c_str: [*c]const u8) ![:0]const u8 {
+    if (c_str == null) return error.Failed;
+    return std.mem.span(c_str);
+}
+
+/// A cmark node tagged with its kind. Kinds that have dedicated accessors
+/// carry a specific wrapper type; the rest carry `CmarkOpaqueNode`.
+const CmarkNode = union(enum) {
+    document: *CmarkOpaqueNode,
+    heading: *CmarkHeadingNode,
+    block_quote: *CmarkOpaqueNode,
+
+    bullet_list: *CmarkOpaqueNode,
+    ordered_list: *CmarkOrderedListNode,
+    item: *CmarkOpaqueNode,
+
+    code_block: *CmarkCodeBlockNode,
+    html_block: *CmarkOpaqueNode,
+    custom_block: *CmarkOpaqueNode,
+    thematic_break: *CmarkOpaqueNode,
+
+    paragraph: *CmarkOpaqueNode,
+    text: *CmarkOpaqueNode,
+    softbreak: *CmarkOpaqueNode,
+    linebreak: *CmarkOpaqueNode,
+
+    html_inline: *CmarkOpaqueNode,
+    custom_inline: *CmarkOpaqueNode,
+
+    code: *CmarkOpaqueNode,
+    emph: *CmarkOpaqueNode,
+    strong: *CmarkOpaqueNode,
+
+    link: *CmarkOpaqueNode,
+    image: *CmarkOpaqueNode,
+
+    /// Wrap a raw cmark node pointer. Returns `error.Failed` when `c_node`
+    /// is null. A list that is not a bullet list is reported as ordered.
+    fn fromCNode(c_node: ?*cmark.cmark_node) !CmarkNode {
+        const node = c_node orelse return error.Failed;
+
+        switch (@as(NodeType, @enumFromInt(cmark.cmark_node_get_type(node)))) {
+            .none => @panic("none? none??????"),
+            .heading => return .{ .heading = @ptrCast(node) },
+            .document => return .{ .document = @ptrCast(node) },
+            .block_quote => return .{ .block_quote = @ptrCast(node) },
+            .list => {
+                if (cmark.cmark_node_get_list_type(node) == cmark.CMARK_BULLET_LIST)
+                    return .{ .bullet_list = @ptrCast(node) }
+                else
+                    return .{ .ordered_list = @ptrCast(node) };
+            },
+            .item => return .{ .item = @ptrCast(node) },
+            .code_block => return .{ .code_block = @ptrCast(node) },
+            .html_block => return .{ .html_block = @ptrCast(node) },
+            .custom_block => return .{ .custom_block = @ptrCast(node) },
+            .paragraph => return .{ .paragraph = @ptrCast(node) },
+            .thematic_break => return .{ .thematic_break = @ptrCast(node) },
+            .text => return .{ .text = @ptrCast(node) },
+            .softbreak => return .{ .softbreak = @ptrCast(node) },
+            .linebreak => return .{ .linebreak = @ptrCast(node) },
+            .code => return .{ .code = @ptrCast(node) },
+            .html_inline => return .{ .html_inline = @ptrCast(node) },
+            .custom_inline => return .{ .custom_inline = @ptrCast(node) },
+            .emph => return .{ .emph = @ptrCast(node) },
+            .strong => return .{ .strong = @ptrCast(node) },
+            .link => return .{ .link = @ptrCast(node) },
+            .image => return .{ .image = @ptrCast(node) },
+        }
+    }
+};
+
+/// Accessors shared by every node wrapper. Pointers to `Self` are cast to
+/// cmark node pointers, so `Self` must only ever wrap a `cmark_node`.
+pub fn CmarkNodeCommon(comptime Self: type) type {
+    return struct {
+        /// Return the user data pointer attached to the node, if any.
+        pub fn getUserData(self: *Self) ?*anyopaque {
+            return cmark.cmark_node_get_user_data(@ptrCast(self));
+        }
+
+        /// Attach `user_data` to the node. Returns true when cmark reports
+        /// success.
+        pub fn setUserData(self: *Self, user_data: ?*anyopaque) bool {
+            return cmark.cmark_node_set_user_data(@ptrCast(self), user_data) == 1;
+        }
+    };
+}
+
+/// Accessors for nodes that carry literal string content.
+pub fn CmarkBlockNodeContents(comptime Self: type) type {
+    return struct {
+        /// Return the node's literal content, or `error.Failed` when cmark
+        /// returns a null string. The slice points at memory returned by
+        /// cmark and must not be freed by the caller.
+        pub fn getContent(self: *Self) ![:0]const u8 {
+            return spanOrFail(cmark.cmark_node_get_literal(@ptrCast(self)));
+        }
+
+        /// Replace the node's literal content. Returns true when cmark
+        /// reports success.
+        pub fn setContent(self: *Self, new: [:0]const u8) bool {
+            return cmark.cmark_node_set_literal(@ptrCast(self), new.ptr) == 1;
+        }
+    };
+}
+
+/// Wrapper for node kinds that have no dedicated accessors.
+pub const CmarkOpaqueNode = opaque {
+    pub usingnamespace CmarkNodeCommon(@This());
+};
+
+/// Wrapper for heading nodes.
+pub const CmarkHeadingNode = opaque {
+    /// Return the heading level. `u3` is used because heading levels run
+    /// from 1 to 6, which a signed 3-bit integer cannot represent.
+    pub fn getLevel(self: *CmarkHeadingNode) u3 {
+        return @intCast(cmark.cmark_node_get_heading_level(@ptrCast(self)));
+    }
+
+    /// Set the heading level. Returns `error.Failed` when cmark rejects it.
+    pub fn setLevel(self: *CmarkHeadingNode, level: u3) !void {
+        if (cmark.cmark_node_set_heading_level(@ptrCast(self), level) != 1)
+            return error.Failed;
+    }
+
+    pub usingnamespace CmarkNodeCommon(@This());
+};
+
+/// Wrapper for ordered list nodes.
+pub const CmarkOrderedListNode = opaque {
+    /// Return the delimiter used after the item numbers.
+    pub fn getDelimeter(self: *CmarkOrderedListNode) DelimType {
+        return @enumFromInt(cmark.cmark_node_get_list_delim(@ptrCast(self)));
+    }
+
+    /// Correctly spelled alias of `getDelimeter`, which is kept for
+    /// compatibility.
+    pub const getDelimiter = getDelimeter;
+
+    /// Set the delimiter used after the item numbers.
+    pub fn setDelimiter(self: *CmarkOrderedListNode, new: DelimType) !void {
+        // @intCast bridges the enum's c_int tag and whatever integer type
+        // translate-c picked for the C enum parameter.
+        if (cmark.cmark_node_set_list_delim(@ptrCast(self), @intCast(@intFromEnum(new))) != 1)
+            return error.Failed;
+    }
+
+    /// Return the number the list starts at.
+    pub fn getStart(self: *CmarkOrderedListNode) i32 {
+        return @intCast(cmark.cmark_node_get_list_start(@ptrCast(self)));
+    }
+
+    /// Set the number the list starts at.
+    pub fn setStart(self: *CmarkOrderedListNode, start: i32) !void {
+        if (cmark.cmark_node_set_list_start(@ptrCast(self), @intCast(start)) != 1)
+            return error.Failed;
+    }
+
+    /// Return whether the list is tight.
+    pub fn getTight(self: *CmarkOrderedListNode) bool {
+        return cmark.cmark_node_get_list_tight(@ptrCast(self)) != 0;
+    }
+
+    /// Mark the list as tight or loose.
+    pub fn setTight(self: *CmarkOrderedListNode, tight: bool) !void {
+        if (cmark.cmark_node_set_list_tight(@ptrCast(self), @intFromBool(tight)) != 1)
+            return error.Failed;
+    }
+
+    pub usingnamespace CmarkNodeCommon(@This());
+};
+
+/// Wrapper for code block nodes.
+pub const CmarkCodeBlockNode = opaque {
+    /// Return the info string of the code fence, or `error.Failed` when
+    /// cmark returns a null string.
+    pub fn getFenceInfo(self: *CmarkCodeBlockNode) ![:0]const u8 {
+        return spanOrFail(cmark.cmark_node_get_fence_info(@ptrCast(self)));
+    }
+
+    /// Set the info string of the code fence.
+    pub fn setFenceInfo(self: *CmarkCodeBlockNode, info: [:0]const u8) !void {
+        if (cmark.cmark_node_set_fence_info(@ptrCast(self), info.ptr) != 1)
+            return error.Failed;
+    }
+
+    pub usingnamespace CmarkNodeCommon(@This());
+};
+
+/// Streaming Markdown parser that allocates through a Zig allocator.
+pub const Parser = struct {
+    allocator: *const std.mem.Allocator,
+    _cmark_mem: *cmark.cmark_mem,
+    _parser: *cmark.cmark_parser,
+
+    /// Create a parser. `allocator` is stored by pointer and must stay valid
+    /// at that address until `deinit` has run.
+    pub fn new(allocator: *const std.mem.Allocator, options: CmarkOptions) !Parser {
+        const mem = try allocator.create(cmark.cmark_mem);
+        // Do not leak the callback table when cmark cannot create the parser.
+        errdefer allocator.destroy(mem);
+        mem.* = wrapCmarkAllocator(allocator);
+
+        const parser = cmark.cmark_parser_new_with_mem(@bitCast(options), mem) orelse
+            return error.OutOfMemory;
+
+        return .{
+            .allocator = allocator,
+            ._cmark_mem = mem,
+            ._parser = parser,
+        };
+    }
+
+    /// Feed another chunk of Markdown source to the parser.
+    pub fn feed(self: Parser, buffer: []const u8) void {
+        cmark.cmark_parser_feed(self._parser, buffer.ptr, buffer.len);
+    }
+
+    /// Finish parsing and return the root node, or `error.Failed` when cmark
+    /// returns no node.
+    ///
+    /// NOTE(review): the tree is allocated through `_cmark_mem`. If cmark
+    /// keeps that pointer to free nodes later, the tree has to be released
+    /// before `deinit` destroys the table -- confirm against cmark.
+    pub fn finish(self: Parser) !CmarkNode {
+        return CmarkNode.fromCNode(cmark.cmark_parser_finish(self._parser));
+    }
+
+    /// Free the parser and the callback table created by `new`.
+    pub fn deinit(self: Parser) void {
+        cmark.cmark_parser_free(self._parser);
+        self.allocator.destroy(self._cmark_mem);
+    }
+};
+
+// pub fn parse(buffer: []const u8, options: CmarkOptions) !CmarkNode
+// pub fn parseFile(path: []const u8, options: CmarkOptions) !CmarkNode
+
+/// Smoke test: create a parser on the page allocator and tear it down again.
+pub fn main() void {
+    const a = std.heap.page_allocator;
+    const parser = Parser.new(&a, .{}) catch @panic("noop");
+    defer parser.deinit();
+}