diff --git a/src/browser/css/Tokenizer.zig b/src/browser/css/Tokenizer.zig
new file mode 100644
index 000000000..4a6b141cc
--- /dev/null
+++ b/src/browser/css/Tokenizer.zig
@@ -0,0 +1,820 @@
+// Copyright (C) 2023-2025 Lightpanda (Selecy SAS)
+//
+// Francis Bouvier
+// Pierre Tachoire
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+//! This file implements the tokenization step defined in the CSS Syntax Module Level 3 specification.
+//!
+//! The algorithm accepts a valid UTF-8 string and returns a stream of tokens.
+//! The tokenization step never fails, even for complete gibberish.
+//! Validity must then be checked by the parser.
+//!
+//! NOTE: The tokenizer is not thread-safe, does not own any memory, and does not
+//! validate that the input is well-formed UTF-8.
+//!
+//! See the spec for more info: https://drafts.csswg.org/css-syntax/#tokenization
+
+const std = @import("std");
+const builtin = @import("builtin");
+const assert = std.debug.assert;
+
+const Tokenizer = @This();
+
+pub const Token = union(enum) {
+    /// An `<ident-token>`
+    ident: []const u8,
+
+    /// A `<function-token>`
+    ///
+    /// The value (name) does not include the `(` marker.
+    function: []const u8,
+
+    /// An `<at-keyword-token>`
+    ///
+    /// The value does not include the `@` marker.
+    at_keyword: []const u8,
+
+    /// A `<hash-token>` with the type flag set to "id"
+    ///
+    /// The value does not include the `#` marker.
+    id_hash: []const u8, // Hash that is a valid ID selector.
+
+    /// A `<hash-token>` with the type flag set to "unrestricted"
+    ///
+    /// The value does not include the `#` marker.
+    unrestricted_hash: []const u8,
+
+    /// A `<string-token>`
+    ///
+    /// The value does not include the quotes.
+    string: []const u8,
+
+    /// A `<bad-string-token>`
+    ///
+    /// This token always indicates a parse error.
+    bad_string: []const u8,
+
+    /// A `<url-token>`
+    ///
+    /// The value does not include the `url(` `)` markers. Note that `url( <string-token> )`
+    /// is represented by a `function` token.
+    url: []const u8,
+
+    /// A `<bad-url-token>`
+    ///
+    /// This token always indicates a parse error.
+    bad_url: []const u8,
+
+    /// A `<delim-token>`
+    delim: u8,
+
+    /// A `<number-token>`
+    number: struct {
+        /// Whether the number had a `+` or `-` sign.
+        ///
+        /// This is used in some cases, like the `an+b` micro syntax. (See the `parse_nth` function.)
+        has_sign: bool,
+
+        /// If the original source did not include a fractional part, the value as an integer.
+        int_value: ?i32,
+
+        /// The value as a float.
+        value: f32,
+    },
+
+    /// A `<percentage-token>`
+    percentage: struct {
+        /// Whether the number had a `+` or `-` sign.
+        has_sign: bool,
+
+        /// If the original source did not include a fractional part, the value as an integer.
+        /// It is **not** divided by 100.
+        int_value: ?i32,
+
+        /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
+        unit_value: f32,
+    },
+
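+    // Illustration (informal, not from the spec text): with the numeric variants
+    // above and below, "+3.5" is expected to tokenize as a number with
+    // has_sign = true and int_value = null, "50%" as a percentage with
+    // unit_value 0.5, and "12px" as a dimension with value 12 and unit "px".
+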
+    /// A `<dimension-token>`
+    dimension: struct {
+        /// Whether the number had a `+` or `-` sign.
+        ///
+        /// This is used in some cases, like the `an+b` micro syntax. (See the `parse_nth` function.)
+        has_sign: bool,
+
+        /// If the original source did not include a fractional part, the value as an integer.
+        int_value: ?i32,
+
+        /// The value as a float.
+        value: f32,
+
+        /// The unit, e.g. "px" in `12px`
+        unit: []const u8,
+    },
+
+    /// A `<unicode-range-token>`
+    unicode_range: struct { bgn: u32, end: u32 },
+
+    /// A `<whitespace-token>`
+    white_space: []const u8,
+
+    /// A `<CDC-token>` (`-->`)
+    cdc,
+
+    /// A `:` `<colon-token>`
+    colon, // :
+
+    /// A `;` `<semicolon-token>`
+    semicolon, // ;
+
+    /// A `,` `<comma-token>`
+    comma, // ,
+
+    /// A `<[-token>`
+    square_bracket_block,
+
+    /// A `<]-token>`
+    ///
+    /// When obtained from one of the `Parser::next*` methods,
+    /// this token is always unmatched and indicates a parse error.
+    close_square_bracket,
+
+    /// A `<(-token>`
+    parenthesis_block,
+
+    /// A `<)-token>`
+    ///
+    /// When obtained from one of the `Parser::next*` methods,
+    /// this token is always unmatched and indicates a parse error.
+    close_parenthesis,
+
+    /// A `<{-token>`
+    curly_bracket_block,
+
+    /// A `<}-token>`
+    ///
+    /// When obtained from one of the `Parser::next*` methods,
+    /// this token is always unmatched and indicates a parse error.
+    close_curly_bracket,
+
+    /// A comment.
+    ///
+    /// The CSS Syntax spec does not generate tokens for comments,
+    /// but we do for simplicity of the interface.
+    ///
+    /// The value does not include the `/*` `*/` markers.
+    comment: []const u8,
+};
+
+input: []const u8,
+
+/// Byte offset into `input` (counted in bytes, not code points). Starts at 0.
+position: usize = 0,
+
+// True if `byteAt(n)` is in bounds, i.e. the current byte exists and at least
+// `n` more bytes follow it.
+fn hasAtLeast(self: *const Tokenizer, n: usize) bool {
+    return self.position + n < self.input.len;
+}
+
+fn isEof(self: *const Tokenizer) bool {
+    return !self.hasAtLeast(0);
+}
+
+fn byteAt(self: *const Tokenizer, offset: usize) u8 {
+    return self.input[self.position + offset];
+}
+
+// Assumes non-EOF.
+fn nextByteUnchecked(self: *const Tokenizer) u8 {
+    return self.byteAt(0);
+}
+
+fn nextByte(self: *const Tokenizer) ?u8 {
+    return if (self.isEof())
+        null
+    else
+        self.input[self.position];
+}
+
+fn startsWith(self: *const Tokenizer, needle: []const u8) bool {
+    return std.mem.startsWith(u8, self.input[self.position..], needle);
+}
+
+fn slice(self: *const Tokenizer, start: usize, end: usize) []const u8 {
+    return self.input[start..end];
+}
+
+fn sliceFrom(self: *const Tokenizer, start_pos: usize) []const u8 {
+    return self.slice(start_pos, self.position);
+}
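+
+// A small smoke test, meant as a usage sketch rather than full coverage: it drives
+// the public `next` API over a tiny rule and checks the resulting token stream
+// against the variants defined above.
+test "tokenize a simple rule" {
+    var tok = Tokenizer{ .input = "a{color:red}" };
+    try std.testing.expectEqualStrings("a", tok.next().?.ident);
+    try std.testing.expect(tok.next().? == .curly_bracket_block);
+    try std.testing.expectEqualStrings("color", tok.next().?.ident);
+    try std.testing.expect(tok.next().? == .colon);
+    try std.testing.expectEqualStrings("red", tok.next().?.ident);
+    try std.testing.expect(tok.next().? == .close_curly_bracket);
+    try std.testing.expect(tok.next() == null);
+}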
+
+// Advance over n bytes in the input. This function can advance
+// over ASCII bytes (excluding newlines), or UTF-8 sequence
+// leaders (excluding leaders for 4-byte sequences).
+fn advance(self: *Tokenizer, n: usize) void {
+    if (builtin.mode == .Debug) {
+        // Each byte must either be an ASCII byte or a sequence leader,
+        // but not a 4-byte leader; also newlines are rejected.
+        for (0..n) |i| {
+            const b = self.byteAt(i);
+            assert(b != '\r' and b != '\n' and b != '\x0C');
+            assert(b <= 0x7F or (b & 0xF0 != 0xF0 and b & 0xC0 != 0x80));
+        }
+    }
+    self.position += n;
+}
+
+fn hasNewlineAt(self: *const Tokenizer, offset: usize) bool {
+    if (!self.hasAtLeast(offset)) return false;
+
+    return switch (self.byteAt(offset)) {
+        '\n', '\r', '\x0C' => true,
+        else => false,
+    };
+}
+
+fn hasNonAsciiAt(self: *const Tokenizer, offset: usize) bool {
+    if (!self.hasAtLeast(offset)) return false;
+
+    const byte = self.byteAt(offset);
+    const len_utf8 = std.unicode.utf8ByteSequenceLength(byte) catch return false;
+
+    if (!self.hasAtLeast(offset + len_utf8 - 1)) return false;
+
+    const start = self.position + offset;
+    const bytes = self.slice(start, start + len_utf8);
+
+    const codepoint = std.unicode.utf8Decode(bytes) catch return false;
+
+    // https://drafts.csswg.org/css-syntax/#non-ascii-ident-code-point
+    return switch (codepoint) {
+        '\u{00B7}', '\u{200C}', '\u{200D}', '\u{203F}', '\u{2040}' => true,
+        '\u{00C0}'...'\u{00D6}' => true,
+        '\u{00D8}'...'\u{00F6}' => true,
+        '\u{00F8}'...'\u{037D}' => true,
+        '\u{037F}'...'\u{1FFF}' => true,
+        '\u{2070}'...'\u{218F}' => true,
+        '\u{2C00}'...'\u{2FEF}' => true,
+        '\u{3001}'...'\u{D7FF}' => true,
+        '\u{F900}'...'\u{FDCF}' => true,
+        '\u{FDF0}'...'\u{FFFD}' => true,
+        else => codepoint >= '\u{10000}',
+    };
+}
+
+fn isIdentStart(self: *Tokenizer) bool {
+    if (self.isEof()) return false;
+
+    var b = self.nextByteUnchecked();
+    if (b == '-') {
+        // A leading '-' only starts an identifier if the byte after it could
+        // (e.g. "-foo" or "--x"), so look one byte ahead.
+        if (!self.hasAtLeast(1)) return false;
+        b = self.byteAt(1);
+    }
+
+    return switch (b) {
+        'a'...'z', 'A'...'Z', '_', '-', 0x0 => true,
+        '\\' => !self.hasNewlineAt(1),
+        else => b > 0x7F, // non-ASCII
+    };
+}
+
+fn consumeChar(self: *Tokenizer) void {
+    const byte = self.nextByteUnchecked();
+    const len_utf8 = std.unicode.utf8ByteSequenceLength(byte) catch 1;
+    self.position += len_utf8;
+}
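+
+// A quick sketch of the ident-start rules encoded above: a leading '-' starts an
+// identifier only when the byte after it could, while a digit never does.
+test "isIdentStart" {
+    var a = Tokenizer{ .input = "-foo" };
+    try std.testing.expect(a.isIdentStart());
+
+    var b = Tokenizer{ .input = "--main" };
+    try std.testing.expect(b.isIdentStart());
+
+    var c = Tokenizer{ .input = "42px" };
+    try std.testing.expect(!c.isIdentStart());
+}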
+
+// Given that a newline has been seen, advance over the newline
+// and update the state.
+fn consumeNewline(self: *Tokenizer) void {
+    const byte = self.nextByteUnchecked();
+    assert(byte == '\r' or byte == '\n' or byte == '\x0C');
+
+    self.position += 1;
+    if (byte == '\r' and self.nextByte() == '\n') {
+        self.position += 1;
+    }
+}
+
+fn consumeWhiteSpace(self: *Tokenizer, newline: bool) Token {
+    const start_position = self.position;
+    if (newline) {
+        self.consumeNewline();
+    } else {
+        self.advance(1);
+    }
+    while (!self.isEof()) {
+        const b = self.nextByteUnchecked();
+        switch (b) {
+            ' ', '\t' => {
+                self.advance(1);
+            },
+            '\n', '\x0C', '\r' => {
+                self.consumeNewline();
+            },
+            else => break,
+        }
+    }
+    return .{ .white_space = self.sliceFrom(start_position) };
+}
+
+fn consumeComment(self: *Tokenizer) []const u8 {
+    self.advance(2); // consume "/*"
+    const start_position = self.position;
+    while (!self.isEof()) {
+        switch (self.nextByteUnchecked()) {
+            '*' => {
+                const end_position = self.position;
+                self.advance(1);
+                if (self.nextByte() == '/') {
+                    self.advance(1);
+                    return self.slice(start_position, end_position);
+                }
+            },
+            '\n', '\x0C', '\r' => {
+                self.consumeNewline();
+            },
+            0x0 => self.advance(1),
+            else => self.consumeChar(),
+        }
+    }
+    return self.sliceFrom(start_position);
+}
+
+fn byteToHexDigit(b: u8) ?u32 {
+    return switch (b) {
+        '0'...'9' => b - '0',
+        'a'...'f' => b - 'a' + 10,
+        'A'...'F' => b - 'A' + 10,
+        else => null,
+    };
+}
+
+fn byteToDecimalDigit(b: u8) ?u32 {
+    return if (std.ascii.isDigit(b)) b - '0' else null;
+}
+
+// Consume up to 6 hex digits. The decoded value is discarded: escape sequences
+// are left in place in the returned slices and must be decoded by the parser.
+fn consumeHexDigits(self: *Tokenizer) void {
+    var value: u32 = 0;
+    var digits: u32 = 0;
+
+    while (digits < 6 and !self.isEof()) {
+        if (byteToHexDigit(self.nextByteUnchecked())) |digit| {
+            value = value * 16 + digit;
+            digits += 1;
+            self.advance(1);
+        } else {
+            break;
+        }
+    }
+
+    _ = value; // Intentionally unused, see the comment above.
+}
+
+// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
+// and that the next input character has already been verified
+// to not be a newline.
+fn consumeEscape(self: *Tokenizer) void {
+    if (self.isEof())
+        return; // Escaped EOF
+
+    switch (self.nextByteUnchecked()) {
+        '0'...'9', 'A'...'F', 'a'...'f' => {
+            self.consumeHexDigits();
+
+            if (!self.isEof()) {
+                switch (self.nextByteUnchecked()) {
+                    ' ', '\t' => {
+                        self.advance(1);
+                    },
+                    '\n', '\x0C', '\r' => {
+                        self.consumeNewline();
+                    },
+                    else => {},
+                }
+            }
+        },
+        else => self.consumeChar(),
+    }
+}
+
+/// https://drafts.csswg.org/css-syntax/#consume-string-token
+fn consumeString(self: *Tokenizer, single_quote: bool) Token {
+    self.advance(1); // Skip the initial quote
+
+    // start_pos is at a code point boundary, after " or '
+    const start_pos = self.position;
+
+    while (!self.isEof()) {
+        switch (self.nextByteUnchecked()) {
+            '"' => {
+                if (!single_quote) {
+                    const value = self.sliceFrom(start_pos);
+                    self.advance(1);
+                    return .{ .string = value };
+                }
+                self.advance(1);
+            },
+            '\'' => {
+                if (single_quote) {
+                    const value = self.sliceFrom(start_pos);
+                    self.advance(1);
+                    return .{ .string = value };
+                }
+                self.advance(1);
+            },
+            '\n', '\r', '\x0C' => {
+                return .{ .bad_string = self.sliceFrom(start_pos) };
+            },
+            '\\' => {
+                self.advance(1);
+                if (self.isEof())
+                    continue; // escaped EOF, do nothing.
+
+                switch (self.nextByteUnchecked()) {
+                    // Escaped newline
+                    '\n', '\x0C', '\r' => self.consumeNewline(),
+
+                    // The spec calls for replacing escape sequences with the
+                    // characters they encode, but that would require allocating a
+                    // new string. We therefore leave the escape as-is and let the
+                    // parser handle unescaping.
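+                    // For example, for the input "a\"b" (a double quote escaped
+                    // inside a double-quoted string) the returned token's slice is
+                    // the four bytes a\"b, backslash included; a later unescaping
+                    // pass would be needed to obtain a"b.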
+                    else => self.consumeEscape(),
+                }
+            },
+            else => self.consumeChar(),
+        }
+    }
+
+    return .{ .string = self.sliceFrom(start_pos) };
+}
+
+fn consumeName(self: *Tokenizer) []const u8 {
+    // start_pos is the end of the previous token, therefore at a code point boundary
+    const start_pos = self.position;
+
+    while (!self.isEof()) {
+        switch (self.nextByteUnchecked()) {
+            'a'...'z', 'A'...'Z', '0'...'9', '_', '-' => self.advance(1),
+            '\\' => {
+                if (self.hasNewlineAt(1)) {
+                    break;
+                }
+
+                self.advance(1);
+                self.consumeEscape();
+            },
+            0x0 => self.advance(1),
+            '\x80'...'\xBF', '\xC0'...'\xEF', '\xF0'...'\xFF' => {
+                // This byte *is* part of a multi-byte code point: we end up copying
+                // the whole code point before this loop does something else.
+                self.advance(1);
+            },
+            else => {
+                if (self.hasNonAsciiAt(0)) {
+                    self.consumeChar();
+                } else {
+                    break; // ASCII
+                }
+            },
+        }
+    }
+
+    return self.sliceFrom(start_pos);
+}
+
+fn consumeMark(self: *Tokenizer) Token {
+    const byte = self.nextByteUnchecked();
+    self.advance(1);
+    return switch (byte) {
+        ',' => .comma,
+        ':' => .colon,
+        ';' => .semicolon,
+        '(' => .parenthesis_block,
+        ')' => .close_parenthesis,
+        '{' => .curly_bracket_block,
+        '}' => .close_curly_bracket,
+        '[' => .square_bracket_block,
+        ']' => .close_square_bracket,
+        else => unreachable,
+    };
+}
+
+fn consumeNumeric(self: *Tokenizer) Token {
+    // Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
+    // This is only called when there is at least one digit in \d*(\.\d+)?.
+
+    // Do all the math in f64 so that large numbers overflow to +/-inf
+    // and the i32 min/max bounds are within range.
+
+    var sign: f64 = 1.0;
+    var has_sign = false;
+    switch (self.nextByteUnchecked()) {
+        '+' => {
+            has_sign = true;
+        },
+        '-' => {
+            has_sign = true;
+            sign = -1.0;
+        },
+        else => {},
+    }
+    if (has_sign) {
+        self.advance(1);
+    }
+
+    var is_integer = true;
+    var integral_part: f64 = 0.0;
+    var fractional_part: f64 = 0.0;
+
+    while (!self.isEof()) {
+        if (byteToDecimalDigit(self.nextByteUnchecked())) |digit| {
+            integral_part = integral_part * 10.0 + @as(f64, @floatFromInt(digit));
+            self.advance(1);
+        } else {
+            break;
+        }
+    }
+
+    if (self.hasAtLeast(1) and self.nextByteUnchecked() == '.' and std.ascii.isDigit(self.byteAt(1))) {
+        is_integer = false;
+        self.advance(1); // Consume '.'
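+
+        // For example, with "1.25" the integral_part is 1.0 at this point and the
+        // loop below accumulates 0.25 (2 * 0.1 + 5 * 0.01) into fractional_part.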
+        var factor: f64 = 0.1;
+        while (!self.isEof()) {
+            if (byteToDecimalDigit(self.nextByteUnchecked())) |digit| {
+                fractional_part += @as(f64, @floatFromInt(digit)) * factor;
+                factor *= 0.1;
+                self.advance(1);
+            } else {
+                break;
+            }
+        }
+    }
+
+    var value = sign * (integral_part + fractional_part);
+
+    blk: {
+        const e = self.nextByte() orelse break :blk;
+        if (e != 'e' and e != 'E') break :blk;
+
+        var mul: f64 = 1.0;
+
+        if (self.hasAtLeast(2) and (self.byteAt(1) == '+' or self.byteAt(1) == '-') and std.ascii.isDigit(self.byteAt(2))) {
+            mul = switch (self.byteAt(1)) {
+                '-' => -1.0,
+                '+' => 1.0,
+                else => unreachable,
+            };
+
+            self.advance(2);
+        } else if (self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) {
+            self.advance(1);
+        } else {
+            break :blk;
+        }
+
+        is_integer = false;
+
+        var exponent: f64 = 0.0;
+        while (!self.isEof()) {
+            if (byteToDecimalDigit(self.nextByteUnchecked())) |digit| {
+                exponent = exponent * 10.0 + @as(f64, @floatFromInt(digit));
+                self.advance(1);
+            } else {
+                break;
+            }
+        }
+        value *= std.math.pow(f64, 10.0, mul * exponent);
+    }
+
+    const int_value: ?i32 = if (is_integer) blk: {
+        if (value >= std.math.maxInt(i32)) {
+            break :blk std.math.maxInt(i32);
+        }
+
+        if (value <= std.math.minInt(i32)) {
+            break :blk std.math.minInt(i32);
+        }
+
+        break :blk @as(i32, @intFromFloat(value));
+    } else null;
+
+    if (!self.isEof() and self.nextByteUnchecked() == '%') {
+        self.advance(1);
+
+        return .{ .percentage = .{
+            .has_sign = has_sign,
+            .int_value = int_value,
+            .unit_value = @as(f32, @floatCast(value / 100.0)),
+        } };
+    }
+
+    if (self.isIdentStart()) {
+        return .{ .dimension = .{
+            .has_sign = has_sign,
+            .int_value = int_value,
+            .value = @as(f32, @floatCast(value)),
+            .unit = self.consumeName(),
+        } };
+    }
+
+    return .{ .number = .{
+        .has_sign = has_sign,
+        .int_value = int_value,
+        .value = @as(f32, @floatCast(value)),
+    } };
+}
+
+fn consumeUnquotedUrl(self: *Tokenizer) ?Token {
+    _ = self;
+    @panic("unimplemented");
+}
+
+fn consumeIdentLike(self: *Tokenizer) Token {
+    const value = self.consumeName();
+
+    if (!self.isEof() and self.nextByteUnchecked() == '(') {
+        self.advance(1);
+
+        if (std.ascii.eqlIgnoreCase(value, "url")) {
+            if (self.consumeUnquotedUrl()) |result| {
+                return result;
+            }
+        }
+
+        return .{ .function = value };
+    }
+
+    return .{ .ident = value };
+}
+
+pub fn next(self: *Tokenizer) ?Token {
+    if (self.isEof()) {
+        return null;
+    }
+
+    const b = self.nextByteUnchecked();
+    return switch (b) {
+        // Consume comments
+        '/' => {
+            if (self.startsWith("/*")) {
+                return .{ .comment = self.consumeComment() };
+            } else {
+                self.advance(1);
+                return .{ .delim = '/' };
+            }
+        },
+
+        // Consume marks
+        '(', ')', '{', '}', '[', ']', ',', ':', ';' => {
+            return self.consumeMark();
+        },
+
+        // Consume as much whitespace as possible. Return a <whitespace-token>.
+        ' ', '\t' => self.consumeWhiteSpace(false),
+        '\n', '\x0C', '\r' => self.consumeWhiteSpace(true),
+
+        // Consume a string token and return it.
+        '"' => self.consumeString(false),
+        '\'' => self.consumeString(true),
+
+        '0'...'9' => self.consumeNumeric(),
+        'a'...'z', 'A'...'Z', '_', 0x0 => self.consumeIdentLike(),
+
+        '+' => {
+            if ((self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) or
+                (self.hasAtLeast(2) and self.byteAt(1) == '.' and std.ascii.isDigit(self.byteAt(2))))
+            {
+                return self.consumeNumeric();
+            }
+            self.advance(1);
+            return .{ .delim = '+' };
+        },
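+
+        // The '-' prong below mirrors the '+' lookahead above: "-5" and "-.5" start
+        // numbers, "-->" becomes a CDC token, "-foo" (an ident start) becomes an
+        // ident-like token, and anything else is a plain '-' delimiter.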
+        '-' => {
+            if ((self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) or
+                (self.hasAtLeast(2) and self.byteAt(1) == '.' and std.ascii.isDigit(self.byteAt(2))))
+            {
+                return self.consumeNumeric();
+            }
+
+            if (self.startsWith("-->")) {
+                self.advance(3);
+                return .cdc;
+            }
+
+            if (self.isIdentStart()) {
+                return self.consumeIdentLike();
+            }
+
+            self.advance(1);
+            return .{ .delim = '-' };
+        },
+        '.' => {
+            if (self.hasAtLeast(1) and std.ascii.isDigit(self.byteAt(1))) {
+                return self.consumeNumeric();
+            }
+            self.advance(1);
+            return .{ .delim = '.' };
+        },
+
+        // Consume hash token
+        '#' => {
+            self.advance(1);
+            if (self.isIdentStart()) {
+                return .{ .id_hash = self.consumeName() };
+            }
+            if (self.nextByte()) |it| {
+                switch (it) {
+                    // Any other valid case here already resulted in an id_hash.
+                    '0'...'9', '-' => return .{ .unrestricted_hash = self.consumeName() },
+                    else => {},
+                }
+            }
+            return .{ .delim = '#' };
+        },
+
+        // Consume at-rules
+        '@' => {
+            self.advance(1);
+            return if (self.isIdentStart())
+                .{ .at_keyword = self.consumeName() }
+            else
+                .{ .delim = '@' };
+        },
+
+        '<' => {
+            if (self.startsWith("