vercel/next.js / source_pos.rs

source_pos.rs144 lines4.6 KB
use bincode::{Decode, Encode};
use serde::Serialize;
use turbo_tasks::{NonLocalValue, TaskInput, trace::TraceRawVcs};
use turbo_tasks_hash::DeterministicHash;

/// Byte value of LINE FEED (`\n`), one of the basic JS line terminators.
const U8_LF: u8 = b'\n';
/// Byte value of CARRIAGE RETURN (`\r`), one of the basic JS line terminators.
const U8_CR: u8 = b'\r';

/// A line/column position inside a piece of source text.
///
/// Both coordinates are 0-indexed. The derived `PartialOrd`/`Ord` compare
/// fields in declaration order, so positions are ordered by `line` first and
/// by `column` second.
#[derive(
    Default,
    Debug,
    PartialEq,
    Eq,
    Copy,
    Clone,
    Hash,
    PartialOrd,
    Ord,
    TaskInput,
    TraceRawVcs,
    Serialize,
    DeterministicHash,
    NonLocalValue,
    Encode,
    Decode,
)]
pub struct SourcePos {
    /// The line, 0-indexed.
    pub line: u32,
    /// The column, 0-indexed.
    ///
    /// NOTE(review): this was documented as a byte index, but
    /// `SourcePos::update` advances it once per UTF-8 encoded character
    /// (regardless of how many bytes the character occupies) — confirm which
    /// unit callers that set this field directly are expected to use.
    pub column: u32,
}

impl SourcePos {
    pub fn new(start_line: u32) -> Self {
        Self {
            line: start_line,
            column: 0,
        }
    }

    pub fn max() -> Self {
        Self {
            line: u32::MAX,
            column: u32::MAX,
        }
    }

    /// Increments the line/column position to account for new source code.
    /// Line terminators are the classic "\n", "\r", "\r\n" (which counts as
    /// a single terminator), and JSON LINE/PARAGRAPH SEPARATORs.
    ///
    /// See <https://tc39.es/ecma262/multipage/ecmascript-language-lexical-grammar.html#sec-line-terminators>
    pub fn update(&mut self, code: &[u8]) {
        // JS source text is interpreted as UCS-2, which is basically UTF-16 with less
        // restrictions. We cannot iterate UTF-8 bytes here, 2-byte UTF-8 octets
        // should count as a 1 char and not 2.
        let &mut SourcePos {
            mut line,
            mut column,
        } = self;

        let mut i = 0;
        while i < code.len() {
            // This is not a UTF-8 validator, but it's likely close enough. It's assumed
            // that the input is valid (and if it isn't than what are you doing trying to
            // embed it into source code anyways?). The important part is that we process in
            // order, and use the first octet's bit pattern to decode the octet length of
            // the char.
            match code[i] {
                U8_LF => {
                    i += 1;
                    line += 1;
                    column = 0;
                }
                U8_CR => {
                    // Count "\r\n" as a single terminator.
                    if code.get(i + 1) == Some(&U8_LF) {
                        i += 2;
                    } else {
                        i += 1;
                    }
                    line += 1;
                    column = 0;
                }

                // 1 octet chars do not have the high bit set. If it's not a LF or CR, then it's
                // just a regular ASCII.
                b if b & 0b10000000 == 0 => {
                    i += 1;
                    column += 1;
                }

                // 2 octet chars have a leading `110` bit pattern. None are considered line
                // terminators.
                b if b & 0b11100000 == 0b11000000 => {
                    // eat this byte and the next.
                    i += 2;
                    column += 1;
                }

                // 3 octet chars have a leading `1110` bit pattern. Both the LINE/PARAGRAPH
                // SEPARATOR exist in 3 octets.
                b if b & 0b11110000 == 0b11100000 => {
                    // The LINE and PARAGRAPH have the bits `11100010 10000000 1010100X`, with the X
                    // denoting either line or paragraph.
                    let mut separator = false;
                    if b == 0b11100010 && code.get(i + 1) == Some(&0b10000000) {
                        let last = code.get(i + 2).cloned().unwrap_or_default();
                        separator = (last & 0b11111110) == 0b10101000
                    }

                    // eat this byte and the next 2.
                    i += 3;
                    if separator {
                        line += 1;
                        column = 0;
                    } else {
                        column += 1;
                    }
                }

                // 4 octet chars have a leading `11110` pattern, but we don't need to check because
                // none of the other patterns matched.
                _ => {
                    // eat this byte and the next 3.
                    i += 4;
                    column += 1;
                }
            }
        }
        self.line = line;
        self.column = column;
    }
}

impl std::cmp::PartialEq<(u32, u32)> for SourcePos {
    fn eq(&self, other: &(u32, u32)) -> bool {
        &(self.line, self.column) == other
    }
}