donet_core/parser/
lexer.rs

1/*
2    This file is part of Donet.
3
4    Copyright © 2024-2025 Max Rodriguez <[email protected]>
5
6    Donet is free software; you can redistribute it and/or modify
7    it under the terms of the GNU Affero General Public License,
8    as published by the Free Software Foundation, either version 3
9    of the License, or (at your option) any later version.
10
11    Donet is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14    GNU Affero General Public License for more details.
15
16    You should have received a copy of the GNU Affero General Public
17    License along with Donet. If not, see <https://www.gnu.org/licenses/>.
18*/
19
20//! Definition of the Lexer machine to process raw DC file
21//! string data into a stream of lexical tokens for the DC parser.
22
23use crate::globals::{DC_VIEW_SUFFIXES, HISTORICAL_DC_KEYWORDS};
24use plex::lexer;
25
/// Lexical tokens produced by the DC file lexer.
///
/// The first three variants are bookkeeping tokens: the lexer iterator
/// consumes them itself and never hands them to the parser.
///
/// Grammar notes next to each variant use an EBNF-like notation:
/// `{ x }` means zero or more repetitions and `[ x ]` means optional.
#[rustfmt::skip]
#[derive(Debug, Clone, PartialEq)]
pub enum DCToken {
    /// Not a DC token; a run of spaces, tabs, or carriage returns.
    Whitespace,
    /// Not a DC token; a `// ...` or `/* ... */` comment. Satisfies the lexer match.
    Comment,
    /// Not a DC token; used by the lexer iterator to keep track of the line number.
    Newline,

    // Letter   ::= "A" ... "Z" | "a" ... "z"
    // DecDigit ::= "0" ... "9"
    // OctDigit ::= "0" ... "7"
    // HexDigit ::= "0" ... "9" | "A" ... "F" | "a" ... "f"
    // BinDigit ::= "0" | "1"

    // ---------- Integers ---------- //

    /// `"0" | ( "1" … "9" ) { DecDigit }`, already parsed into an `i64`.
    DecimalLiteral(i64),
    /// `"0" OctDigit { OctDigit }`, kept as the matched text.
    OctalLiteral(String),
    /// `"0" ( "x" | "X" ) HexDigit { HexDigit }`, kept as the matched text.
    HexLiteral(String),
    /// `"0" ( "b" | "B" ) BinDigit { BinDigit }`, kept as the matched text.
    BinaryLiteral(String),

    // IntegerLiteral ::= DecimalLiteral | OctalLiteral | HexLiteral | BinaryLiteral
    // NumberLiteral  ::= IntegerLiteral | FloatLiteral
    // decimals       ::= DecDigit { DecDigit }

    // ---------- Floats ---------- //

    /// `[ decimals ] "." decimals`, already parsed into an `f64`.
    FloatLiteral(f64),

    // ---------- Text Literals ---------- //

    /// A single character enclosed in single quotes; holds the character itself.
    CharacterLiteral(char),
    /// Text enclosed in double quotes; holds the text without the quotes.
    StringLiteral(String),
    // nonSingleQuote  ::= <any printable character except "'" or newline>
    // nonDoubleQuote  ::= <any printable character except `"` or newline>
    /// `"\" ( <any character> | "x" hexDigit { hexDigit } )`, kept as the matched text.
    EscapeCharacter(String),

    // ---------- Data Types ---------- //

    /// `"char"`
    CharT,
    /// `"int8"`
    Int8T,
    /// `"int16"`
    Int16T,
    /// `"int32"`
    Int32T,
    /// `"int64"`
    Int64T,
    /// `"uint8"`
    UInt8T,
    /// `"uint16"`
    UInt16T,
    /// `"uint32"`
    UInt32T,
    /// `"uint64"`
    UInt64T,
    /// `"float32"`
    Float32T,
    /// `"float64"`
    Float64T,
    /// `"string"`
    StringT,
    /// `"blob"`
    BlobT,

    // ---------- Keywords ---------- //

    /// `"dclass"`
    DClass,
    /// `"struct"`
    Struct,
    /// `"keyword"`
    Keyword,
    /// `"typedef"`
    Typedef,
    /// `"from"`
    From,
    /// `"import"`
    Import,
    /// `"switch"`
    Switch,
    /// `"case"`
    Case,
    /// `"default"`
    Default,
    /// `"break"`
    Break,

    /// `( Letter | "_" ) { Letter | DecDigit | "_" }`
    Identifier(String),
    /// `( "ram" | "required" | "db" | "airecv" | "ownrecv" |
    ///    "clrecv" | "broadcast" | "ownsend" | "clsend" )`
    DCKeyword(String),
    /// `( "AI" | "OV" | "UD" )`
    ViewSuffix(String),

    // ---------- Operators ---------- //

    /// `"%"`
    Percent,
    /// `"*"`
    Star,
    /// `"+"`
    Plus,
    /// `"-"`
    Hyphen,
    /// `"/"`
    ForwardSlash,
    /// `"."`
    Period,

    // ---------- Delimiters ---------- //

    /// `"("`
    OpenParenthesis,
    /// `")"`
    CloseParenthesis,
    /// `"{"`
    OpenBraces,
    /// `"}"`
    CloseBraces,
    /// `"["`
    OpenBrackets,
    /// `"]"`
    CloseBrackets,
    /// `","`
    Comma,
    /// `";"`
    Semicolon,
    /// `"="`
    Equals,
    /// `":"`
    Colon,
}
110
111lexer! {
112    fn next_token(text: 'a) -> (DCToken, &'a str);
113
114    r#"[ \t\r]+"# => (DCToken::Whitespace, text),
115    // C++-style comments '// ...'
116    r#"//[^\n]*"# => (DCToken::Comment, text),
117    // C-style comments '/* ... */'; cannot contain '*/'
118    r#"/[*](~(.*[*]/.*))[*]/"# => (DCToken::Comment, text),
119    r#"\n"# => (DCToken::Newline, text),
120
121    r#"0|([1-9][0-9]*)"# => (DCToken::DecimalLiteral(match text.parse::<i64>() {
122        Ok(n) => { n },
123        Err(err) => {
124            panic!("dclexer: Found DecimalLiteral token, but failed to parse as i64.\n\n{}", err);
125        },
126    }), text),
127
128    r#"0[0-7]+"# => (DCToken::OctalLiteral(text.to_owned()), text),
129    r#"0[xX][0-9a-fA-F]+"# => (DCToken::HexLiteral(text.to_owned()), text),
130    r#"0[bB][0-1]+"# => (DCToken::BinaryLiteral(text.to_owned()), text),
131
132    r#"([0-9]?)+\.[0-9]+"# => (DCToken::FloatLiteral(match text.parse::<f64>() {
133        Ok(f) => { f },
134        Err(err) => {
135            panic!("dclexer: Found FloatLiteral token, but failed to parse as f64.\n\n{}", err);
136        }
137    }), text),
138
139    // Rust doesn't support lookahead/lookbehind regex, so for character literals
140    // we match the entire ''x'' and extract the second (nth(1)) character.
141    r#"'.'"# => (DCToken::CharacterLiteral(text.chars().nth(1).unwrap()), text),
142    // Note that there is no need to escape double quotes in rust regex.
143    r#""[^"]*""# => (DCToken::StringLiteral(text.to_owned().replace('\"', "")), text),
144
145    // Signed/unsigned integer data types *could* be a single token,
146    // but parsing is easier if they are all individual lexical tokens.
147    r#"char"# => (DCToken::CharT, text),
148    r#"int8"# => (DCToken::Int8T, text),
149    r#"int16"# => (DCToken::Int16T, text),
150    r#"int32"# => (DCToken::Int32T, text),
151    r#"int64"# => (DCToken::Int64T, text),
152    r#"uint8"# => (DCToken::UInt8T, text),
153    r#"uint16"# => (DCToken::UInt16T, text),
154    r#"uint32"# => (DCToken::UInt32T, text),
155    r#"uint64"# => (DCToken::UInt64T, text),
156    r#"float32"# => (DCToken::Float32T, text),
157    r#"float64"# => (DCToken::Float64T, text),
158    r#"string"# => (DCToken::StringT, text),
159    r#"blob"# => (DCToken::BlobT, text),
160
161    r#"dclass"# => (DCToken::DClass, text),
162    r#"struct"# => (DCToken::Struct, text),
163    r#"keyword"# => (DCToken::Keyword, text),
164    r#"from"# => (DCToken::From, text),
165    r#"import"# => (DCToken::Import, text),
166    r#"typedef"# => (DCToken::Typedef, text),
167    r#"switch"# => (DCToken::Switch, text),
168    r#"case"# => (DCToken::Case, text),
169    r#"default"# => (DCToken::Default, text),
170    r#"break"# => (DCToken::Break, text),
171
172    r#"[a-zA-Z_][a-zA-Z0-9_]*"# => {
173        // Decide whether this is an identifier, keyword, or view suffix.
174        if HISTORICAL_DC_KEYWORDS.contains(&text) {
175            (DCToken::DCKeyword(text.to_owned()), text)
176        } else if DC_VIEW_SUFFIXES.contains(&text) {
177            (DCToken::ViewSuffix(text.to_owned()), text)
178        } else {
179            (DCToken::Identifier(text.to_owned()), text)
180        }
181    },
182    r#"\\(x[0-9a-fA-F]+|.)"# => (DCToken::EscapeCharacter(text.to_owned()), text),
183
184    r#"%"# => (DCToken::Percent, text),
185    r#"\*"# => (DCToken::Star, text),
186    r#"\+"# => (DCToken::Plus, text),
187    r#"-"# => (DCToken::Hyphen, text),
188    r#"/"# => (DCToken::ForwardSlash, text),
189    r#"\."# => (DCToken::Period, text),
190
191    r#"\("# => (DCToken::OpenParenthesis, text),
192    r#"\)"# => (DCToken::CloseParenthesis, text),
193    r#"\{"# => (DCToken::OpenBraces, text),
194    r#"\}"# => (DCToken::CloseBraces, text),
195    r#"\["# => (DCToken::OpenBrackets, text),
196    r#"\]"# => (DCToken::CloseBrackets, text),
197    r#"\,"# => (DCToken::Comma, text),
198    r#"\;"# => (DCToken::Semicolon, text),
199    r#"\="# => (DCToken::Equals, text),
200    r#"\:"# => (DCToken::Colon, text),
201    r#"."# => {
202        panic!("dclexer: Found an unexpected character: '{}'", text);
203    }
204}
205
/// Lexer state for a single DC file, driven through its `Iterator` impl.
pub struct Lexer<'a> {
    /// The complete input text; token spans are measured relative to its start.
    original: &'a str,
    /// The part of the input that has not been tokenized yet.
    remaining: &'a str,
    /// Line number attached to emitted spans; the first line is line 1.
    line: usize,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the very beginning of `s`, on line 1.
    pub fn new(s: &'a str) -> Self {
        Self {
            line: 1,
            original: s,
            remaining: s,
        }
    }
}
221
/// Location of a token inside the text given to the lexer.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Span {
    /// Byte offset of the token's first byte from the start of the input.
    pub min: usize,
    /// Byte offset just past the token's last byte (`min` plus the token length).
    pub max: usize,
    /// Line number the lexer had recorded when the token was emitted.
    pub line: usize,
}
228
229impl std::fmt::Display for Span {
230    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
231        writeln!(f, "--- SPAN ---")?;
232        write!(f, "line: ")?;
233        self.line.fmt(f)?;
234        write!(f, ", min: ")?;
235        self.min.fmt(f)?;
236        write!(f, ", max: ")?;
237        self.max.fmt(f)
238    }
239}
240
241fn span_in(s: &str, t: &str, l: usize) -> Span {
242    let min = s.as_ptr() as usize - t.as_ptr() as usize;
243    Span {
244        min,
245        max: min + s.len(),
246        line: l,
247    }
248}
249
250impl Iterator for Lexer<'_> {
251    type Item = (DCToken, Span);
252    fn next(&mut self) -> Option<(DCToken, Span)> {
253        loop {
254            let tok: (DCToken, &str) = if let Some((tok, new_remaining)) = next_token(self.remaining) {
255                self.remaining = new_remaining;
256                tok
257            } else {
258                return None;
259            };
260            match tok {
261                (DCToken::Whitespace, _) | (DCToken::Comment, _) => {
262                    // These tokens are ignored by the lexer.
263                    continue;
264                }
265                (DCToken::Newline, _) => {
266                    self.line += 1;
267                    continue;
268                }
269                (tok, span) => {
270                    return Some((tok, span_in(span, self.original, self.line)));
271                }
272            }
273        }
274    }
275}
276
#[cfg(test)]
mod tests {
    use super::{DCToken, Lexer};

    /// Utility for unit testing the lexer. Feeds `test_string` to the lexer
    /// and checks that the tokens it yields are exactly `target_tokens`, in
    /// the same order. Spans are ignored.
    ///
    /// Comparing the collected token list as a whole covers missing tokens,
    /// surplus tokens and wrong tokens in one assertion, and also works for
    /// an empty `target_tokens`.
    fn lexer_test_for_target(test_string: &str, target_tokens: Vec<DCToken>) {
        let tokens: Vec<DCToken> = Lexer::new(test_string)
            .inspect(|tok| eprintln!("token: {:?}", tok))
            .map(|(token, _span)| token)
            .collect();

        assert_eq!(
            tokens, target_tokens,
            "Lexer output does not match the expected tokens!"
        );
    }

    #[test]
    fn ignored_tokens_test() {
        // Covers Whitespace, Comment (C and C++ style), and Newline
        let test_string = "// Single line comment\n\
            /* multiline comment*/\n\
            \n    \n";
        let mut lexer = Lexer::new(test_string).inspect(|tok| eprintln!("token: {:?}", tok));

        assert!(
            lexer.next().is_none(),
            "No tokens should have been returned by the lexer!"
        );
    }

    #[test]
    fn keyword_definition_test() {
        let target: Vec<DCToken> = vec![
            DCToken::Keyword,
            DCToken::Identifier(String::from("test")),
            DCToken::Semicolon,
            // Just need to cover these two tokens for coverage.
            DCToken::DClass,
            DCToken::Struct,
        ];
        lexer_test_for_target("keyword test;\n dclass struct", target);
    }

    #[test]
    fn switch_tokens_test() {
        let target: Vec<DCToken> = vec![DCToken::Switch, DCToken::Case, DCToken::Default, DCToken::Break];
        lexer_test_for_target("switch case default break", target);
    }

    #[test]
    fn dclass_import_statement() {
        // We will not be making use of import statements, as it is
        // a client thing (both client and AI do this), but we still want
        // their DC files to pass our lexer / parser without issues.
        let target: Vec<DCToken> = vec![
            DCToken::From,
            DCToken::Identifier(String::from("views")),
            DCToken::Period,
            DCToken::Identifier(String::from("Donut")),
            DCToken::Import,
            DCToken::Identifier(String::from("DistributedDonut")),
            DCToken::ForwardSlash,
            DCToken::ViewSuffix(String::from("AI")),
            DCToken::ForwardSlash,
            DCToken::ViewSuffix(String::from("OV")),
        ];
        lexer_test_for_target("from views.Donut import DistributedDonut/AI/OV", target);
    }

    #[test]
    fn number_literals() {
        let target: Vec<DCToken> = vec![
            // Decimal Literals
            DCToken::DecimalLiteral(1),
            DCToken::DecimalLiteral(9),
            DCToken::DecimalLiteral(10),
            DCToken::DecimalLiteral(2010),
            // Octal Literals
            DCToken::OctalLiteral(String::from("01")),
            DCToken::OctalLiteral(String::from("07")),
            DCToken::OctalLiteral(String::from("07472")),
            // Hex Literals
            DCToken::HexLiteral(String::from("0xa")),
            DCToken::HexLiteral(String::from("0xA")),
            DCToken::HexLiteral(String::from("0Xa")),
            DCToken::HexLiteral(String::from("0XA")),
            DCToken::HexLiteral(String::from("0x123456789abcdef")),
            // Binary Literals
            DCToken::BinaryLiteral(String::from("0b1")),
            DCToken::BinaryLiteral(String::from("0B1")),
            DCToken::BinaryLiteral(String::from("0b0")),
            DCToken::BinaryLiteral(String::from("0b010")),
            DCToken::BinaryLiteral(String::from("0b101110")),
            // Float Literal
            DCToken::FloatLiteral(0.0),
            DCToken::FloatLiteral(9.0),
            DCToken::FloatLiteral(0.0),
            DCToken::FloatLiteral(0.9),
            DCToken::FloatLiteral(1.23456789),
        ];
        lexer_test_for_target(
            "
            1 9 10 2010
            01 07 07472
            0xa 0xA 0Xa 0XA 0x123456789abcdef
            0b1 0B1 0b0 0b010 0b101110
            0.0 9.0 .0 .9 1.23456789
            ",
            target,
        );
    }

    #[test]
    fn text_literals() {
        let target: Vec<DCToken> = vec![
            // Character Literals
            DCToken::CharacterLiteral('a'),
            DCToken::CharacterLiteral('1'),
            DCToken::CharacterLiteral('*'),
            // String Literals
            DCToken::StringLiteral(String::from("x")),
            DCToken::StringLiteral(String::from("foo")),
            DCToken::StringLiteral(String::from("*")),
            // Escape Characters
            DCToken::EscapeCharacter(String::from("\\n")),
            DCToken::EscapeCharacter(String::from("\\t")),
            DCToken::EscapeCharacter(String::from("\\xa19")),
        ];
        lexer_test_for_target(
            "'a' '1' '*' \
            \"x\" \"foo\" \"*\" \
            \\n \\t \\xa19",
            target,
        );
    }

    #[test]
    fn data_types() {
        #[rustfmt::skip]
        let target: Vec<DCToken> = vec![
            DCToken::CharT,
            // Signed / Unsigned Integers
            DCToken::Int8T, DCToken::Int16T, DCToken::Int32T, DCToken::Int64T,
            DCToken::UInt8T, DCToken::UInt16T, DCToken::UInt32T, DCToken::UInt64T,
            // Floating Point
            DCToken::Float32T,
            DCToken::Float64T,
            // Sized Types (string / blob)
            DCToken::StringT,
            DCToken::BlobT,
        ];
        lexer_test_for_target(
            "char \
            int8 int16 int32 int64 \
            uint8 uint16 uint32 uint64 \
            float32 float64 string blob",
            target,
        );
    }

    #[test]
    fn operators_and_delimiters() {
        let target: Vec<DCToken> = vec![
            // Operators
            DCToken::Percent,
            DCToken::Star,
            DCToken::Plus,
            DCToken::Hyphen,
            DCToken::ForwardSlash,
            // Delimiters
            DCToken::OpenParenthesis,
            DCToken::CloseParenthesis,
            DCToken::OpenBraces,
            DCToken::CloseBraces,
            DCToken::OpenBrackets,
            DCToken::CloseBrackets,
            DCToken::Comma,
            DCToken::Colon,
        ];
        lexer_test_for_target("%*+-/(){}[],:", target);
    }

    #[test]
    fn dc_keywords_tokens() {
        let target: Vec<DCToken> = vec![
            DCToken::DCKeyword("ram".to_string()),
            DCToken::DCKeyword("required".to_string()),
            DCToken::DCKeyword("db".to_string()),
            DCToken::DCKeyword("airecv".to_string()),
            DCToken::DCKeyword("ownrecv".to_string()),
            DCToken::DCKeyword("clrecv".to_string()),
            DCToken::DCKeyword("broadcast".to_string()),
            DCToken::DCKeyword("ownsend".to_string()),
            DCToken::DCKeyword("clsend".to_string()),
        ];
        lexer_test_for_target(
            "ram required db airecv ownrecv \
            clrecv broadcast ownsend clsend",
            target,
        );
    }

    #[test]
    #[should_panic(expected = "Found an unexpected character")]
    fn unexpected_token_test() {
        // The trailing lone backslash is not a valid escape sequence, so the
        // lexer's catch-all rule is expected to panic once it is reached.
        let lexer =
            Lexer::new("uint8 invalid_token = \\").inspect(|tok| eprintln!("token: {:?}", tok));

        // Drain the lexer until we get a panic.
        lexer.for_each(drop);
    }

    #[test]
    fn register_newline() {
        // We use one token every line, so the reported line numbers must be
        // exactly 1, 2, 3. Comparing the whole list also catches a lexer that
        // yields too few tokens.
        let lines: Vec<usize> = Lexer::new("keyword\nkeyword\nkeyword")
            .inspect(|tok| eprintln!("token: {:?}", tok))
            .map(|(_, span)| span.line)
            .collect();

        assert_eq!(lines, vec![1, 2, 3], "Lexer failed to register a new line!");
    }
}
509        }
510    }
511}