// donet_core/parser/lexer.rs

1/*
2    This file is part of Donet.
3
4    Copyright © 2024 Max Rodriguez <[email protected]>
5
6    Donet is free software; you can redistribute it and/or modify
7    it under the terms of the GNU Affero General Public License,
8    as published by the Free Software Foundation, either version 3
9    of the License, or (at your option) any later version.
10
11    Donet is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14    GNU Affero General Public License for more details.
15
16    You should have received a copy of the GNU Affero General Public
17    License along with Donet. If not, see <https://www.gnu.org/licenses/>.
18*/
19
20//! Definition of the Lexer machine to process raw DC file
21//! string data into a stream of lexical tokens for the DC parser.
22
23use crate::globals::{DC_VIEW_SUFFIXES, HISTORICAL_DC_KEYWORDS};
24use plex::lexer;
25
/// A single lexical token produced by the DC file lexer.
///
/// `Whitespace`, `Comment`, and `Newline` are lexer-internal: they satisfy a
/// lexer rule match but are filtered out (or, for `Newline`, counted for line
/// tracking) by the `Lexer` iterator and never reach the parser.
#[rustfmt::skip]
#[derive(Debug, Clone, PartialEq)]
pub enum DCToken {
    Whitespace,       // Not a DC token; Ignores: " " | tab | carriage-return
    Comment,          // Not a DC token; Ignored. Satisfies lexer match.
    Newline,          // Not a DC token; Used by lexer iterator to keep track of line #.
    // Letter   ::= "A" ... "z"
    // DecDigit ::= "0" ... "9"
    // OctDigit ::= "0" ... "7"
    // HexDigit ::= "0" ... "9" | "A" ... "F" | "a" ... "f"
    // BinDigit ::= "0" | "1"

    // Integers
    BooleanLiteral(bool),  // "true" | "false"
    DecimalLiteral(i64),   // ( "1" … "9" ) { DecDigit }
    OctalLiteral(String),  // "0" { OctDigit }
    HexLiteral(String),    // "0" ( "x" | "X" ) HexDigit { HexDigit }
    BinaryLiteral(String), // "0" ( "b" | "B" ) BinDigit { BinDigit }

    // IntegerLiteral ::= DecimalLiteral | OctalLiteral | HexLiteral | BinaryLiteral
    // NumberLiteral  ::= IntegerLiteral | FloatLiteral
    // decimals       ::= DecDigit { DecDigit }

    // Floats
    FloatLiteral(f64), // decimals "." [ decimals ] | "." [ decimals ]

    // Text Literals
    CharacterLiteral(char),
    StringLiteral(String),
    // nonSingleQuote  ::= <any printable character except "'" or newline>
    // nonDoubleQuote  ::= <any printable character except `"` or newline>
    EscapeCharacter(String), // "\" ( <any character> | "x" hexDigit { hexDigit } )

    // Data Types
    CharT,             // "char"
    Int8T,             // "int8"
    Int16T,            // "int16"
    Int32T,            // "int32"
    Int64T,            // "int64"
    UInt8T,            // "uint8"
    BoolT,             // "bool" (unique to donet. alias for uint8)
    UInt16T,           // "uint16"
    UInt32T,           // "uint32"
    UInt64T,           // "uint64"
    Float32T,          // "float32" (used on Astron)
    Float64T,          // "float64"
    Int8ArrayT,        // "int8array"
    Int16ArrayT,       // "int16array"
    Int32ArrayT,       // "int32array"
    UInt8ArrayT,       // "uint8array"
    UInt16ArrayT,      // "uint16array"
    UInt32ArrayT,      // "uint32array"
    UInt32UInt8ArrayT, // "uint32uint8array"
    StringT,           // "string"
    BlobT,             // "blob"
    Blob32T,           // "blob32" (used on Panda)

    // Keywords
    DClass,  // "dclass"
    Struct,  // "struct"
    Keyword, // "keyword"
    Typedef, // "typedef"
    From,    // "from"
    Import,  // "import"
    Switch,  // "switch"
    Case,    // "case"
    Default, // "default"
    Break,   // "break"

    Identifier(String), // ( Letter | "_" ) { Letter | DecDigit | "_" }
    DCKeyword(String),  // ( "ram" | "required" | "db" | "airecv" | "ownrecv" |
                        //   "clrecv" | "broadcast" | "ownsend" | "clsend" )
    ViewSuffix(String), // ( "AI", "OV", "UD" )

    // Operators
    Percent,      // "%"
    Star,         // "*"
    Plus,         // "+"
    Hyphen,       // "-"
    ForwardSlash, // "/"
    Period,       // "."

    // Delimiters
    OpenParenthesis,  // "("
    CloseParenthesis, // ")"
    OpenBraces,       // "{"
    CloseBraces,      // "}"
    OpenBrackets,     // "["
    CloseBrackets,    // "]"
    Comma,            // ","
    Semicolon,        // ";"
    Equals,           // "="
    Colon,            // ":"
}
120
// plex-generated DFA lexer: each rule pairs a regex with the token to emit,
// and `text` is the matched slice of the input.
//
// NOTE(review): rule order matters. plex appears to pick the longest match,
// breaking ties in favor of the earlier rule — so the keyword/type rules
// below beat the general identifier rule only on an exact-length tie (e.g.
// "int8" is Int8T, but "int8foo" lexes as one Identifier). Confirm against
// the plex crate documentation before reordering rules.
lexer! {
    fn next_token(text: 'a) -> (DCToken, &'a str);

    r#"[ \t\r]+"# => (DCToken::Whitespace, text),
    // C++-style comments '// ...'
    r#"//[^\n]*"# => (DCToken::Comment, text),
    // C-style comments '/* ... */'; cannot contain '*/'
    // (`~` is plex's complement operator, not std regex syntax.)
    r#"/[*](~(.*[*]/.*))[*]/"# => (DCToken::Comment, text),
    r#"\n"# => (DCToken::Newline, text),

    r#"true"# => (DCToken::BooleanLiteral(true), text),
    r#"false"# => (DCToken::BooleanLiteral(false), text),

    // "01" is NOT a decimal literal: the octal rule below matches it with a
    // longer lexeme and therefore wins.
    r#"0|([1-9][0-9]*)"# => (DCToken::DecimalLiteral(match text.parse::<i64>() {
        Ok(n) => { n },
        Err(err) => {
            // Only reachable on i64 overflow, since the regex guarantees digits.
            panic!("dclexer: Found DecimalLiteral token, but failed to parse as i64.\n\n{}", err);
        },
    }), text),

    r#"0[0-7]+"# => (DCToken::OctalLiteral(text.to_owned()), text),
    r#"0[xX][0-9a-fA-F]+"# => (DCToken::HexLiteral(text.to_owned()), text),
    r#"0[bB][0-1]+"# => (DCToken::BinaryLiteral(text.to_owned()), text),

    // Matches "1.0", "0.9", and also ".9" (empty integer part).
    // NOTE(review): despite the grammar comment on FloatLiteral, a trailing
    // "1." form is not accepted by this regex — digits after '.' are required.
    r#"([0-9]?)+\.[0-9]+"# => (DCToken::FloatLiteral(match text.parse::<f64>() {
        Ok(f) => { f },
        Err(err) => {
            panic!("dclexer: Found FloatLiteral token, but failed to parse as f64.\n\n{}", err);
        }
    }), text),

    // Rust doesn't support lookahead/lookbehind regex, so for character literals
    // we match the entire ''x'' and extract the second (nth(1)) character.
    r#"'.'"# => (DCToken::CharacterLiteral(text.chars().nth(1).unwrap()), text),
    // Note that there is no need to escape double quotes in rust regex.
    // The surrounding quotes are stripped from the stored string value.
    r#""[^"]*""# => (DCToken::StringLiteral(text.to_owned().replace('\"', "")), text),

    // Signed/unsigned integer data types *could* be a single token,
    // but parsing is easier if they are all individual lexical tokens.
    r#"char"# => (DCToken::CharT, text),
    r#"int8"# => (DCToken::Int8T, text),
    r#"int16"# => (DCToken::Int16T, text),
    r#"int32"# => (DCToken::Int32T, text),
    r#"int64"# => (DCToken::Int64T, text),
    r#"uint8"# => (DCToken::UInt8T, text),
    r#"bool"# => (DCToken::BoolT, text),
    r#"uint16"# => (DCToken::UInt16T, text),
    r#"uint32"# => (DCToken::UInt32T, text),
    r#"uint64"# => (DCToken::UInt64T, text),
    r#"float32"# => (DCToken::Float32T, text),
    r#"float64"# => (DCToken::Float64T, text),
    r#"int8array"# => (DCToken::Int8ArrayT, text),
    r#"int16array"# => (DCToken::Int16ArrayT, text),
    r#"int32array"# => (DCToken::Int32ArrayT, text),
    r#"uint8array"# => (DCToken::UInt8ArrayT, text),
    r#"uint16array"# => (DCToken::UInt16ArrayT, text),
    r#"uint32array"# => (DCToken::UInt32ArrayT, text),
    r#"uint32uint8array"# => (DCToken::UInt32UInt8ArrayT, text),
    r#"string"# => (DCToken::StringT, text),
    r#"blob"# => (DCToken::BlobT, text),
    r#"blob32"# => (DCToken::Blob32T, text),

    r#"dclass"# => (DCToken::DClass, text),
    r#"struct"# => (DCToken::Struct, text),
    r#"keyword"# => (DCToken::Keyword, text),
    r#"from"# => (DCToken::From, text),
    r#"import"# => (DCToken::Import, text),
    r#"typedef"# => (DCToken::Typedef, text),
    r#"switch"# => (DCToken::Switch, text),
    r#"case"# => (DCToken::Case, text),
    r#"default"# => (DCToken::Default, text),
    r#"break"# => (DCToken::Break, text),

    r#"[a-zA-Z_][a-zA-Z0-9_]*"# => {
        // Decide whether this is an identifier, keyword, or view suffix.
        if HISTORICAL_DC_KEYWORDS.contains(&text) {
            (DCToken::DCKeyword(text.to_owned()), text)
        } else if DC_VIEW_SUFFIXES.contains(&text) {
            (DCToken::ViewSuffix(text.to_owned()), text)
        } else {
            (DCToken::Identifier(text.to_owned()), text)
        }
    },
    r#"\\(x[0-9a-fA-F]+|.)"# => (DCToken::EscapeCharacter(text.to_owned()), text),

    r#"%"# => (DCToken::Percent, text),
    r#"\*"# => (DCToken::Star, text),
    r#"\+"# => (DCToken::Plus, text),
    r#"-"# => (DCToken::Hyphen, text),
    r#"/"# => (DCToken::ForwardSlash, text),
    r#"\."# => (DCToken::Period, text),

    r#"\("# => (DCToken::OpenParenthesis, text),
    r#"\)"# => (DCToken::CloseParenthesis, text),
    r#"\{"# => (DCToken::OpenBraces, text),
    r#"\}"# => (DCToken::CloseBraces, text),
    r#"\["# => (DCToken::OpenBrackets, text),
    r#"\]"# => (DCToken::CloseBrackets, text),
    r#"\,"# => (DCToken::Comma, text),
    r#"\;"# => (DCToken::Semicolon, text),
    r#"\="# => (DCToken::Equals, text),
    r#"\:"# => (DCToken::Colon, text),
    // Catch-all: any single character no earlier rule consumed is an error.
    r#"."# => {
        panic!("dclexer: Found an unexpected character: '{}'", text);
    }
}
227
/// Streaming lexer state over a DC file source string.
pub struct Lexer<'a> {
    original: &'a str,  // the full input; kept to compute byte offsets for spans
    remaining: &'a str, // the not-yet-scanned tail of the input
    line: usize,        // current line number, 1-based
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `s`, on line 1.
    pub fn new(s: &'a str) -> Lexer<'a> {
        Self { original: s, remaining: s, line: 1 }
    }
}
243
/// Byte range (`min..max`) of a token within the original source string,
/// together with the 1-based line number it was found on.
#[derive(Debug, PartialEq, Clone, Copy)]
pub struct Span {
    pub min: usize,
    pub max: usize,
    pub line: usize,
}

impl std::fmt::Display for Span {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Each number is delegated to its own `fmt` call so any formatter
        // flags the caller supplied are forwarded to the integer fields.
        f.write_str("--- SPAN ---\n")?;
        f.write_str("line: ")?;
        self.line.fmt(f)?;
        f.write_str(", min: ")?;
        self.min.fmt(f)?;
        f.write_str(", max: ")?;
        self.max.fmt(f)
    }
}

/// Computes the span of slice `s` inside its parent string `t`, on line `l`.
fn span_in(s: &str, t: &str, l: usize) -> Span {
    // `s` is expected to be a subslice of `t`, so its byte offset is the
    // distance between the two string pointers.
    let start = s.as_ptr() as usize - t.as_ptr() as usize;
    Span {
        min: start,
        max: start + s.len(),
        line: l,
    }
}
271
272impl Iterator for Lexer<'_> {
273    type Item = (DCToken, Span);
274    fn next(&mut self) -> Option<(DCToken, Span)> {
275        loop {
276            let tok: (DCToken, &str) = if let Some((tok, new_remaining)) = next_token(self.remaining) {
277                self.remaining = new_remaining;
278                tok
279            } else {
280                return None;
281            };
282            match tok {
283                (DCToken::Whitespace, _) | (DCToken::Comment, _) => {
284                    // These tokens are ignored by the lexer.
285                    continue;
286                }
287                (DCToken::Newline, _) => {
288                    self.line += 1;
289                    continue;
290                }
291                (tok, span) => {
292                    return Some((tok, span_in(span, self.original, self.line)));
293                }
294            }
295        }
296    }
297}
298
#[cfg(test)]
mod tests {
    use super::{DCToken, Lexer};

    // Utility for unit testing lexer. Gives the test_string to the lexer
    // and compares the lexer results with the target_tokens vector given.
    fn lexer_test_for_target(test_string: &str, target_tokens: Vec<DCToken>) {
        let lexer = Lexer::new(test_string).inspect(|tok| eprintln!("token: {:?}", tok));
        let mut received: usize = 0;

        for (i, (token, _span)) in lexer.enumerate() {
            // Bounds-check before indexing so excess tokens fail with a clear
            // message. This also fixes the `target_tokens.len() - 1` usize
            // underflow the previous check hit when `target_tokens` was empty.
            assert!(
                i < target_tokens.len(),
                "Lexer returned more tokens than expected!"
            );
            assert_eq!(token, target_tokens[i]);
            received += 1;
        }
        // Every expected token must actually have been produced.
        assert_eq!(
            received,
            target_tokens.len(),
            "Did not receive all the expected tokens!"
        );
    }

    #[test]
    fn ignored_tokens_test() {
        // Covers Whitespace, Comment (C and C++ style), and Newline
        let test_string: String = String::from(
            "// Single line comment\n\
            /* multiline comment*/\n\
            \n    \n",
        );
        let lexer = Lexer::new(&test_string).inspect(|tok| eprintln!("token: {:?}", tok));

        for (_token, _span) in lexer {
            panic!("No tokens should have been returned by the lexer!");
        }
    }

    #[test]
    fn keyword_definition_test() {
        let target: Vec<DCToken> = vec![
            DCToken::Keyword,
            DCToken::Identifier(String::from("test")),
            DCToken::Semicolon,
            // Just need to cover these two tokens for coverage.
            DCToken::DClass,
            DCToken::Struct,
        ];
        lexer_test_for_target("keyword test;\n dclass struct", target);
    }

    #[test]
    fn switch_tokens_test() {
        let target: Vec<DCToken> = vec![DCToken::Switch, DCToken::Case, DCToken::Default, DCToken::Break];
        lexer_test_for_target("switch case default break", target);
    }

    #[test]
    fn dclass_import_statement() {
        // We will not be making use of import statements, as it is
        // a client thing (both client and AI do this), but we still want
        // their DC files to pass our lexer / parser without issues.
        let target: Vec<DCToken> = vec![
            DCToken::From,
            DCToken::Identifier(String::from("views")),
            DCToken::Period,
            DCToken::Identifier(String::from("Donut")),
            DCToken::Import,
            DCToken::Identifier(String::from("DistributedDonut")),
            DCToken::ForwardSlash,
            DCToken::ViewSuffix(String::from("AI")),
            DCToken::ForwardSlash,
            DCToken::ViewSuffix(String::from("OV")),
        ];
        lexer_test_for_target("from views.Donut import DistributedDonut/AI/OV", target);
    }

    #[test]
    fn number_literals() {
        let target: Vec<DCToken> = vec![
            // Decimal Literals
            DCToken::DecimalLiteral(1),
            DCToken::DecimalLiteral(9),
            DCToken::DecimalLiteral(10),
            DCToken::DecimalLiteral(2010),
            // Octal Literals
            DCToken::OctalLiteral(String::from("01")),
            DCToken::OctalLiteral(String::from("07")),
            DCToken::OctalLiteral(String::from("07472")),
            // Hex Literals
            DCToken::HexLiteral(String::from("0xa")),
            DCToken::HexLiteral(String::from("0xA")),
            DCToken::HexLiteral(String::from("0Xa")),
            DCToken::HexLiteral(String::from("0XA")),
            DCToken::HexLiteral(String::from("0x123456789abcdef")),
            // Binary Literals
            DCToken::BinaryLiteral(String::from("0b1")),
            DCToken::BinaryLiteral(String::from("0B1")),
            DCToken::BinaryLiteral(String::from("0b0")),
            DCToken::BinaryLiteral(String::from("0b010")),
            DCToken::BinaryLiteral(String::from("0b101110")),
            // Float Literal
            DCToken::FloatLiteral(0.0),
            DCToken::FloatLiteral(9.0),
            DCToken::FloatLiteral(0.0),
            DCToken::FloatLiteral(0.9),
            DCToken::FloatLiteral(1.23456789),
            // Boolean Literal
            DCToken::BooleanLiteral(true),
            DCToken::BooleanLiteral(false),
        ];
        lexer_test_for_target(
            "
            1 9 10 2010
            01 07 07472
            0xa 0xA 0Xa 0XA 0x123456789abcdef
            0b1 0B1 0b0 0b010 0b101110
            0.0 9.0 .0 .9 1.23456789
            true false
            ",
            target,
        );
    }

    #[test]
    fn text_literals() {
        let target: Vec<DCToken> = vec![
            // Character Literals
            DCToken::CharacterLiteral('a'),
            DCToken::CharacterLiteral('1'),
            DCToken::CharacterLiteral('*'),
            // String Literals
            DCToken::StringLiteral(String::from("x")),
            DCToken::StringLiteral(String::from("foo")),
            DCToken::StringLiteral(String::from("*")),
            // Escape Characters
            DCToken::EscapeCharacter(String::from("\\n")),
            DCToken::EscapeCharacter(String::from("\\t")),
            DCToken::EscapeCharacter(String::from("\\xa19")),
        ];
        lexer_test_for_target(
            "'a' '1' '*' \
            \"x\" \"foo\" \"*\" \
            \\n \\t \\xa19",
            target,
        );
    }

    #[test]
    fn data_types() {
        #[rustfmt::skip]
        let target: Vec<DCToken> = vec![
            DCToken::CharT,
            DCToken::BoolT,
            // Signed / Unsigned Integers
            DCToken::Int8T, DCToken::Int16T, DCToken::Int32T, DCToken::Int64T,
            DCToken::UInt8T, DCToken::UInt16T, DCToken::UInt32T, DCToken::UInt64T,
            // Array Data Types
            DCToken::Int8ArrayT, DCToken::Int16ArrayT, DCToken::Int32ArrayT,
            DCToken::UInt8ArrayT, DCToken::UInt16ArrayT, DCToken::UInt32ArrayT,
            DCToken::UInt32UInt8ArrayT,
            // Floating Point
            DCToken::Float32T,
            DCToken::Float64T,
            // Sized Types (string / blob)
            DCToken::StringT,
            DCToken::BlobT,
            DCToken::Blob32T,
        ];
        lexer_test_for_target(
            "char bool \
            int8 int16 int32 int64 \
            uint8 uint16 uint32 uint64 \
            int8array int16array int32array \
            uint8array uint16array uint32array uint32uint8array \
            float32 float64 string blob blob32",
            target,
        );
    }

    #[test]
    fn operators_and_delimiters() {
        let target: Vec<DCToken> = vec![
            // Operators
            DCToken::Percent,
            DCToken::Star,
            DCToken::Plus,
            DCToken::Hyphen,
            DCToken::ForwardSlash,
            // Delimiters
            DCToken::OpenParenthesis,
            DCToken::CloseParenthesis,
            DCToken::OpenBraces,
            DCToken::CloseBraces,
            DCToken::OpenBrackets,
            DCToken::CloseBrackets,
            DCToken::Comma,
            DCToken::Colon,
        ];
        lexer_test_for_target("%*+-/(){}[],:", target);
    }

    #[test]
    fn dc_keywords_tokens() {
        let target: Vec<DCToken> = vec![
            DCToken::DCKeyword("ram".to_string()),
            DCToken::DCKeyword("required".to_string()),
            DCToken::DCKeyword("db".to_string()),
            DCToken::DCKeyword("airecv".to_string()),
            DCToken::DCKeyword("ownrecv".to_string()),
            DCToken::DCKeyword("clrecv".to_string()),
            DCToken::DCKeyword("broadcast".to_string()),
            DCToken::DCKeyword("ownsend".to_string()),
            DCToken::DCKeyword("clsend".to_string()),
        ];
        lexer_test_for_target(
            "ram required db airecv ownrecv \
            clrecv broadcast ownsend clsend",
            target,
        );
    }

    #[test]
    #[should_panic]
    fn unexpected_token_test() {
        // A lone trailing '\' matches no rule but the catch-all, which panics.
        let test_string: String = String::from("uint8 invalid_token = \\");
        let lexer = Lexer::new(&test_string).inspect(|tok| eprintln!("token: {:?}", tok));

        for (_, (_token, _span)) in lexer.enumerate() {
            // iterate through lexer tokens until we get a panic
        }
    }

    #[test]
    fn register_newline() {
        let test_string: String = String::from("keyword\nkeyword\nkeyword");
        let lexer = Lexer::new(&test_string).inspect(|tok| eprintln!("token: {:?}", tok));

        for (i, (_, span)) in lexer.enumerate() {
            // We use one token every line, so our line # should match our index.
            if span.line != i + 1 {
                panic!("Lexer failed to register a new line!");
            }
        }
    }
}