1use crate::globals::{DC_VIEW_SUFFIXES, HISTORICAL_DC_KEYWORDS};
24use plex::lexer;
25
/// Every kind of token the DC lexer can emit.
///
/// Variants carrying a payload hold either the parsed value (decimal, float,
/// character) or the matched source text as an owned `String`.
#[rustfmt::skip]
#[derive(Debug, Clone, PartialEq)]
pub enum DCToken {
    // Trivia: matched by the lexer rules but skipped by the `Lexer` iterator.
    Whitespace, Comment, Newline,

    // Numeric literals. Only decimal and float literals are parsed into
    // numbers; octal, hex and binary literals keep their source text.
    DecimalLiteral(i64),
    OctalLiteral(String),
    HexLiteral(String),
    BinaryLiteral(String),
    FloatLiteral(f64),

    // Text literals.
    CharacterLiteral(char),
    StringLiteral(String),
    EscapeCharacter(String),

    // Data type keywords.
    CharT,
    Int8T, Int16T, Int32T, Int64T,
    UInt8T, UInt16T, UInt32T, UInt64T,
    Float32T, Float64T,
    StringT, BlobT,

    // Language keywords.
    DClass, Struct, Keyword, Typedef,
    From, Import,
    Switch, Case, Default, Break,

    // Identifier-shaped tokens: plain identifiers, identifiers listed in
    // `HISTORICAL_DC_KEYWORDS`, and identifiers listed in `DC_VIEW_SUFFIXES`.
    Identifier(String),
    DCKeyword(String),
    ViewSuffix(String),

    // Operators.
    Percent, Star, Plus, Hyphen, ForwardSlash, Period,

    // Delimiters.
    OpenParenthesis, CloseParenthesis,
    OpenBraces, CloseBraces,
    OpenBrackets, CloseBrackets,
    Comma, Semicolon, Equals, Colon,
}
110
111lexer! {
112 fn next_token(text: 'a) -> (DCToken, &'a str);
113
114 r#"[ \t\r]+"# => (DCToken::Whitespace, text),
115 r#"//[^\n]*"# => (DCToken::Comment, text),
117 r#"/[*](~(.*[*]/.*))[*]/"# => (DCToken::Comment, text),
119 r#"\n"# => (DCToken::Newline, text),
120
121 r#"0|([1-9][0-9]*)"# => (DCToken::DecimalLiteral(match text.parse::<i64>() {
122 Ok(n) => { n },
123 Err(err) => {
124 panic!("dclexer: Found DecimalLiteral token, but failed to parse as i64.\n\n{}", err);
125 },
126 }), text),
127
128 r#"0[0-7]+"# => (DCToken::OctalLiteral(text.to_owned()), text),
129 r#"0[xX][0-9a-fA-F]+"# => (DCToken::HexLiteral(text.to_owned()), text),
130 r#"0[bB][0-1]+"# => (DCToken::BinaryLiteral(text.to_owned()), text),
131
132 r#"([0-9]?)+\.[0-9]+"# => (DCToken::FloatLiteral(match text.parse::<f64>() {
133 Ok(f) => { f },
134 Err(err) => {
135 panic!("dclexer: Found FloatLiteral token, but failed to parse as f64.\n\n{}", err);
136 }
137 }), text),
138
139 r#"'.'"# => (DCToken::CharacterLiteral(text.chars().nth(1).unwrap()), text),
142 r#""[^"]*""# => (DCToken::StringLiteral(text.to_owned().replace('\"', "")), text),
144
145 r#"char"# => (DCToken::CharT, text),
148 r#"int8"# => (DCToken::Int8T, text),
149 r#"int16"# => (DCToken::Int16T, text),
150 r#"int32"# => (DCToken::Int32T, text),
151 r#"int64"# => (DCToken::Int64T, text),
152 r#"uint8"# => (DCToken::UInt8T, text),
153 r#"uint16"# => (DCToken::UInt16T, text),
154 r#"uint32"# => (DCToken::UInt32T, text),
155 r#"uint64"# => (DCToken::UInt64T, text),
156 r#"float32"# => (DCToken::Float32T, text),
157 r#"float64"# => (DCToken::Float64T, text),
158 r#"string"# => (DCToken::StringT, text),
159 r#"blob"# => (DCToken::BlobT, text),
160
161 r#"dclass"# => (DCToken::DClass, text),
162 r#"struct"# => (DCToken::Struct, text),
163 r#"keyword"# => (DCToken::Keyword, text),
164 r#"from"# => (DCToken::From, text),
165 r#"import"# => (DCToken::Import, text),
166 r#"typedef"# => (DCToken::Typedef, text),
167 r#"switch"# => (DCToken::Switch, text),
168 r#"case"# => (DCToken::Case, text),
169 r#"default"# => (DCToken::Default, text),
170 r#"break"# => (DCToken::Break, text),
171
172 r#"[a-zA-Z_][a-zA-Z0-9_]*"# => {
173 if HISTORICAL_DC_KEYWORDS.contains(&text) {
175 (DCToken::DCKeyword(text.to_owned()), text)
176 } else if DC_VIEW_SUFFIXES.contains(&text) {
177 (DCToken::ViewSuffix(text.to_owned()), text)
178 } else {
179 (DCToken::Identifier(text.to_owned()), text)
180 }
181 },
182 r#"\\(x[0-9a-fA-F]+|.)"# => (DCToken::EscapeCharacter(text.to_owned()), text),
183
184 r#"%"# => (DCToken::Percent, text),
185 r#"\*"# => (DCToken::Star, text),
186 r#"\+"# => (DCToken::Plus, text),
187 r#"-"# => (DCToken::Hyphen, text),
188 r#"/"# => (DCToken::ForwardSlash, text),
189 r#"\."# => (DCToken::Period, text),
190
191 r#"\("# => (DCToken::OpenParenthesis, text),
192 r#"\)"# => (DCToken::CloseParenthesis, text),
193 r#"\{"# => (DCToken::OpenBraces, text),
194 r#"\}"# => (DCToken::CloseBraces, text),
195 r#"\["# => (DCToken::OpenBrackets, text),
196 r#"\]"# => (DCToken::CloseBrackets, text),
197 r#"\,"# => (DCToken::Comma, text),
198 r#"\;"# => (DCToken::Semicolon, text),
199 r#"\="# => (DCToken::Equals, text),
200 r#"\:"# => (DCToken::Colon, text),
201 r#"."# => {
202 panic!("dclexer: Found an unexpected character: '{}'", text);
203 }
204}
205
/// Tokenizer state over a borrowed input string.
pub struct Lexer<'a> {
    /// The complete input handed to [`Lexer::new`].
    original: &'a str,
    /// The part of the input that has not been consumed yet.
    remaining: &'a str,
    /// Current line number; starts at 1.
    line: usize,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `s`, on line 1.
    pub fn new(s: &'a str) -> Self {
        Self {
            line: 1,
            remaining: s,
            original: s,
        }
    }
}
221
/// Location of a token: a `min..max` offset range plus a line number.
#[derive(Debug, PartialEq, Clone, Copy)]
pub struct Span {
    /// Offset of the token's first byte.
    pub min: usize,
    /// Offset one past the token's last byte.
    pub max: usize,
    /// Line number recorded for the token.
    pub line: usize,
}

impl std::fmt::Display for Span {
    /// Writes a `--- SPAN ---` header line followed by
    /// `line: <line>, min: <min>, max: <max>` (no trailing newline).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "--- SPAN ---")?;
        write!(f, "line: ")?;
        // Call `Display::fmt` by its full path: method-call syntax
        // (`self.line.fmt(f)`) only resolves when exactly one formatting
        // trait is in scope, which this impl cannot guarantee. Passing `f`
        // through keeps the caller's width/fill flags applied to each number.
        std::fmt::Display::fmt(&self.line, f)?;
        write!(f, ", min: ")?;
        std::fmt::Display::fmt(&self.min, f)?;
        write!(f, ", max: ")?;
        std::fmt::Display::fmt(&self.max, f)
    }
}
240
241fn span_in(s: &str, t: &str, l: usize) -> Span {
242 let min = s.as_ptr() as usize - t.as_ptr() as usize;
243 Span {
244 min,
245 max: min + s.len(),
246 line: l,
247 }
248}
249
250impl Iterator for Lexer<'_> {
251 type Item = (DCToken, Span);
252 fn next(&mut self) -> Option<(DCToken, Span)> {
253 loop {
254 let tok: (DCToken, &str) = if let Some((tok, new_remaining)) = next_token(self.remaining) {
255 self.remaining = new_remaining;
256 tok
257 } else {
258 return None;
259 };
260 match tok {
261 (DCToken::Whitespace, _) | (DCToken::Comment, _) => {
262 continue;
264 }
265 (DCToken::Newline, _) => {
266 self.line += 1;
267 continue;
268 }
269 (tok, span) => {
270 return Some((tok, span_in(span, self.original, self.line)));
271 }
272 }
273 }
274 }
275}
276
#[cfg(test)]
mod tests {
    use super::{DCToken, Lexer};

    /// Lexes `test_string` and asserts that the produced tokens are exactly
    /// `target_tokens`, in order. Spans are ignored.
    fn lexer_test_for_target(test_string: &str, target_tokens: Vec<DCToken>) {
        let tokens: Vec<DCToken> = Lexer::new(test_string)
            .inspect(|tok| eprintln!("token: {:?}", tok))
            .map(|(token, _span)| token)
            .collect();

        // Compare the common prefix first so a mismatch reports the offending
        // token, then check the lengths. Works for an empty target as well.
        for (token, expected) in tokens.iter().zip(&target_tokens) {
            assert_eq!(token, expected);
        }
        assert!(
            tokens.len() <= target_tokens.len(),
            "Lexer returned more tokens than expected!"
        );
        assert!(
            tokens.len() >= target_tokens.len(),
            "Did not receive all the expected tokens!"
        );
    }

    /// Comments, whitespace and newlines must never reach the caller.
    #[test]
    fn ignored_tokens_test() {
        let test_string: String = String::from(
            "// Single line comment\n\
            /* multiline comment*/\n\
            \n    \n",
        );
        let mut lexer = Lexer::new(&test_string).inspect(|tok| eprintln!("token: {:?}", tok));

        assert!(
            lexer.next().is_none(),
            "No tokens should have been returned by the lexer!"
        );
    }

    #[test]
    fn keyword_definition_test() {
        let target: Vec<DCToken> = vec![
            DCToken::Keyword,
            DCToken::Identifier(String::from("test")),
            DCToken::Semicolon,
            DCToken::DClass,
            DCToken::Struct,
        ];
        lexer_test_for_target("keyword test;\n dclass struct", target);
    }

    #[test]
    fn switch_tokens_test() {
        let target: Vec<DCToken> = vec![DCToken::Switch, DCToken::Case, DCToken::Default, DCToken::Break];
        lexer_test_for_target("switch case default break", target);
    }

    #[test]
    fn dclass_import_statement() {
        let target: Vec<DCToken> = vec![
            DCToken::From,
            DCToken::Identifier(String::from("views")),
            DCToken::Period,
            DCToken::Identifier(String::from("Donut")),
            DCToken::Import,
            DCToken::Identifier(String::from("DistributedDonut")),
            DCToken::ForwardSlash,
            DCToken::ViewSuffix(String::from("AI")),
            DCToken::ForwardSlash,
            DCToken::ViewSuffix(String::from("OV")),
        ];
        lexer_test_for_target("from views.Donut import DistributedDonut/AI/OV", target);
    }

    #[test]
    fn number_literals() {
        let target: Vec<DCToken> = vec![
            // Decimal
            DCToken::DecimalLiteral(1),
            DCToken::DecimalLiteral(9),
            DCToken::DecimalLiteral(10),
            DCToken::DecimalLiteral(2010),
            // Octal
            DCToken::OctalLiteral(String::from("01")),
            DCToken::OctalLiteral(String::from("07")),
            DCToken::OctalLiteral(String::from("07472")),
            // Hexadecimal
            DCToken::HexLiteral(String::from("0xa")),
            DCToken::HexLiteral(String::from("0xA")),
            DCToken::HexLiteral(String::from("0Xa")),
            DCToken::HexLiteral(String::from("0XA")),
            DCToken::HexLiteral(String::from("0x123456789abcdef")),
            // Binary
            DCToken::BinaryLiteral(String::from("0b1")),
            DCToken::BinaryLiteral(String::from("0B1")),
            DCToken::BinaryLiteral(String::from("0b0")),
            DCToken::BinaryLiteral(String::from("0b010")),
            DCToken::BinaryLiteral(String::from("0b101110")),
            // Float
            DCToken::FloatLiteral(0.0),
            DCToken::FloatLiteral(9.0),
            DCToken::FloatLiteral(0.0),
            DCToken::FloatLiteral(0.9),
            DCToken::FloatLiteral(1.23456789),
        ];
        lexer_test_for_target(
            "
            1 9 10 2010
            01 07 07472
            0xa 0xA 0Xa 0XA 0x123456789abcdef
            0b1 0B1 0b0 0b010 0b101110
            0.0 9.0 .0 .9 1.23456789
            ",
            target,
        );
    }

    #[test]
    fn text_literals() {
        let target: Vec<DCToken> = vec![
            // Characters
            DCToken::CharacterLiteral('a'),
            DCToken::CharacterLiteral('1'),
            DCToken::CharacterLiteral('*'),
            // Strings
            DCToken::StringLiteral(String::from("x")),
            DCToken::StringLiteral(String::from("foo")),
            DCToken::StringLiteral(String::from("*")),
            // Escape sequences
            DCToken::EscapeCharacter(String::from("\\n")),
            DCToken::EscapeCharacter(String::from("\\t")),
            DCToken::EscapeCharacter(String::from("\\xa19")),
        ];
        lexer_test_for_target(
            "'a' '1' '*' \
            \"x\" \"foo\" \"*\" \
            \\n \\t \\xa19",
            target,
        );
    }

    #[test]
    fn data_types() {
        #[rustfmt::skip]
        let target: Vec<DCToken> = vec![
            DCToken::CharT,
            DCToken::Int8T, DCToken::Int16T, DCToken::Int32T, DCToken::Int64T,
            DCToken::UInt8T, DCToken::UInt16T, DCToken::UInt32T, DCToken::UInt64T,
            DCToken::Float32T,
            DCToken::Float64T,
            DCToken::StringT,
            DCToken::BlobT,
        ];
        lexer_test_for_target(
            "char \
            int8 int16 int32 int64 \
            uint8 uint16 uint32 uint64 \
            float32 float64 string blob",
            target,
        );
    }

    #[test]
    fn operators_and_delimiters() {
        let target: Vec<DCToken> = vec![
            // Operators
            DCToken::Percent,
            DCToken::Star,
            DCToken::Plus,
            DCToken::Hyphen,
            DCToken::ForwardSlash,
            // Delimiters
            DCToken::OpenParenthesis,
            DCToken::CloseParenthesis,
            DCToken::OpenBraces,
            DCToken::CloseBraces,
            DCToken::OpenBrackets,
            DCToken::CloseBrackets,
            DCToken::Comma,
            DCToken::Colon,
        ];
        lexer_test_for_target("%*+-/(){}[],:", target);
    }

    #[test]
    fn dc_keywords_tokens() {
        let target: Vec<DCToken> = vec![
            DCToken::DCKeyword("ram".to_string()),
            DCToken::DCKeyword("required".to_string()),
            DCToken::DCKeyword("db".to_string()),
            DCToken::DCKeyword("airecv".to_string()),
            DCToken::DCKeyword("ownrecv".to_string()),
            DCToken::DCKeyword("clrecv".to_string()),
            DCToken::DCKeyword("broadcast".to_string()),
            DCToken::DCKeyword("ownsend".to_string()),
            DCToken::DCKeyword("clsend".to_string()),
        ];
        lexer_test_for_target(
            "ram required db airecv ownrecv \
            clrecv broadcast ownsend clsend",
            target,
        );
    }

    /// A lone trailing backslash matches no rule except the catch-all,
    /// which panics.
    #[test]
    #[should_panic]
    fn unexpected_token_test() {
        let test_string: String = String::from("uint8 invalid_token = \\");
        let lexer = Lexer::new(&test_string).inspect(|tok| eprintln!("token: {:?}", tok));

        // Drain the lexer; the panic is expected while iterating.
        for _ in lexer {}
    }

    /// Each `keyword` sits on its own line, so the reported lines must be 1, 2, 3.
    #[test]
    fn register_newline() {
        let test_string: String = String::from("keyword\nkeyword\nkeyword");
        let lexer = Lexer::new(&test_string).inspect(|tok| eprintln!("token: {:?}", tok));

        // Collecting the lines also fails the test if no tokens are produced.
        let lines: Vec<usize> = lexer.map(|(_, span)| span.line).collect();
        assert_eq!(lines, vec![1, 2, 3], "Lexer failed to register a new line!");
    }
}