From 63d84b1edb6557afb2a484266e476642964bce0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20R=C3=B6ger?=
Date: Sun, 3 Nov 2024 20:24:36 +0100
Subject: [PATCH] add tokenizer

---
 src/main.rs             |  10 +-
 src/parser/mod.rs       |   2 +
 src/parser/token.rs     |  14 ++
 src/parser/tokenizer.rs | 358 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 383 insertions(+), 1 deletion(-)
 create mode 100644 src/parser/mod.rs
 create mode 100644 src/parser/token.rs
 create mode 100644 src/parser/tokenizer.rs

diff --git a/src/main.rs b/src/main.rs
index e7a11a9..456a5eb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,11 @@
+mod parser;
+
 fn main() {
-    println!("Hello, world!");
+    let mut test = "(add 10 (sub 1.1 200.5)) (concat-if true \"true\" 'nil (a . b))".chars();
+
+    let mut tkns = parser::tokenizer::tokenize(&mut test);
+
+    while let Some(tk) = tkns.next() {
+        println!("{:?}", tk);
+    }
 }
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
new file mode 100644
index 0000000..486ea3d
--- /dev/null
+++ b/src/parser/mod.rs
@@ -0,0 +1,2 @@
+pub mod token;
+pub mod tokenizer;
diff --git a/src/parser/token.rs b/src/parser/token.rs
new file mode 100644
index 0000000..4d7643a
--- /dev/null
+++ b/src/parser/token.rs
@@ -0,0 +1,14 @@
+#[derive(Debug, PartialEq, Clone)]
+/// Sum type of the different tokens.
+pub enum Token {
+    FloatLiteral(f64),
+    IntLiteral(i64),
+    Dot,
+    Nil,
+    ParClose,
+    ParOpen,
+    Quote,
+    StringLiteral(String),
+    Symbol(String),
+    True,
+}
diff --git a/src/parser/tokenizer.rs b/src/parser/tokenizer.rs
new file mode 100644
index 0000000..8416817
--- /dev/null
+++ b/src/parser/tokenizer.rs
@@ -0,0 +1,358 @@
+use super::token::Token;
+
+#[derive(Debug, Clone)]
+/// Errors the tokenizer can yield.
+pub enum TokenizerError {
+    /// The tokenizer could not read the associated sequence.
+    UnmatchedSequence(String),
+}
+
+/// A reader wrapping a `TokenStream`.
+/// When reading, it starts with the stream's staging buffer; once its
+/// end is reached, characters are copied one at a time from the input
+/// stream into the staging buffer.
+struct StagingReader<'a, I> {
+    head: usize,
+    stream: &'a mut TokenStream<I>,
+}
+
+impl<'a, I> StagingReader<'a, I>
+where
+    I: Iterator<Item = char>,
+{
+    /// Create a new `StagingReader` for a stream.
+    fn new(stream: &'a mut TokenStream<I>) -> StagingReader<'a, I> {
+        StagingReader { head: 0, stream }
+    }
+
+    /// Step the reader's head back by `n` chars, stopping at 0.
+    fn step_back(&mut self, n: usize) {
+        self.head = self.head.saturating_sub(n);
+    }
+}
+
+impl<'a, I> Iterator for StagingReader<'a, I>
+where
+    I: Iterator<Item = char>,
+{
+    type Item = char;
+
+    /// Get the char at `self.head`. If it is in the staging buffer, return it
+    /// and advance `self.head` by 1. If it is not, copy one char from the
+    /// input stream into the staging buffer. Returns `None` when the input
+    /// stream is empty and `self.head` points past the end of the staging
+    /// buffer.
+    fn next(&mut self) -> Option<char> {
+        if let Some(c) = self.stream.staging.get(self.head) {
+            self.head += 1;
+            Some(*c)
+        } else {
+            let next_char = self.stream.input.next()?;
+            self.stream.staging.push(next_char);
+            self.head += 1;
+            Some(next_char)
+        }
+    }
+}
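+
+// Example of the staging mechanism: with staging = [] and input "nil)",
+// `scan_symbol` pulls 'n', 'i', 'l', ')' into the staging buffer and steps
+// back before ')' (head = 3). Every scanner started afterwards by
+// `run_scanners` gets a fresh reader with head = 0 and re-reads those same
+// chars from staging, so failed or competing scans never lose input.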
+
+/// An iterator yielding tokens scanned from a stream of characters.
+pub struct TokenStream<InputStream> {
+    staging: Vec<char>,
+    input: InputStream,
+    error: bool,
+}
+
+impl<I> TokenStream<I>
+where
+    I: Iterator<Item = char>,
+{
+    fn new(input: I) -> TokenStream<I> {
+        TokenStream {
+            staging: Vec::new(),
+            input,
+            error: false,
+        }
+    }
+
+    fn skip_whitespace(&mut self) {
+        // Drop whitespace from the front of the staging buffer
+        while let Some(c) = self.staging.first() {
+            if c.is_whitespace() {
+                self.staging.remove(0);
+            } else {
+                return; // Readable character next, keep input untouched
+            }
+        }
+
+        // Staging buffer is empty, drop whitespace from the input
+        while let Some(c) = self.input.next() {
+            if !c.is_whitespace() {
+                self.staging.push(c);
+                return;
+            }
+        }
+    }
+
+    /// Run every scanner against the staged input and return the token of
+    /// the scanner that consumed the most characters, together with that
+    /// character count.
+    fn run_scanners(&mut self) -> Option<(Token, usize)> {
+        let scanners = [
+            scan_symbol,
+            scan_string_literal,
+            scan_integer,
+            scan_float,
+            scan_true,
+            scan_quote,
+            scan_dot,
+            scan_nil,
+            scan_par_close,
+            scan_par_open,
+        ];
+
+        scanners
+            .iter()
+            .filter_map(|scanner| {
+                let mut reader = StagingReader::new(self);
+                let token = scanner(&mut reader)?;
+                Some((token, reader.head))
+            })
+            // `max_by_key` returns the last maximal element, so on a tie the
+            // scanner listed later wins; this is what lets `scan_nil` beat
+            // `scan_symbol` for the input "nil".
+            .max_by_key(|pair| pair.1)
+    }
+}
+
+impl<I> Iterator for TokenStream<I>
+where
+    I: Iterator<Item = char>,
+{
+    type Item = Result<Token, TokenizerError>;
+
+    /// Get the next scanned token, consuming as many characters from the
+    /// wrapped input stream as necessary. If nothing can be scanned while the
+    /// input stream still has elements, an error is returned, and each
+    /// successive call to `next` will then return `None`.
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.error {
+            return None;
+        }
+
+        self.skip_whitespace();
+
+        match self.run_scanners() {
+            Some((tkn, n_read)) => {
+                self.staging.drain(0..n_read);
+                Some(Ok(tkn))
+            }
+            None if self.staging.is_empty() => None,
+            None => {
+                let remaining = self.staging.iter().collect();
+                self.staging.clear();
+                self.error = true;
+                Some(Err(TokenizerError::UnmatchedSequence(remaining)))
+            }
+        }
+    }
+}
+
+/// Run the tokenizer on an iterator of chars and return an
+/// iterator of tokens as a result.
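+///
+/// # Examples
+///
+/// A minimal usage sketch (not compiled as a doctest, since this crate is a
+/// binary; it mirrors the call in `src/main.rs`):
+///
+/// ```ignore
+/// let mut tkns = tokenize("(add 1 2)".chars());
+///
+/// assert_eq!(tkns.next().unwrap().unwrap(), Token::ParOpen);
+/// assert_eq!(tkns.next().unwrap().unwrap(), Token::Symbol("add".into()));
+/// assert_eq!(tkns.next().unwrap().unwrap(), Token::IntLiteral(1));
+/// ```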
+pub fn tokenize<I>(input: I) -> TokenStream<I>
+where
+    I: Iterator<Item = char>,
+{
+    TokenStream::new(input)
+}
+
+fn scan_par_open<I>(reader: &mut StagingReader<I>) -> Option<Token>
+where
+    I: Iterator<Item = char>,
+{
+    match reader.next()? {
+        '(' => Some(Token::ParOpen),
+        _ => {
+            reader.step_back(1);
+            None
+        }
+    }
+}
+
+fn scan_par_close<I>(reader: &mut StagingReader<I>) -> Option<Token>
+where
+    I: Iterator<Item = char>,
+{
+    match reader.next()? {
+        ')' => Some(Token::ParClose),
+        _ => {
+            reader.step_back(1);
+            None
+        }
+    }
+}
+
+fn scan_dot<I>(reader: &mut StagingReader<I>) -> Option<Token>
+where
+    I: Iterator<Item = char>,
+{
+    match reader.next()? {
+        '.' => Some(Token::Dot),
+        _ => {
+            reader.step_back(1);
+            None
+        }
+    }
+}
+
+fn scan_string_literal<I>(reader: &mut StagingReader<I>) -> Option<Token>
+where
+    I: Iterator<Item = char>,
+{
+    let mut lit = String::new();
+
+    if reader.next()? == '"' {
+        for c in reader {
+            match c {
+                '"' => {
+                    return Some(Token::StringLiteral(lit));
+                }
+                c => {
+                    lit.push(c);
+                }
+            }
+        }
+    }
+
+    None
+}
+
+fn scan_nil<I>(reader: &mut StagingReader<I>) -> Option<Token>
+where
+    I: Iterator<Item = char>,
+{
+    if reader.next()? == 'n' && reader.next()? == 'i' && reader.next()? == 'l' {
+        Some(Token::Nil)
+    } else {
+        reader.step_back(3);
+        None
+    }
+}
+
+fn scan_quote<I>(reader: &mut StagingReader<I>) -> Option<Token>
+where
+    I: Iterator<Item = char>,
+{
+    if let Some('\'') = reader.next() {
+        Some(Token::Quote)
+    } else {
+        reader.step_back(1);
+        None
+    }
+}
+
+fn scan_symbol<I>(reader: &mut StagingReader<I>) -> Option<Token>
+where
+    I: Iterator<Item = char>,
+{
+    let mut sym = String::new();
+
+    while let Some(c) = reader.next() {
+        if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
+            sym.push(c);
+        } else {
+            reader.step_back(1);
+            break;
+        }
+    }
+
+    if !sym.is_empty() {
+        Some(Token::Symbol(sym))
+    } else {
+        None
+    }
+}
+
+fn scan_true<I>(reader: &mut StagingReader<I>) -> Option<Token>
+where
+    I: Iterator<Item = char>,
+{
+    if reader.next()? == 't'
+        && reader.next()? == 'r'
+        && reader.next()? == 'u'
+        && reader.next()? == 'e'
+    {
+        Some(Token::True)
+    } else {
+        reader.step_back(4);
+        None
+    }
+}
+
+fn scan_integer<I>(reader: &mut StagingReader<I>) -> Option<Token>
+where
+    I: Iterator<Item = char>,
+{
+    let mut buf = String::new();
+
+    while let Some(c) = reader.next() {
+        if c.is_ascii_digit() {
+            buf.push(c);
+        } else {
+            reader.step_back(1);
+            break;
+        }
+    }
+
+    if !buf.is_empty() {
+        buf.parse().map(Token::IntLiteral).ok()
+    } else {
+        None
+    }
+}
+
+fn scan_float<I>(reader: &mut StagingReader<I>) -> Option<Token>
+where
+    I: Iterator<Item = char>,
+{
+    let mut buf = String::new();
+    let mut has_dot = false;
+
+    while let Some(c) = reader.next() {
+        if c.is_ascii_digit() {
+            buf.push(c);
+        } else if c == '.' && !has_dot {
+            buf.push(c);
+            has_dot = true;
+        } else {
+            reader.step_back(1);
+            break;
+        }
+    }
+
+    if !buf.is_empty() && has_dot {
+        buf.parse().map(Token::FloatLiteral).ok()
+    } else {
+        None
+    }
+}
+
+#[test]
+fn test_tokenize() {
+    let test_str = "(\"abcdefg( )123\" )(\n\t 'nil true \"true\")00987463 123.125 .";
+
+    let result: Vec<_> = tokenize(&mut test_str.chars()).collect();
+
+    assert_eq!(result.len(), 12);
+    assert_eq!(result[0].clone().unwrap(), Token::ParOpen);
+    assert_eq!(
+        result[1].clone().unwrap(),
+        Token::StringLiteral(String::from("abcdefg( )123"))
+    );
+    assert_eq!(result[2].clone().unwrap(), Token::ParClose);
+    assert_eq!(result[3].clone().unwrap(), Token::ParOpen);
+    assert_eq!(result[4].clone().unwrap(), Token::Quote);
+    assert_eq!(result[5].clone().unwrap(), Token::Nil);
+    assert_eq!(result[6].clone().unwrap(), Token::True);
+    assert_eq!(
+        result[7].clone().unwrap(),
+        Token::StringLiteral(String::from("true"))
+    );
+    assert_eq!(result[8].clone().unwrap(), Token::ParClose);
+    assert_eq!(result[9].clone().unwrap(), Token::IntLiteral(987463));
+    assert_eq!(result[10].clone().unwrap(), Token::FloatLiteral(123.125));
+    assert_eq!(result[11].clone().unwrap(), Token::Dot);
+}
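+
+#[test]
+fn test_tokenize_error() {
+    // Error-path sketch: an unterminated string literal matches no scanner,
+    // so the stream yields a single `UnmatchedSequence` error carrying the
+    // unread remainder, and every call after that returns `None`.
+    let mut tkns = tokenize("\"abc".chars());
+
+    assert!(matches!(
+        tkns.next(),
+        Some(Err(TokenizerError::UnmatchedSequence(s))) if s == "\"abc"
+    ));
+    assert!(tkns.next().is_none());
+}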