Lexer supports multi-byte utf8 characters and "pi", "π" and "e"
This commit is contained in:
parent
1fe9214a62
commit
424e6988e9
7
Cargo.lock
generated
7
Cargo.lock
generated
@ -29,6 +29,7 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"decimal 2.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
"decimal 2.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"nom 5.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"nom 5.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"unicode-segmentation 1.6.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -127,6 +128,11 @@ name = "static_assertions"
|
|||||||
version = "0.3.4"
|
version = "0.3.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-segmentation"
|
||||||
|
version = "1.6.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "version_check"
|
name = "version_check"
|
||||||
version = "0.1.5"
|
version = "0.1.5"
|
||||||
@ -151,4 +157,5 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
|
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
|
||||||
"checksum serde 1.0.103 (registry+https://github.com/rust-lang/crates.io-index)" = "1217f97ab8e8904b57dd22eb61cde455fa7446a9c1cf43966066da047c1f3702"
|
"checksum serde 1.0.103 (registry+https://github.com/rust-lang/crates.io-index)" = "1217f97ab8e8904b57dd22eb61cde455fa7446a9c1cf43966066da047c1f3702"
|
||||||
"checksum static_assertions 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7f3eb36b47e512f8f1c9e3d10c2c1965bc992bd9cdb024fa581e2194501c83d3"
|
"checksum static_assertions 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7f3eb36b47e512f8f1c9e3d10c2c1965bc992bd9cdb024fa581e2194501c83d3"
|
||||||
|
"checksum unicode-segmentation 1.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0"
|
||||||
"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd"
|
"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd"
|
||||||
|
|||||||
@ -6,4 +6,5 @@ edition = "2018"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
nom = "5.0"
|
nom = "5.0"
|
||||||
|
unicode-segmentation = "1.6.0"
|
||||||
decimal = "2.0.4"
|
decimal = "2.0.4"
|
||||||
|
|||||||
48
src/lexer.rs
48
src/lexer.rs
@ -1,13 +1,16 @@
|
|||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use decimal::d128;
|
use decimal::d128;
|
||||||
use crate::{Token, TokenVector, Operator::*};
|
use crate::{Token, TokenVector};
|
||||||
|
use crate::Operator::{Caret, Divide, Factorial, LeftParen, Minus, Multiply, PercentOrModulo, Plus, RightParen};
|
||||||
|
use crate::Constant::{Pi, EulersNumber};
|
||||||
|
|
||||||
pub fn lex(input: &str) -> Result<TokenVector, String> {
|
pub fn lex(input: &str) -> Result<TokenVector, String> {
|
||||||
|
|
||||||
let mut chars = input.chars().enumerate().peekable();
|
let mut chars = input.chars().enumerate().peekable();
|
||||||
let mut tokens: TokenVector = vec![];
|
let mut tokens: TokenVector = vec![];
|
||||||
|
|
||||||
while let Some((index, current_char)) = chars.next() {
|
let mut byte_index = 0;
|
||||||
|
while let Some((_index, current_char)) = chars.next() {
|
||||||
match current_char {
|
match current_char {
|
||||||
'+' => tokens.push(Token::Operator(Plus)),
|
'+' => tokens.push(Token::Operator(Plus)),
|
||||||
'-' => tokens.push(Token::Operator(Minus)),
|
'-' => tokens.push(Token::Operator(Minus)),
|
||||||
@ -18,16 +21,37 @@ pub fn lex(input: &str) -> Result<TokenVector, String> {
|
|||||||
'!' => tokens.push(Token::Operator(Factorial)),
|
'!' => tokens.push(Token::Operator(Factorial)),
|
||||||
'(' => tokens.push(Token::Operator(LeftParen)),
|
'(' => tokens.push(Token::Operator(LeftParen)),
|
||||||
')' => tokens.push(Token::Operator(RightParen)),
|
')' => tokens.push(Token::Operator(RightParen)),
|
||||||
|
'π' => tokens.push(Token::Constant(Pi)),
|
||||||
',' => continue,
|
',' => continue,
|
||||||
value if value.is_whitespace() => continue,
|
value if value.is_whitespace() => continue,
|
||||||
value if value.is_alphabetic() => {
|
value if value.is_alphabetic() => {
|
||||||
|
|
||||||
|
let start_index = byte_index;
|
||||||
|
let mut end_index = byte_index;
|
||||||
|
while let Some((_index, current_char)) = chars.peek() {
|
||||||
|
if current_char.is_alphabetic() {
|
||||||
|
chars.next();
|
||||||
|
end_index += 1;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let string = &input[start_index..=end_index];
|
||||||
|
match string {
|
||||||
|
"pi" => tokens.push(Token::Constant(Pi)),
|
||||||
|
"e" => tokens.push(Token::Constant(EulersNumber)),
|
||||||
|
_ => {
|
||||||
|
return Err(format!("Invalid string: {}", string))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
},
|
},
|
||||||
'.' | '0'..='9' => {
|
'.' | '0'..='9' => {
|
||||||
|
|
||||||
let start_index = index;
|
let start_index = byte_index;
|
||||||
let mut end_index = index+1;
|
let mut end_index = byte_index;
|
||||||
while let Some((_idk, current_char)) = chars.peek() {
|
while let Some((_index, current_char)) = chars.peek() {
|
||||||
if current_char == &'.' || current_char.is_digit(10) {
|
if current_char == &'.' || current_char.is_digit(10) {
|
||||||
chars.next();
|
chars.next();
|
||||||
end_index += 1;
|
end_index += 1;
|
||||||
@ -36,24 +60,30 @@ pub fn lex(input: &str) -> Result<TokenVector, String> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
match d128::from_str(&input[start_index..end_index]) {
|
let number_string = &input[start_index..=end_index];
|
||||||
|
match d128::from_str(number_string) {
|
||||||
Ok(number) => {
|
Ok(number) => {
|
||||||
if d128::get_status().is_empty() {
|
if d128::get_status().is_empty() {
|
||||||
tokens.push(Token::Number(number));
|
tokens.push(Token::Number(number));
|
||||||
} else {
|
} else {
|
||||||
return Err(format!("Error parsing d128 number: {}", &input[start_index..end_index]));
|
return Err(format!("Error parsing d128 number: {}", number_string));
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
Err(_e) => {
|
Err(_e) => {
|
||||||
return Err("Error parsing d128 number (This should not happen because d128 does not throw errors)".to_owned());
|
return Err(format!("Error parsing d128 number: {}", number_string));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
},
|
},
|
||||||
_ => {
|
_ => {
|
||||||
return Err(format!("Unknown character: {}", current_char));
|
return Err(format!("Invalid character: {}", current_char));
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
// The π character, for example, is more than one byte, so in that case
|
||||||
|
// byte_index needs to be incremented by 2. This is because we're slicing
|
||||||
|
// strings to get digits/words, and Rust slices bytes, not utf8 graphemes
|
||||||
|
// (aka "user-perceived characters").
|
||||||
|
byte_index += current_char.len_utf8();
|
||||||
};
|
};
|
||||||
return Ok(tokens)
|
return Ok(tokens)
|
||||||
}
|
}
|
||||||
|
|||||||
@ -14,10 +14,18 @@ pub enum Operator {
|
|||||||
RightParen,
|
RightParen,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Named mathematical constants that can appear in an expression.
///
/// The enum carries no data, so it is cheap to copy and can be compared
/// directly (useful when asserting on lexer output).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Constant {
    /// The constant written as `π` or `pi`.
    Pi,
    /// The constant written as `e`.
    EulersNumber,
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum Token {
|
pub enum Token {
|
||||||
Operator(Operator),
|
Operator(Operator),
|
||||||
Number(d128),
|
Number(d128),
|
||||||
|
Function(String),
|
||||||
|
Constant(Constant),
|
||||||
}
|
}
|
||||||
|
|
||||||
pub type TokenVector = Vec<Token>;
|
pub type TokenVector = Vec<Token>;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user