253 lines
9.6 KiB
Rust
253 lines
9.6 KiB
Rust
use std::str::FromStr;
|
|
use decimal::d128;
|
|
use crate::{Token, TokenVector};
|
|
use crate::Operator::{Caret, Divide, LeftParen, Minus, Modulo, Multiply, Plus, RightParen};
|
|
use crate::UnaryOperator::{Percent, Factorial};
|
|
use crate::TextOperator::{Of, To};
|
|
use crate::Constant::{E, Pi};
|
|
use crate::FunctionIdentifier::{Cbrt, Ceil, Cos, Exp, Abs, Floor, Ln, Log, Round, Sin, Sqrt, Tan};
|
|
use crate::units::Unit::*;
|
|
|
|
pub fn lex(input: &str) -> Result<TokenVector, String> {
|
|
|
|
let mut chars = input.chars().enumerate().peekable();
|
|
let mut tokens: TokenVector = vec![];
|
|
let max_word_length = 12;
|
|
|
|
let mut left_paren_count = 0;
|
|
let mut right_paren_count = 0;
|
|
|
|
let mut byte_index = 0;
|
|
while let Some((_index, current_char)) = chars.next() {
|
|
match current_char {
|
|
'+' => tokens.push(Token::Operator(Plus)),
|
|
'-' => tokens.push(Token::Operator(Minus)),
|
|
'*' => tokens.push(Token::Operator(Multiply)),
|
|
'/' => tokens.push(Token::Operator(Divide)),
|
|
'%' => tokens.push(Token::Operator(Modulo)),
|
|
'^' => tokens.push(Token::Operator(Caret)),
|
|
'!' => tokens.push(Token::UnaryOperator(Factorial)),
|
|
'(' => {
|
|
left_paren_count += 1;
|
|
tokens.push(Token::Operator(LeftParen));
|
|
},
|
|
')' => {
|
|
right_paren_count += 1;
|
|
tokens.push(Token::Operator(RightParen));
|
|
},
|
|
'π' => tokens.push(Token::Constant(Pi)),
|
|
',' => {},
|
|
value if value.is_whitespace() => {},
|
|
value if value.is_alphabetic() => {
|
|
|
|
let start_index = byte_index;
|
|
let mut end_index = byte_index;
|
|
while let Some((_index, current_char)) = chars.peek() {
|
|
// don't loop more than max_word_length:
|
|
if end_index >= start_index + max_word_length - 1 { break; }
|
|
|
|
if current_char.is_alphabetic() {
|
|
byte_index += current_char.len_utf8();
|
|
chars.next();
|
|
end_index += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
let string = &input[start_index..=end_index];
|
|
match string {
|
|
|
|
// MAKE SURE max_word_length IS EQUAL TO THE
|
|
// LENGTH OF THE LONGEST STRING IN THIS MATCH STATEMENT.
|
|
|
|
"to" => tokens.push(Token::TextOperator(To)),
|
|
"of" => tokens.push(Token::TextOperator(Of)),
|
|
|
|
"pi" => tokens.push(Token::Constant(Pi)),
|
|
"e" => tokens.push(Token::Constant(E)),
|
|
|
|
"mod" => tokens.push(Token::Operator(Modulo)),
|
|
|
|
"sqrt" => tokens.push(Token::FunctionIdentifier(Sqrt)),
|
|
"cbrt" => tokens.push(Token::FunctionIdentifier(Cbrt)),
|
|
|
|
"log" => tokens.push(Token::FunctionIdentifier(Log)),
|
|
"ln" => tokens.push(Token::FunctionIdentifier(Ln)),
|
|
"exp" => tokens.push(Token::FunctionIdentifier(Exp)),
|
|
|
|
"round" | "rint" => tokens.push(Token::FunctionIdentifier(Round)),
|
|
"ceil" => tokens.push(Token::FunctionIdentifier(Ceil)),
|
|
"floor" => tokens.push(Token::FunctionIdentifier(Floor)),
|
|
"abs" | "fabs" => tokens.push(Token::FunctionIdentifier(Abs)),
|
|
|
|
"sin" => tokens.push(Token::FunctionIdentifier(Sin)),
|
|
"cos" => tokens.push(Token::FunctionIdentifier(Cos)),
|
|
"tan" => tokens.push(Token::FunctionIdentifier(Tan)),
|
|
|
|
"ns" | "nanosecond" | "nanoseconds" => tokens.push(Token::Unit(Nanosecond)),
|
|
"μs" | "us" | "microsecond" | "microseconds" => tokens.push(Token::Unit(Microsecond)),
|
|
"ms" | "millisecond" | "milliseconds" => tokens.push(Token::Unit(Millisecond)),
|
|
"s" | "sec" | "second" | "seconds" => tokens.push(Token::Unit(Second)),
|
|
"min" | "minute" | "minutes" => tokens.push(Token::Unit(Minute)),
|
|
"h" | "hour" | "hours" => tokens.push(Token::Unit(Hour)),
|
|
"day" | "days" => tokens.push(Token::Unit(Day)),
|
|
"week" | "weeks" => tokens.push(Token::Unit(Week)),
|
|
"mo" | "month" | "months" => tokens.push(Token::Unit(Month)),
|
|
"q" | "quater" | "quaters" => tokens.push(Token::Unit(Month)),
|
|
"yr" | "year" | "years" => tokens.push(Token::Unit(Year)),
|
|
"decade" | "decades" => tokens.push(Token::Unit(Decade)),
|
|
"century" | "centuries" => tokens.push(Token::Unit(Century)),
|
|
"millenium" | "millenia" | "milleniums" => tokens.push(Token::Unit(Millenium)),
|
|
|
|
"mm" | "millimeter" | "millimeters" => tokens.push(Token::Unit(Millimeter)),
|
|
"cm" | "centimeter" | "centimeters" => tokens.push(Token::Unit(Centimeter)),
|
|
"dm" | "decimeter" | "decimeters" => tokens.push(Token::Unit(Centimeter)),
|
|
"m" | "meter" | "meters" => tokens.push(Token::Unit(Meter)),
|
|
"km" | "kilometer" | "kilometers" => tokens.push(Token::Unit(Kilometer)),
|
|
"in" | "inch" | "inches" => tokens.push(Token::Unit(Inch)),
|
|
"ft" | "foot" | "feet" => tokens.push(Token::Unit(Foot)),
|
|
"yd" | "yard" | "yards" => tokens.push(Token::Unit(Yard)),
|
|
"mi" | "mile" | "miles" => tokens.push(Token::Unit(Mile)),
|
|
"nmi" => tokens.push(Token::Unit(NauticalMile)),
|
|
|
|
// two word unit
|
|
"nautical" | "square" | "cubic" => {
|
|
// skip past whitespace
|
|
if let Some((_index, current_char)) = chars.peek() {
|
|
if current_char.is_whitespace() {
|
|
byte_index += current_char.len_utf8();
|
|
chars.next();
|
|
}
|
|
}
|
|
// prevent off-by-one error causing string to be " mile"
|
|
byte_index += current_char.len_utf8();
|
|
chars.next();
|
|
|
|
let start_index = byte_index;
|
|
let mut end_index = byte_index;
|
|
while let Some((_index, current_char)) = chars.peek() {
|
|
// don't loop more than max_word_length:
|
|
if end_index >= start_index + max_word_length - 1 { break; }
|
|
|
|
if current_char.is_alphabetic() {
|
|
byte_index += current_char.len_utf8();
|
|
end_index += 1;
|
|
chars.next();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
let second_string = &input[start_index..=end_index];
|
|
let full_string = format!("{} {}", string, second_string);
|
|
match full_string.as_str() {
|
|
"nautical mile" => tokens.push(Token::Unit(NauticalMile)),
|
|
|
|
"square meter" | "square meters" => tokens.push(Token::Unit(SquareMeter)),
|
|
|
|
"cubic meter" | "cubic meters" => tokens.push(Token::Unit(CubicMeter)),
|
|
|
|
_ => {
|
|
return Err(format!("Invalid string: {}", string));
|
|
}
|
|
}
|
|
}
|
|
|
|
_ => {
|
|
return Err(format!("Invalid string: {}", string));
|
|
}
|
|
}
|
|
|
|
},
|
|
'.' | '0'..='9' => {
|
|
|
|
let start_index = byte_index;
|
|
let mut end_index = byte_index;
|
|
while let Some((_index, current_char)) = chars.peek() {
|
|
if current_char == &'.' || current_char.is_digit(10) {
|
|
byte_index += current_char.len_utf8();
|
|
chars.next();
|
|
end_index += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
let number_string = &input[start_index..=end_index];
|
|
match d128::from_str(number_string) {
|
|
Ok(number) => {
|
|
if d128::get_status().is_empty() {
|
|
tokens.push(Token::Number(number));
|
|
} else {
|
|
return Err(format!("Error parsing d128 number: {}", number_string));
|
|
}
|
|
},
|
|
Err(_e) => {
|
|
return Err(format!("Error parsing d128 number: {}", number_string));
|
|
}
|
|
};
|
|
|
|
},
|
|
_ => {
|
|
return Err(format!("Invalid character: {}", current_char));
|
|
},
|
|
}
|
|
// The π character, for example, is more than one byte, so in that case
|
|
// byte_index needs to be incremented by 2. This is because we're slicing
|
|
// strings to get digits/words, and Rust slices bytes, not utf8 graphemes
|
|
// (aka "user-perceived characters").
|
|
byte_index += current_char.len_utf8();
|
|
};
|
|
|
|
// auto insert missing parentheses in first and last position
|
|
if left_paren_count > right_paren_count {
|
|
let missing_right_parens = left_paren_count - right_paren_count;
|
|
for _ in 0..missing_right_parens {
|
|
tokens.push(Token::Operator(RightParen));
|
|
}
|
|
} else if left_paren_count < right_paren_count {
|
|
let missing_left_parens = right_paren_count - left_paren_count;
|
|
for _ in 0..missing_left_parens {
|
|
tokens.insert(0, Token::Operator(LeftParen));
|
|
}
|
|
}
|
|
|
|
// wrap in parentheses acting as start and end for parsing.
|
|
tokens.push(Token::Operator(RightParen));
|
|
tokens.insert(0, Token::Operator(LeftParen));
|
|
|
|
// the lexer parses percentages as modulo, so here modulos become percentages
|
|
let mut token_index = 0;
|
|
for _i in 1..tokens.len() {
|
|
match tokens[token_index] {
|
|
Token::Operator(Modulo) => {
|
|
match &tokens[token_index + 1] {
|
|
Token::TextOperator(Of) => {
|
|
// for example "10% of 1km" should be a percentage, not modulo
|
|
tokens[token_index] = Token::UnaryOperator(Percent);
|
|
},
|
|
Token::Operator(operator) => {
|
|
match operator {
|
|
LeftParen => {},
|
|
_ => {
|
|
// for example "10%*2" should be a percentage, but "10%(2)" should be modulo
|
|
tokens[token_index] = Token::UnaryOperator(Percent);
|
|
}
|
|
}
|
|
},
|
|
Token::UnaryOperator(_operator) => {
|
|
// for example "10%!" should be a percentage, but "10%(2)" should be modulo
|
|
tokens[token_index] = Token::UnaryOperator(Percent);
|
|
},
|
|
_ => {},
|
|
}
|
|
}
|
|
_ => {},
|
|
}
|
|
token_index += 1;
|
|
}
|
|
|
|
Ok(tokens)
|
|
}
|