Fix multi-word parsing, revamp lexer to be recursive

This commit is contained in:
Kasper 2021-07-05 17:03:17 +02:00
parent 31077f669e
commit eaaee87b2c

View File

@ -54,8 +54,9 @@ pub fn read_word_plain(chars: &mut Peekable<Graphemes>) -> String {
/// Read next as a word, otherwise return empty string. /// Read next as a word, otherwise return empty string.
/// Leading whitespace is ignored. A trailing digit may be included. /// Leading whitespace is ignored. A trailing digit may be included.
pub fn read_word(chars: &mut Peekable<Graphemes>) -> String { pub fn read_word(first_c: &str, lexer: &mut Lexer) -> String {
// skip whitespace // skip whitespace
let chars = &mut lexer.chars;
while let Some(current_char) = chars.peek() { while let Some(current_char) = chars.peek() {
if current_char.trim().is_empty() { if current_char.trim().is_empty() {
chars.next(); chars.next();
@ -63,7 +64,7 @@ pub fn read_word(chars: &mut Peekable<Graphemes>) -> String {
break; break;
} }
} }
let mut word = String::new(); let mut word = first_c.to_owned();
while let Some(next_char) = chars.peek() { while let Some(next_char) = chars.peek() {
if is_alphabetic_extended_str(&next_char) { if is_alphabetic_extended_str(&next_char) {
word += chars.next().unwrap(); word += chars.next().unwrap();
@ -85,8 +86,68 @@ pub fn read_word(chars: &mut Peekable<Graphemes>) -> String {
return word; return word;
} }
pub fn parse_word(tokens: &mut Vec<Token>, chars: &mut Peekable<Graphemes>, default_degree: Unit) -> Result<(), String> { pub fn parse_token(c: &str, lexer: &mut Lexer) -> Result<(), String> {
let token = match read_word(chars).as_str() { let tokens = &mut lexer.tokens;
match c {
value if value.trim().is_empty() => {},
value if is_alphabetic_extended_str(&value) => {
parse_word(read_word(c, lexer).as_str(), lexer)?;
},
value if is_numeric_str(value) => {
let mut number_string = value.to_owned();
while let Some(number_char) = lexer.chars.peek() {
if is_numeric_str(number_char) {
number_string += number_char;
lexer.chars.next();
} else {
break;
}
}
d128::set_status(decimal::Status::empty());
match d128::from_str(&number_string) {
Ok(number) => {
if d128::get_status().is_empty() {
tokens.push(Token::Number(number));
} else {
return Err(format!("Error lexing d128 number: {}", number_string));
}
},
Err(_e) => {
return Err(format!("Error lexing d128 number: {}", number_string));
}
};
},
"+" => tokens.push(Token::Operator(Plus)),
"-" => tokens.push(Token::Operator(Minus)),
"*" => tokens.push(Token::Operator(Multiply)),
"/" => tokens.push(Token::Operator(Divide)),
"%" => tokens.push(Token::LexerKeyword(PercentChar)),
"^" => tokens.push(Token::Operator(Caret)),
"!" => tokens.push(Token::UnaryOperator(Factorial)),
"(" => {
// left_paren_count += 1;
tokens.push(Token::Operator(LeftParen));
},
")" => {
// right_paren_count += 1;
tokens.push(Token::Operator(RightParen));
},
"π" => tokens.push(Token::Constant(Pi)),
"'" => tokens.push(Token::Unit(Foot)),
"\"" | "" | "" | "" => tokens.push(Token::LexerKeyword(DoubleQuotes)),
"Ω" | "" => tokens.push(Token::Unit(Ohm)),
_ => {
return Err(format!("Invalid character: {}", c));
},
}
if let Some(next_c) = lexer.chars.next() {
parse_token(next_c, lexer)?;
}
Ok(())
}
pub fn parse_word(word: &str, lexer: &mut Lexer) -> Result<(), String> {
let token = match word {
"to" => Token::TextOperator(To), "to" => Token::TextOperator(To),
"of" => Token::TextOperator(Of), "of" => Token::TextOperator(Of),
@ -167,7 +228,7 @@ pub fn parse_word(tokens: &mut Vec<Token>, chars: &mut Peekable<Graphemes>, defa
"mi" | "mile" | "miles" => Token::Unit(Mile), "mi" | "mile" | "miles" => Token::Unit(Mile),
"nmi" => Token::Unit(NauticalMile), "nmi" => Token::Unit(NauticalMile),
"nautical" => { "nautical" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"mile" | "miles" => Token::Unit(NauticalMile), "mile" | "miles" => Token::Unit(NauticalMile),
string => return Err(format!("Invalid string: {}", string)), string => return Err(format!("Invalid string: {}", string)),
} }
@ -175,7 +236,7 @@ pub fn parse_word(tokens: &mut Vec<Token>, chars: &mut Peekable<Graphemes>, defa
"ly" | "lightyear" | "lightyears" => Token::Unit(LightYear), "ly" | "lightyear" | "lightyears" => Token::Unit(LightYear),
"lightsec" | "lightsecs" | "lightsecond" | "lightseconds" => Token::Unit(LightSecond), "lightsec" | "lightsecs" | "lightsecond" | "lightseconds" => Token::Unit(LightSecond),
"light" => { "light" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"yr" | "yrs" | "year" | "years" => Token::Unit(LightYear), "yr" | "yrs" | "year" | "years" => Token::Unit(LightYear),
"sec" | "secs" | "second" | "seconds" => Token::Unit(LightSecond), "sec" | "secs" | "second" | "seconds" => Token::Unit(LightSecond),
string => return Err(format!("Invalid string: {}", string)), string => return Err(format!("Invalid string: {}", string)),
@ -192,7 +253,7 @@ pub fn parse_word(tokens: &mut Vec<Token>, chars: &mut Peekable<Graphemes>, defa
"sqyd" | "yd2" | "yard2" | "yards2" => Token::Unit(SquareYard), "sqyd" | "yd2" | "yard2" | "yards2" => Token::Unit(SquareYard),
"sqmi" | "mi2" | "mile2" | "miles2" => Token::Unit(SquareMile), "sqmi" | "mi2" | "mile2" | "miles2" => Token::Unit(SquareMile),
"sq" | "square" => { "sq" | "square" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"mm" | "millimeter" | "millimeters" | "millimetre" | "millimetres" => Token::Unit(SquareMillimeter), "mm" | "millimeter" | "millimeters" | "millimetre" | "millimetres" => Token::Unit(SquareMillimeter),
"cm" | "centimeter" | "centimeters" | "centimetre" | "centimetres" => Token::Unit(SquareCentimeter), "cm" | "centimeter" | "centimeters" | "centimetre" | "centimetres" => Token::Unit(SquareCentimeter),
"dm" | "decimeter" | "decimeters" | "decimetre" | "decimetres" => Token::Unit(SquareDecimeter), "dm" | "decimeter" | "decimeters" | "decimetre" | "decimetres" => Token::Unit(SquareDecimeter),
@ -220,7 +281,7 @@ pub fn parse_word(tokens: &mut Vec<Token>, chars: &mut Peekable<Graphemes>, defa
"yd3" | "yard3" | "yards3" => Token::Unit(CubicYard), "yd3" | "yard3" | "yards3" => Token::Unit(CubicYard),
"mi3" | "mile3" | "miles3" => Token::Unit(CubicMile), "mi3" | "mile3" | "miles3" => Token::Unit(CubicMile),
"cubic" => { "cubic" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"mm" | "millimeter" | "millimeters" | "millimetre" | "millimetres" => Token::Unit(CubicMillimeter), "mm" | "millimeter" | "millimeters" | "millimetre" | "millimetres" => Token::Unit(CubicMillimeter),
"cm" | "centimeter" | "centimeters" | "centimetre" | "centimetres" => Token::Unit(CubicCentimeter), "cm" | "centimeter" | "centimeters" | "centimetre" | "centimetres" => Token::Unit(CubicCentimeter),
"dm" | "decimeter" | "decimeters" | "decimetre" | "decimetres" => Token::Unit(CubicDecimeter), "dm" | "decimeter" | "decimeters" | "decimetre" | "decimetres" => Token::Unit(CubicDecimeter),
@ -241,7 +302,7 @@ pub fn parse_word(tokens: &mut Vec<Token>, chars: &mut Peekable<Graphemes>, defa
"tbs" | "tbsp" | "tablespoon" | "tablespoons" => Token::Unit(Tablespoon), "tbs" | "tbsp" | "tablespoon" | "tablespoons" => Token::Unit(Tablespoon),
"floz" => Token::Unit(FluidOunce), "floz" => Token::Unit(FluidOunce),
"fl" | "fluid" => { "fl" | "fluid" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"oz" | "ounce" | "ounces" => Token::Unit(FluidOunce), "oz" | "ounce" | "ounces" => Token::Unit(FluidOunce),
string => return Err(format!("Invalid string: {}", string)), string => return Err(format!("Invalid string: {}", string)),
} }
@ -252,14 +313,14 @@ pub fn parse_word(tokens: &mut Vec<Token>, chars: &mut Peekable<Graphemes>, defa
"gal" | "gallon" | "gallons" => Token::Unit(Gallon), "gal" | "gallon" | "gallons" => Token::Unit(Gallon),
"bbl" | "oil barrel" | "oil barrels" => Token::Unit(OilBarrel), "bbl" | "oil barrel" | "oil barrels" => Token::Unit(OilBarrel),
"oil" => { "oil" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"barrel" | "barrels" => Token::Unit(OilBarrel), "barrel" | "barrels" => Token::Unit(OilBarrel),
string => return Err(format!("Invalid string: {}", string)), string => return Err(format!("Invalid string: {}", string)),
} }
}, },
"metric" => { "metric" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"ton" | "tons" | "tonne" | "tonnes" => Token::Unit(MetricTon), "ton" | "tons" | "tonne" | "tonnes" => Token::Unit(MetricTon),
"hp" | "hps" | "horsepower" | "horsepowers" => Token::Unit(MetricHorsepower), "hp" | "hps" | "horsepower" | "horsepowers" => Token::Unit(MetricHorsepower),
string => return Err(format!("Invalid string: {}", string)), string => return Err(format!("Invalid string: {}", string)),
@ -299,14 +360,14 @@ pub fn parse_word(tokens: &mut Vec<Token>, chars: &mut Peekable<Graphemes>, defa
"stone" | "stones" => Token::Unit(Stone), "stone" | "stones" => Token::Unit(Stone),
"st" | "ton" | "tons" => Token::Unit(ShortTon), "st" | "ton" | "tons" => Token::Unit(ShortTon),
"short" => { "short" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"ton" | "tons" | "tonne" | "tonnes" => Token::Unit(ShortTon), "ton" | "tons" | "tonne" | "tonnes" => Token::Unit(ShortTon),
string => return Err(format!("Invalid string: {}", string)), string => return Err(format!("Invalid string: {}", string)),
} }
}, },
"lt" => Token::Unit(LongTon), "lt" => Token::Unit(LongTon),
"long" => { "long" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"ton" | "tons" | "tonne" | "tonnes" => Token::Unit(LongTon), "ton" | "tons" | "tonne" | "tonnes" => Token::Unit(LongTon),
string => return Err(format!("Invalid string: {}", string)), string => return Err(format!("Invalid string: {}", string)),
} }
@ -363,9 +424,9 @@ pub fn parse_word(tokens: &mut Vec<Token>, chars: &mut Peekable<Graphemes>, defa
"kcal" | "kilocalorie" | "kilocalories" => Token::Unit(KiloCalorie), "kcal" | "kilocalorie" | "kilocalories" => Token::Unit(KiloCalorie),
"btu" => Token::Unit(BritishThermalUnit), "btu" => Token::Unit(BritishThermalUnit),
"british" => { "british" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"thermal" => { "thermal" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"unit" | "units" => Token::Unit(BritishThermalUnit), "unit" | "units" => Token::Unit(BritishThermalUnit),
string => return Err(format!("Invalid string: {}", string)), string => return Err(format!("Invalid string: {}", string)),
} }
@ -391,39 +452,47 @@ pub fn parse_word(tokens: &mut Vec<Token>, chars: &mut Peekable<Graphemes>, defa
"mhp" | "hpm" => Token::Unit(MetricHorsepower), "mhp" | "hpm" => Token::Unit(MetricHorsepower),
"watt" => { "watt" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"hr" | "hrs" | "hour" | "hours" => Token::Unit(WattHour), "hr" | "hrs" | "hour" | "hours" => Token::Unit(WattHour),
_ => Token::Unit(Watt) _ => Token::Unit(Watt)
} }
} }
"kilowatt" => { "kilowatt" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"hr" | "hrs" | "hour" | "hours" => Token::Unit(KilowattHour), "hr" | "hrs" | "hour" | "hours" => Token::Unit(KilowattHour),
_ => Token::Unit(Kilowatt), _ => Token::Unit(Kilowatt),
} }
} }
"megawatt" => { "megawatt" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"hr" | "hrs" | "hour" | "hours" => Token::Unit(MegawattHour), "hr" | "hrs" | "hour" | "hours" => Token::Unit(MegawattHour),
_ => Token::Unit(Megawatt), _ => Token::Unit(Megawatt),
} }
} }
"gigawatt" => { "gigawatt" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"hr" | "hrs" | "hour" | "hours" => Token::Unit(GigawattHour), "hr" | "hrs" | "hour" | "hours" => Token::Unit(GigawattHour),
_ => Token::Unit(Gigawatt), _ => Token::Unit(Gigawatt),
} }
} }
"terawatt" => { "terawatt" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"hr" | "hrs" | "hour" | "hours" => Token::Unit(TerawattHour), "hr" | "hrs" | "hour" | "hours" => Token::Unit(TerawattHour),
_ => Token::Unit(Terawatt), other => {
lexer.tokens.push(Token::Unit(Watt));
parse_token(other, lexer)?;
return Ok(());
}
} }
} }
"petawatt" => { "petawatt" => {
match read_word(chars).as_str() { match read_word("", lexer).as_str() {
"hr" | "hrs" | "hour" | "hours" => Token::Unit(PetawattHour), "hr" | "hrs" | "hour" | "hours" => Token::Unit(PetawattHour),
_ => Token::Unit(Petawatt), other => {
lexer.tokens.push(Token::Unit(Watt));
parse_token(other, lexer)?;
return Ok(());
}
} }
} }
@ -471,16 +540,24 @@ pub fn parse_word(tokens: &mut Vec<Token>, chars: &mut Peekable<Graphemes>, defa
"k" | "kelvin" | "kelvins" => Token::Unit(Kelvin), "k" | "kelvin" | "kelvins" => Token::Unit(Kelvin),
"c" | "celsius" => Token::Unit(Celsius), "c" | "celsius" => Token::Unit(Celsius),
"f" | "fahrenheit" | "fahrenheits" => Token::Unit(Fahrenheit), "f" | "fahrenheit" | "fahrenheits" => Token::Unit(Fahrenheit),
"deg" | "degree" | "degrees" => Token::Unit(default_degree), "deg" | "degree" | "degrees" => Token::Unit(lexer.default_degree),
string => { string => {
return Err(format!("Invalid string: {}", string)); return Err(format!("Invalid string: {}", string));
} }
}; };
tokens.push(token); lexer.tokens.push(token);
return Ok(()); return Ok(());
} }
/// Mutable state threaded through the lexing functions
/// (`parse_token`, `parse_word`, `read_word`).
pub struct Lexer<'a> {
    // count of '(' graphemes; lex() compares the two counts to auto-insert
    // missing parens at the ends of the token stream
    left_paren_count: u16,
    // count of ')' graphemes
    right_paren_count: u16,
    // peekable grapheme-cluster iterator over the raw input string
    chars: Peekable<Graphemes<'a>>,
    // tokens produced so far
    tokens: Vec<Token>,
    // unit substituted for bare "deg"/"degree"/"degrees" words
    default_degree: Unit,
}
/// Lex an input string and returns [`Token`]s /// Lex an input string and returns [`Token`]s
pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) -> Result<Vec<Token>, String> { pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) -> Result<Vec<Token>, String> {
@ -497,85 +574,26 @@ pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) ->
} }
} }
let mut left_paren_count = 0; let mut lexer = Lexer {
let mut right_paren_count = 0; left_paren_count: 0,
right_paren_count: 0,
chars: UnicodeSegmentation::graphemes(input.as_str(), true).peekable(),
tokens: Vec::new(),
default_degree,
};
let mut chars = UnicodeSegmentation::graphemes(input.as_str(), true).peekable(); if let Some(c) = lexer.chars.next() {
let mut tokens: Vec<Token> = vec![]; parse_token(c, &mut lexer)?;
while let Some(_) = chars.peek() {
let current_char = chars.peek().unwrap();
println!("1: {}", current_char);
let token = match *current_char {
value if value.trim().is_empty() => {
chars.next();
continue;
},
value if is_alphabetic_extended_str(&value) => {
parse_word(&mut tokens, &mut chars, default_degree)?;
continue;
},
value if is_numeric_str(value) => {
let mut number_string = String::new();
while let Some(number_char) = chars.peek() {
if is_numeric_str(number_char) {
number_string += number_char;
chars.next();
} else {
break;
}
}
d128::set_status(decimal::Status::empty());
let token;
match d128::from_str(&number_string) {
Ok(number) => {
if d128::get_status().is_empty() {
token = Token::Number(number);
} else {
return Err(format!("Error lexing d128 number: {}", number_string));
}
},
Err(_e) => {
return Err(format!("Error lexing d128 number: {}", number_string));
}
};
token
},
"+" => Token::Operator(Plus),
"-" => Token::Operator(Minus),
"*" => Token::Operator(Multiply),
"/" => Token::Operator(Divide),
"%" => Token::LexerKeyword(PercentChar),
"^" => Token::Operator(Caret),
"!" => Token::UnaryOperator(Factorial),
"(" => {
left_paren_count += 1;
Token::Operator(LeftParen)
},
")" => {
right_paren_count += 1;
Token::Operator(RightParen)
},
"π" => Token::Constant(Pi),
"'" => Token::Unit(Foot),
"\"" | "" | "" | "" => Token::LexerKeyword(DoubleQuotes),
"Ω" | "" => Token::Unit(Ohm),
_ => {
return Err(format!("Invalid character: {}", current_char));
},
};
chars.next();
tokens.push(token);
} }
let tokens = &mut lexer.tokens;
// auto insert missing parentheses in first and last position // auto insert missing parentheses in first and last position
if left_paren_count > right_paren_count { if lexer.left_paren_count > lexer.right_paren_count {
let missing_right_parens = left_paren_count - right_paren_count; let missing_right_parens = lexer.left_paren_count - lexer.right_paren_count;
for _ in 0..missing_right_parens { for _ in 0..missing_right_parens {
tokens.push(Token::Operator(RightParen)); tokens.push(Token::Operator(RightParen));
} }
} else if left_paren_count < right_paren_count { } else if lexer.left_paren_count < lexer.right_paren_count {
let missing_left_parens = right_paren_count - left_paren_count; let missing_left_parens = lexer.right_paren_count - lexer.left_paren_count;
for _ in 0..missing_left_parens { for _ in 0..missing_left_parens {
tokens.insert(0, Token::Operator(LeftParen)); tokens.insert(0, Token::Operator(LeftParen));
} }
@ -719,7 +737,7 @@ pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) ->
} }
} }
Ok(tokens) Ok(lexer.tokens)
} }
#[cfg(test)] #[cfg(test)]