From 50cac79d7a186a97b04966ab4c5b368362e464a7 Mon Sep 17 00:00:00 2001 From: Kasper Date: Sat, 21 Nov 2020 02:10:20 +0100 Subject: [PATCH] =?UTF-8?q?Fixed=20lexing=20of=20"=C2=B5s"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lexer.rs | 21 +++++++++++++-------- src/units.rs | 2 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index adf480f..e8a5975 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -11,6 +11,13 @@ use crate::FunctionIdentifier::{Cbrt, Ceil, Cos, Exp, Abs, Floor, Ln, Log, Round use crate::units::Unit; use crate::units::Unit::*; +pub const fn is_alphabetic_extended(input: &char) -> bool { + match input { + 'A'..='Z' | 'a'..='z' | 'Ω' | 'µ' | 'μ' | 'π' => true, + _ => false, + } +} + /// Lex an input string and return a [`TokenVector`](../type.TokenVector.html) pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) -> Result<TokenVector, String> { @@ -55,9 +62,10 @@ pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) -> '"' | '“' | '”' | '″' => tokens.push(Token::LexerKeyword(DoubleQuotes)), value if value.is_whitespace() => {}, 'Ω' => tokens.push(Token::Unit(Ohm)), - value if value.is_ascii_alphabetic() => { + value if is_alphabetic_extended(&value) => { let start_index = byte_index; - let mut end_index = byte_index; + // account for chars longer than one byte + let mut end_index = byte_index + current_char.len_utf8() - 1; while let Some(current_char) = chars.peek() { // don't loop more than max_word_length: if end_index >= start_index + max_word_length - 1 { @@ -65,11 +73,7 @@ pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) -> return Err(format!("Invalid string starting with: {}", string)); } - if current_char.is_ascii_alphabetic() { - byte_index += current_char.len_utf8(); - end_index += 1; - chars.next(); - } else if current_char == &'Ω' { + if is_alphabetic_extended(&current_char) { byte_index += 
current_char.len_utf8(); end_index += current_char.len_utf8(); chars.next(); @@ -185,7 +189,8 @@ pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) -> "hg" => tokens.push(Token::LexerKeyword(Hg)), // can be hectogram or mercury "ns" | "nanosec" | "nanosecs" | "nanosecond" | "nanoseconds" => tokens.push(Token::Unit(Nanosecond)), - "μs" | "microsec" | "microsecs" | "microsecond" | "microseconds" => tokens.push(Token::Unit(Microsecond)), + // µ and μ are two different characters + "µs" | "μs" | "microsec" | "microsecs" | "microsecond" | "microseconds" => tokens.push(Token::Unit(Microsecond)), "ms" | "millisec" | "millisecs" | "millisecond" | "milliseconds" => tokens.push(Token::Unit(Millisecond)), "s" | "sec" | "secs" | "second" | "seconds" => tokens.push(Token::Unit(Second)), "min" | "mins" | "minute" | "minutes" => tokens.push(Token::Unit(Minute)), diff --git a/src/units.rs b/src/units.rs index bb03fc7..48a3ef6 100644 --- a/src/units.rs +++ b/src/units.rs @@ -679,7 +679,7 @@ mod tests { assert_eq!(convert_test(1000.0, Milliampere, Ampere), 1.0); assert_eq!(convert_test(1000.0, Ampere, Kiloampere), 1.0); - assert_eq!(convert_test(10.0, Ampere, Biot), 1.0); + assert_eq!(convert_test(10.0, Ampere, Abampere), 1.0); assert_eq!(convert_test(1000.0, Pascal, Kilopascal), 1.0); assert_eq!(convert_test(101325.0, Pascal, Atmosphere), 1.0);