From 50cac79d7a186a97b04966ab4c5b368362e464a7 Mon Sep 17 00:00:00 2001 From: Kasper Date: Sat, 21 Nov 2020 02:10:20 +0100 Subject: [PATCH] =?UTF-8?q?Fixed=20lexing=20of=20"=C2=B5s"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/lexer.rs | 21 +++++++++++++-------- src/units.rs | 2 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index adf480f..e8a5975 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -11,6 +11,13 @@ use crate::FunctionIdentifier::{Cbrt, Ceil, Cos, Exp, Abs, Floor, Ln, Log, Round use crate::units::Unit; use crate::units::Unit::*; +pub const fn is_alphabetic_extended(input: &char) -> bool { + match input { + 'A'..='Z' | 'a'..='z' | 'Ω' | 'µ' | 'μ' | 'π' => true, + _ => false, + } +} + /// Lex an input string and return a [`TokenVector`](../type.TokenVector.html) pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) -> Result<TokenVector, String> { @@ -55,9 +62,10 @@ pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) -> '"' | '“' | '”' | '″' => tokens.push(Token::LexerKeyword(DoubleQuotes)), value if value.is_whitespace() => {}, 'Ω' => tokens.push(Token::Unit(Ohm)), - value if value.is_ascii_alphabetic() => { + value if is_alphabetic_extended(&value) => { let start_index = byte_index; - let mut end_index = byte_index; + // account for chars longer than one byte + let mut end_index = byte_index + current_char.len_utf8() - 1; while let Some(current_char) = chars.peek() { // don't loop more than max_word_length: if end_index >= start_index + max_word_length - 1 { @@ -65,11 +73,7 @@ pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) -> return Err(format!("Invalid string starting with: {}", string)); } - if current_char.is_ascii_alphabetic() { - byte_index += current_char.len_utf8(); - end_index += 1; - chars.next(); - } else if current_char == &'Ω' { + if is_alphabetic_extended(&current_char) { byte_index += 
current_char.len_utf8(); end_index += current_char.len_utf8(); chars.next(); @@ -185,7 +189,8 @@ pub fn lex(input: &str, allow_trailing_operators: bool, default_degree: Unit) -> "hg" => tokens.push(Token::LexerKeyword(Hg)), // can be hectogram or mercury "ns" | "nanosec" | "nanosecs" | "nanosecond" | "nanoseconds" => tokens.push(Token::Unit(Nanosecond)), - "μs" | "microsec" | "microsecs" | "microsecond" | "microseconds" => tokens.push(Token::Unit(Microsecond)), + // µ and μ are two different characters + "µs" | "μs" | "microsec" | "microsecs" | "microsecond" | "microseconds" => tokens.push(Token::Unit(Microsecond)), "ms" | "millisec" | "millisecs" | "millisecond" | "milliseconds" => tokens.push(Token::Unit(Millisecond)), "s" | "sec" | "secs" | "second" | "seconds" => tokens.push(Token::Unit(Second)), "min" | "mins" | "minute" | "minutes" => tokens.push(Token::Unit(Minute)), diff --git a/src/units.rs b/src/units.rs index bb03fc7..48a3ef6 100644 --- a/src/units.rs +++ b/src/units.rs @@ -679,7 +679,7 @@ mod tests { assert_eq!(convert_test(1000.0, Milliampere, Ampere), 1.0); assert_eq!(convert_test(1000.0, Ampere, Kiloampere), 1.0); - assert_eq!(convert_test(10.0, Ampere, Biot), 1.0); + assert_eq!(convert_test(10.0, Ampere, Abampere), 1.0); assert_eq!(convert_test(1000.0, Pascal, Kilopascal), 1.0); assert_eq!(convert_test(101325.0, Pascal, Atmosphere), 1.0);