Lexer supports multi-byte utf8 characters and "pi", "π" and "e"

2019-12-09 03:36:02 +01:00 · 2019-12-09 03:36:02 +01:00 · 424e6988e9
commit 424e6988e9
parent 1fe9214a62
4 changed files with 55 additions and 9 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -29,6 +29,7 @@ version = "0.1.0"
 dependencies = [
 "decimal 2.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
 "nom 5.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unicode-segmentation 1.6.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

 [[package]]
@ -127,6 +128,11 @@ name = "static_assertions"
 version = "0.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"

+[[package]]
+name = "unicode-segmentation"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
 [[package]]
 name = "version_check"
 version = "0.1.5"
@ -151,4 +157,5 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
 "checksum serde 1.0.103 (registry+https://github.com/rust-lang/crates.io-index)" = "1217f97ab8e8904b57dd22eb61cde455fa7446a9c1cf43966066da047c1f3702"
 "checksum static_assertions 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7f3eb36b47e512f8f1c9e3d10c2c1965bc992bd9cdb024fa581e2194501c83d3"
+"checksum unicode-segmentation 1.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0"
 "checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -6,4 +6,5 @@ edition = "2018"

 [dependencies]
 nom = "5.0"
+unicode-segmentation = "1.6.0"
 decimal = "2.0.4"
--- a/src/lexer.rs
+++ b/src/lexer.rs
@ -1,13 +1,16 @@
 use std::str::FromStr;
 use decimal::d128;
-use crate::{Token, TokenVector, Operator::*};
+use crate::{Token, TokenVector};
+use crate::Operator::{Caret, Divide, Factorial, LeftParen, Minus, Multiply, PercentOrModulo, Plus, RightParen};
+use crate::Constant::{Pi, EulersNumber};

 pub fn lex(input: &str) -> Result<TokenVector, String> {

  let mut chars = input.chars().enumerate().peekable();
  let mut tokens: TokenVector = vec![];
  
-  while let Some((index, current_char)) = chars.next() {
+  let mut byte_index = 0;
+  while let Some((_index, current_char)) = chars.next() {
    match current_char {
      '+' => tokens.push(Token::Operator(Plus)),
      '-' => tokens.push(Token::Operator(Minus)),
@ -18,16 +21,37 @@ pub fn lex(input: &str) -> Result<TokenVector, String> {
      '!' => tokens.push(Token::Operator(Factorial)),
      '(' => tokens.push(Token::Operator(LeftParen)),
      ')' => tokens.push(Token::Operator(RightParen)),
+      'π' => tokens.push(Token::Constant(Pi)),
      ',' => continue,
      value if value.is_whitespace() => continue,
      value if value.is_alphabetic() => {

+        let start_index = byte_index;
+        let mut end_index = byte_index;
+        while let Some((_index, current_char)) = chars.peek() {
+          if current_char.is_alphabetic() {
+            chars.next();
+            end_index += 1;
+          } else {
+            break;
+          }
+        }
+
+        let string = &input[start_index..=end_index];
+        match string {
+          "pi" => tokens.push(Token::Constant(Pi)),
+          "e" => tokens.push(Token::Constant(EulersNumber)),
+          _ => {
+            return Err(format!("Invalid string: {}", string))
+          }
+        }
+        
      },
      '.' | '0'..='9' => {

-        let start_index = index;
-        let mut end_index = index+1;
-        while let Some((_idk, current_char)) = chars.peek() {
+        let start_index = byte_index;
+        let mut end_index = byte_index;
+        while let Some((_index, current_char)) = chars.peek() {
          if current_char == &'.' || current_char.is_digit(10) {
            chars.next();
            end_index += 1;
@ -36,24 +60,30 @@ pub fn lex(input: &str) -> Result<TokenVector, String> {
          }
        }
        
-        match d128::from_str(&input[start_index..end_index]) {
+        let number_string = &input[start_index..=end_index];
+        match d128::from_str(number_string) {
          Ok(number) => {
            if d128::get_status().is_empty() {
              tokens.push(Token::Number(number));
            } else {
-              return Err(format!("Error parsing d128 number: {}", &input[start_index..end_index]));
+              return Err(format!("Error parsing d128 number: {}", number_string));
            }
          },
          Err(_e) => {
-            return Err("Error parsing d128 number (This should not happen because d128 does not throw errors)".to_owned());
+            return Err(format!("Error parsing d128 number: {}", number_string));
          }
        };

      },
      _ => {
-        return Err(format!("Unknown character: {}", current_char));
+        return Err(format!("Invalid character: {}", current_char));
      },
    }
+    // The π character, for example, is more than one byte, so in that case
+    // byte_index needs to be incremented by 2. This is because we're slicing
+    // strings to get digits/words, and Rust slices by byte offset, not by
+    // grapheme (aka "user-perceived character").
+    byte_index += current_char.len_utf8();
  };
  return Ok(tokens)
 }
--- a/src/main.rs
+++ b/src/main.rs
@ -14,10 +14,18 @@ pub enum Operator {
  RightParen,
 }

+#[derive(Debug)]
+pub enum Constant {
+  Pi,
+  EulersNumber,
+}
+
 #[derive(Debug)]
 pub enum Token {
  Operator(Operator),
  Number(d128),
+  Function(String),
+  Constant(Constant),
 }

 pub type TokenVector = Vec<Token>;