sts10 · sts10 · Oct 27, 2023 · Oct 26, 2023 · Oct 26, 2023 · Oct 26, 2023
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "phraze"
 description = "Passphrase generator"
-version = "0.2.1"
+version = "0.3.0"
 edition = "2021"
 license = "MPL-2.0"
 readme = "readme.markdown"
@@ -10,6 +10,7 @@ authors = ["sts10 <sschlinkert@gmail.com>"]
 [dependencies]
 rand = "0.8.5"
 clap = { version = "4.4.7", features = ["derive"] }
+unicode-normalization = "0.1.22"
 
 [dev-dependencies]
 criterion = "0.5.1"

diff --git a/src/lib.rs b/src/lib.rs
@@ -1,3 +1,4 @@
+pub mod unicode_normalization_check;
 use rand::{seq::SliceRandom, thread_rng, Rng};
 
 // Pull in the wordlists as constants for us to use later.
@@ -78,11 +79,14 @@ pub fn fetch_list(list_choice: List) -> &'static [&'static str] {
 }
 
 /// Actually generate the passphrase, given a couple neccessary parameters.
-pub fn generate_passphrase(
+/// This function uses some Rust magic to be able to accept a word list as
+/// either a &[&str] (built-in word lists) or as a &[String] if user provides a file
+/// as word list.
+pub fn generate_passphrase<T: AsRef<str> + std::fmt::Display>(
     number_of_words_to_put_in_passphrase: usize,
     separator: &str,
     title_case: bool,
-    list: &'static [&'static str],
+    list: &[T], // Either type!
 ) -> String {
     let mut rng = thread_rng();
     // Create a blank String to put words into to create our passphrase
@@ -139,7 +143,10 @@ fn get_random_number(rng: &mut impl Rng) -> String {
 
 /// Give an array of words, pick a random element and make it a String for
 /// simplicity's sake.
-fn get_random_element(rng: &mut impl Rng, word_list: &[&str]) -> String {
+fn get_random_element<T: AsRef<str>>(rng: &mut impl Rng, word_list: &[T]) -> String
+where
+    T: std::fmt::Display,
+{
     match word_list.choose(rng) {
         Some(word) => word.to_string(),
         None => panic!("Couldn't pick a random word"),

diff --git a/src/main.rs b/src/main.rs
@@ -1,5 +1,13 @@
+use crate::unicode_normalization_check::uniform_unicode_normalization;
 use clap::Parser;
 use phraze::*;
+use std::fs::File;
+use std::io;
+use std::io::BufRead;
+use std::io::BufReader;
+use std::path::Path;
+use std::path::PathBuf;
+use std::str::FromStr;
 
 /// Generate random passphrases
 #[derive(Parser, Debug)]
@@ -69,6 +77,12 @@ struct Args {
     #[clap(short = 'l', long = "list", value_parser=parse_list_choice, default_value="m")]
     list_choice: List,
 
+    /// Provide a text file with a list of words to randomly generate passphrase from.
+    ///
+    /// Should be a text file with one per line.
+    #[clap(short = 'c', long = "custom-list", conflicts_with = "list_choice")]
+    custom_list_file_path: Option<PathBuf>,
+
     /// Use Title Case for words in generated usernames
     #[clap(short = 't', long = "title-case")]
     title_case: bool,
@@ -81,16 +95,35 @@ struct Args {
 fn main() {
     let opt = Args::parse();
 
-    // Fetch requested word list
-    let list = fetch_list(opt.list_choice);
+    if opt.custom_list_file_path.is_some() && opt.separator.is_empty() && !opt.title_case {
+        panic!("Must use a separator or title case when using a custom word list");
+    }
+
+    // We need two different variables here, one for a user-inputted list and another for
+    // the built-in list (whether chosen or the default). This is because we use different
+    // variable types for each case.
+    let (custom_list, built_in_list) = match opt.custom_list_file_path {
+        Some(custom_list_file_path) => (Some(read_in_custom_list(&custom_list_file_path)), None),
+        None => (None, Some(fetch_list(opt.list_choice))),
+    };
+
+    // If a "custom_list" was given by the user, we're going to use that list.
+    // Otherwise we use the built-in list (a default list if the user didn't choose one).
+
+    // To get the length of the list we're going to use, we need to check if a
+    // custom_list was given.
+    let list_length = match custom_list {
+        Some(ref custom_list) => custom_list.len(),
+        None => built_in_list.unwrap().len(), // pretty sure we're safe to unwrap here...
+    };
 
     // Since user can define a minimum entropy, we might have to do a little math to
     // figure out how many words we need to include in this passphrase.
     let number_of_words_to_put_in_passphrase = calculate_number_words_needed(
         opt.number_of_words,
         opt.minimum_entropy,
         opt.strength_count,
-        list.len(),
+        list_length,
     );
 
     // If user enabled verbose option
@@ -99,27 +132,38 @@ fn main() {
         // to the terminal
         print_entropy(
             number_of_words_to_put_in_passphrase,
-            list.len(),
+            list_length,
             opt.n_passphrases,
         );
     }
 
+    // Now we can (finally) generate and print some number of passphrases
     for _ in 0..opt.n_passphrases {
-        // Generate and print passphrase
-        println!(
-            "{}",
-            generate_passphrase(
+        // Again, we have more code than we should because of this pesky list type situation...
+        let passphrase = match (&custom_list, built_in_list) {
+            (Some(ref custom_list), _) => generate_passphrase(
                 number_of_words_to_put_in_passphrase,
                 &opt.separator,
                 opt.title_case,
-                list,
-            )
-        );
+                custom_list,
+            ),
+            (None, Some(built_in_list)) => generate_passphrase(
+                number_of_words_to_put_in_passphrase,
+                &opt.separator,
+                opt.title_case,
+                built_in_list,
+            ),
+            (None, None) => panic!("List selection error!"),
+        };
+        println!("{}", passphrase);
     }
 }
 
+/// Print the calculated (estimated) entropy of a passphrase, based on three variables
 fn print_entropy(number_of_words: usize, list_length: usize, n_passphrases: usize) {
     let passphrase_entropy = (list_length as f64).log2() * number_of_words as f64;
+    // Depending on how many different passphrases the user wants printed, change the printed text
+    // accordingly
     if n_passphrases == 1 {
         eprintln!(
             "Passphrase has an estimated {:.2} bits of entropy.",
@@ -149,3 +193,48 @@ fn parse_list_choice(list_choice: &str) -> Result<List, String> {
         )),
     }
 }
+
+/// Read a word list file into a sorted, de-duplicated Vec<String>. Trims whitespace, skips
+/// blank lines, and warns on stderr if the words do not share one Unicode normalization.
+/// Panics if the file cannot be read or if it contains no words.
+fn read_in_custom_list(file_path: &Path) -> Vec<String> {
+    let file_input: Vec<String> = match read_by_line(file_path.to_path_buf()) {
+        Ok(r) => r,
+        Err(e) => panic!("Error reading word list file: {}", e),
+    };
+    let mut word_list: Vec<String> = file_input
+        .iter()
+        .map(|line| line.trim().to_string())
+        .filter(|word| !word.is_empty())
+        .collect();
+    // Remove any duplicate words, since duplicate words would undermine entropy estimates.
+    word_list.sort();
+    word_list.dedup();
+    // No words means no passphrase can be built, so fail here with a clear message.
+    assert!(!word_list.is_empty(), "Custom word list contains no words");
+    if !uniform_unicode_normalization(&word_list) {
+        eprintln!("WARNING: Custom word list has multiple Unicode normalizations. Consider normalizing the Unicode of all words on the list before making a passphrase.");
+    }
+    word_list
+}
+
+/// Generic function that reads a file in, line by line, parsing each line into a `T`.
+/// Returns an `io::Error` if the file can't be opened or read, or (with kind `InvalidData`)
+/// if a line can't be parsed.
+fn read_by_line<T: FromStr>(file_path: PathBuf) -> io::Result<Vec<T>>
+where
+    <T as std::str::FromStr>::Err: std::fmt::Debug,
+{
+    // `?` hands any error from opening the file straight back to the caller.
+    let file = BufReader::new(File::open(file_path)?);
+    let mut vec = Vec::new();
+    for line in file.lines() {
+        // Report a bad line through the Result, like every other failure in this function.
+        let parsed = line?.parse::<T>().map_err(|e| {
+            let message = format!("Error parsing line from file: {:?}", e);
+            io::Error::new(io::ErrorKind::InvalidData, message)
+        })?;
+        vec.push(parsed);
+    }
+    Ok(vec)
+}
diff --git a/src/unicode_normalization_check.rs b/src/unicode_normalization_check.rs
@@ -0,0 +1,59 @@
+use std::collections::HashSet;
+use unicode_normalization::is_nfc_quick;
+use unicode_normalization::is_nfd_quick;
+use unicode_normalization::is_nfkc_quick;
+use unicode_normalization::is_nfkd_quick;
+use unicode_normalization::IsNormalized;
+
+/// Given a slice of Strings, this function attempts to detect the Unicode normalization used
+/// in each String.
+/// There are 4 different Unicode normalizations: NFC, NFD, NFKC, NFKD. Whichever one a list
+/// uses isn't a concern. What IS a concern is if one list uses MORE THAN ONE normalization.
+/// Thus, this function tracks which DIFFERENT normalizations it finds. If it finds more than 1
+/// type, it returns false, since the list does not have what I call "uniform Unicode
+/// normalization." Elsewhere, we warn the user about this. An empty list counts as uniform; a
+/// non-empty list where no word gets a definite Yes from any quick check returns false.
+pub fn uniform_unicode_normalization(list: &[String]) -> bool {
+    let mut types_of_normalizations_discovered = HashSet::new();
+    for word in list {
+        if is_nfc_quick(word.chars()) == IsNormalized::Yes {
+            types_of_normalizations_discovered.insert("NFC");
+        } else if is_nfd_quick(word.chars()) == IsNormalized::Yes {
+            types_of_normalizations_discovered.insert("NFD");
+        } else if is_nfkc_quick(word.chars()) == IsNormalized::Yes {
+            types_of_normalizations_discovered.insert("NFKC");
+        } else if is_nfkd_quick(word.chars()) == IsNormalized::Yes {
+            types_of_normalizations_discovered.insert("NFKD");
+        }
+        // More than 1 normalization found: skip the rest of the list and return false.
+        if types_of_normalizations_discovered.len() > 1 {
+            return false;
+        }
+    }
+    list.is_empty() || types_of_normalizations_discovered.len() == 1
+}
+
+#[test]
+fn can_detect_non_uniform_unicode_normalization_in_a_given_list() {
+    let version1 = "s\u{e9}cr\u{e9}ter"; // precomposed U+00E9 (composed form)
+    let version2 = "se\u{301}cre\u{301}ter"; // 'e' + combining U+0301 (decomposed form)
+    let non_uniform_list = vec![version1.to_string(), version2.to_string()];
+    assert!(!uniform_unicode_normalization(&non_uniform_list));
+    // The escapes above keep the two forms distinct even if an editor normalizes this file.
+    let uniform_list = vec![
+        "alpha".to_string(),
+        "beta".to_string(),
+        "charlie".to_string(),
+    ];
+    assert!(uniform_unicode_normalization(&uniform_list));
+
+    let uniform_list2 = vec![
+        "alpha".to_string(),
+        "beta".to_string(),
+        version1.to_string(), // add one word with an accented character
+        "charlie".to_string(),
+        version1.to_string(), // twice
+    ];
+    // Should still be detected as uniform
+    assert!(uniform_unicode_normalization(&uniform_list2));
+}