chronotope · PhilipDaniels · Jul 24, 2020 · Jul 27, 2020 · Jul 27, 2020 · Jul 27, 2020
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -42,6 +42,12 @@ jobs:
       run: cargo test --color=always -- --color=always
     - name: Run tests serde
       run: cargo test --features serde --color=always -- --color=always
+    - name: Run regex tests
+      working-directory: ./tests/check-regex-filtering
+      env:
+        CHRONO_TZ_BUILD_TIMEZONES: (Europe/London|GMT)
+      run: cargo test --color=always -- --color=always
+
     - name: Check with no default features
       run: cargo check --no-default-features --color=always
 

diff --git a/Cargo.toml b/Cargo.toml
@@ -20,6 +20,7 @@ std = []
 
 [build-dependencies]
 parse-zoneinfo = { version = "0.3" }
+regex = { default-features = false, version = "1"  }
 
 [dev-dependencies]
 serde_test = "1"

diff --git a/README.md b/README.md
@@ -160,6 +160,25 @@ lto = true
 Otherwise, the additional binary size added by this library may overflow
 available program space and trigger a linker error.
 
+## Limiting the Timezone Table to Zones of Interest
+
+By default, `chrono-tz` generates timezones for all entries in the
+[IANA database](http://www.iana.org/time-zones). If you are interested
+in only a few timezones, you can use an environment variable to
+select them. The environment variable is called `CHRONO_TZ_BUILD_TIMEZONES`
+and its value is a regular expression. It should be specified in your top-level build:
+
+```sh
+CHRONO_TZ_BUILD_TIMEZONES="(Europe/London|US/.*)" cargo build
+```
+
+This can significantly reduce the size of the generated database, depending
+on how many timezones you are interested in. Wikipedia has an [article
+listing the timezone names](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones).
+
+The filtering applied is liberal: if you use a pattern such as `US/.*`, then `chrono-tz` will
+also include all the zones that are linked to the matches, such as `America/Denver`, not just `US/Mountain`.
+
 ## Future Improvements
 
 - Handle leap seconds

diff --git a/build.rs b/build.rs
@@ -1,6 +1,7 @@
 extern crate parse_zoneinfo;
+extern crate regex;
 
-use std::collections::BTreeSet;
+use std::collections::{BTreeSet, HashSet};
 use std::env;
 use std::fs::File;
 use std::io::{self, BufRead, BufReader, Write};
@@ -12,6 +13,8 @@ use parse_zoneinfo::table::{Table, TableBuilder};
 use parse_zoneinfo::transitions::FixedTimespan;
 use parse_zoneinfo::transitions::TableTransitions;
 
+use regex::Regex;
+
 // This function is needed until zoneinfo_parse handles comments correctly.
 // Technically a '#' symbol could occur between double quotes and should be
 // ignored in this case, however this never happens in the tz database as it
@@ -274,7 +277,101 @@ fn write_directory_file(directory_file: &mut File, table: &Table) -> io::Result<
     Ok(())
 }
 
+/// The name of the environment variable which possibly holds the filter regex.
+const FILTER_ENV_VAR_NAME: &str = "CHRONO_TZ_BUILD_TIMEZONES";
+
+/// Checks the CHRONO_TZ_BUILD_TIMEZONES environment variable.
+/// Converts it to a regex if set. Panics if the regex is not valid, as we want
+/// to fail the build if that happens.
+fn get_filter_regex() -> Option<Regex> {
+    match env::var(FILTER_ENV_VAR_NAME) {
+        Ok(val) => {
+            let val = val.trim();
+            if val.is_empty() {
+                return None;
+            }
+            match Regex::new(val) {
+                Ok(regex) => Some(regex),
+                Err(err) => panic!(
+                    "The value '{:?}' for environment variable {} is not a valid regex, err={}",
+                    val, FILTER_ENV_VAR_NAME, err
+                ),
+            }
+        }
+        Err(env::VarError::NotPresent) => None,
+        Err(env::VarError::NotUnicode(s)) => panic!(
+            "The value '{:?}' for environment variable {} is not valid Unicode",
+            s, FILTER_ENV_VAR_NAME
+        ),
+    }
+}
+
+/// Insert a new name into the set of names to keep. If the name has exactly 3
+/// parts, then also insert the 2-part prefix. If we don't do this we will lose
+/// half of Indiana in `directory.rs`. But we *don't* want to keep one-part names,
+/// otherwise we will inevitably end up with 'America' and include too much as
+/// a consequence.
+fn insert_keep_entry(keep: &mut HashSet<String>, new_value: &str) {
+    let mut parts = new_value.split('/');
+    if let (Some(p1), Some(p2), Some(_), None) =
+        (parts.next(), parts.next(), parts.next(), parts.next())
+    {
+        keep.insert(format!("{}/{}", p1, p2));
+    }
+
+    keep.insert(new_value.to_string());
+}
+
+/// Filter `table` by applying `filter_regex`.
+fn filter_timezone_table(table: &mut Table, filter_regex: Regex) {
+    // Compute the transitive closure of things to keep.
+    // Doing this, instead of just filtering `zonesets` and `links` by the
+    // regex, helps to keep the `structure()` intact.
+    let mut keep = HashSet::new();
+    for (k, v) in &table.links {
+        if filter_regex.is_match(k) {
+            insert_keep_entry(&mut keep, k);
+        }
+        if filter_regex.is_match(v) {
+            insert_keep_entry(&mut keep, v);
+        }
+    }
+
+    let mut n = 0;
+    loop {
+        let len = keep.len();
+
+        for (k, v) in &table.links {
+            if keep.contains(k) && !keep.contains(v) {
+                insert_keep_entry(&mut keep, v);
+            }
+            if keep.contains(v) && !keep.contains(k) {
+                insert_keep_entry(&mut keep, k);
+            }
+        }
+
+        if keep.len() == len {
+            break;
+        }
+
+        n += 1;
+        if n == 50 {
+            println!("cargo:warning=Recursion limit reached while building filter list");
+            break;
+        }
+    }
+
+    // Actually do the filtering.
+    table.links.retain(|k, v| keep.contains(k) || keep.contains(v));
+
+    table
+        .zonesets
+        .retain(|k, _| filter_regex.is_match(&k) || keep.iter().any(|s| k.starts_with(s)));
+}
+
 fn main() {
+    println!("cargo:rerun-if-env-changed={}", FILTER_ENV_VAR_NAME);
+
     let parser = LineParser::new();
     let mut table = TableBuilder::new();
 
@@ -310,10 +407,16 @@ fn main() {
             Line::Space => {}
         }
     }
-    let table = table.build();
+
+    let mut table = table.build();
+    if let Some(filter_regex) = get_filter_regex() {
+        filter_timezone_table(&mut table, filter_regex);
+    }
+
     let timezone_path = Path::new(&env::var("OUT_DIR").unwrap()).join("timezones.rs");
     let mut timezone_file = File::create(&timezone_path).unwrap();
     write_timezone_file(&mut timezone_file, &table).unwrap();
+
     let directory_path = Path::new(&env::var("OUT_DIR").unwrap()).join("directory.rs");
     let mut directory_file = File::create(&directory_path).unwrap();
     write_directory_file(&mut directory_file, &table).unwrap();

diff --git a/tests/check-regex-filtering/Cargo.toml b/tests/check-regex-filtering/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "check-regex-filtering"
+version = "0.1.0"
+authors = ["Philip Daniels <Philip.Daniels1971@gmail.com>"]
+edition = "2018"
+
+[dependencies]
+chrono = "0.4"
+chrono-tz = { path = "../../", default-features = false }
diff --git a/tests/check-regex-filtering/src/lib.rs b/tests/check-regex-filtering/src/lib.rs
@@ -0,0 +1,67 @@
+/// This test is compiled by the Github workflows with the
+/// filter regex set thusly: CHRONO_TZ_BUILD_TIMEZONES="(Europe/London|GMT)"
+///
+/// We use it to check two things:
+/// 1) That the compiled chrono-tz contains the correct timezones (a compilation
+///    failure will result if it doesn't).
+/// 2) That the compiled chrono-tz DOES NOT contain other, non-matched,
+///    timezones. This is rather trickier to do without triggering a compilation
+///    failure: we try our best by looking over the TZ_VARIANTS array to try and
+///    ascertain if it contains anything obviously wrong.
+
+#[cfg(test)]
+mod tests {
+    use chrono::offset::TimeZone;
+    use chrono_tz::{Europe, Europe::London, Tz, TZ_VARIANTS};
+    use std::str::FromStr;
+
+    #[test]
+    fn london_compiles() {
+        // This line will be a compilation failure if the code generation
+        // mistakenly excluded Europe::London.
+        let _london_time = London.ymd(2013, 12, 25).and_hms(14, 0, 0);
+        assert_eq!("Europe/London", London.name());
+
+        // Since London is included, converting from the corresponding
+        // string representation should also work.
+        assert_eq!(Tz::from_str("Europe/London"), Ok(London));
+
+        // We did not explicitly ask for Isle Of Man or Belfast in our regex, but there is a link
+        // from Europe::London to Isle_of_Man and Belfast (amongst others)
+        // so these conversions should also work.
+        assert_eq!(Tz::from_str("Europe/Isle_of_Man"), Ok(Europe::Isle_of_Man));
+        assert_eq!(Tz::from_str("Europe/Belfast"), Ok(Europe::Belfast));
+    }
+
+    #[test]
+    fn excluded_things_are_missing() {
+        // Timezones from outside Europe should not be included.
+        // We can't test all possible strings, here we just handle a
+        // representative set.
+        assert!(Tz::from_str("Australia/Melbourne").is_err());
+        assert!(Tz::from_str("Indian/Maldives").is_err());
+        assert!(Tz::from_str("Mexico/BajaSur").is_err());
+        assert!(Tz::from_str("Pacific/Kwajalein").is_err());
+        assert!(Tz::from_str("US/Central").is_err());
+
+        // The link table caused us to include some extra items from the UK (see
+        // `london_compiles()`), but it should NOT include various other timezones
+        // from around Europe since there is no linkage between them.
+        assert!(Tz::from_str("Europe/Brussels").is_err());
+        assert!(Tz::from_str("Europe/Dublin").is_err());
+        assert!(Tz::from_str("Europe/Warsaw").is_err());
+
+        // Also, entire continents outside Europe should be excluded.
+        for tz in TZ_VARIANTS.iter() {
+            assert!(!tz.name().starts_with("Africa"));
+            assert!(!tz.name().starts_with("Asia"));
+            assert!(!tz.name().starts_with("Australia"));
+            assert!(!tz.name().starts_with("Canada"));
+            assert!(!tz.name().starts_with("Chile"));
+            assert!(!tz.name().starts_with("Indian"));
+            assert!(!tz.name().starts_with("Mexico"));
+            assert!(!tz.name().starts_with("Pacific"));
+            assert!(!tz.name().starts_with("US"));
+        }
+    }
+}