chronotope · PhilipDaniels · Jul 24, 2020 · Jul 27, 2020 · Jul 27, 2020 · Jul 27, 2020
diff --git a/Cargo.toml b/Cargo.toml
@@ -20,6 +20,7 @@ std = []
 
 [build-dependencies]
 parse-zoneinfo = { version = "0.3" }
+regex = "1"
 
 [dev-dependencies]
 serde_test = "1"

diff --git a/README.md b/README.md
@@ -160,6 +160,25 @@ lto = true
 Otherwise, the additional binary size added by this library may overflow
 available program space and trigger a linker error.
 
+## Limiting the Timezone Table to Zones of Interest
+
+`Chrono-tz` by default generates timezones for all entries in the
+[IANA database](http://www.iana.org/time-zones). If you are interested
+in only a few timezones you can use an environment variable to
+select them. The environment variable is called CHRONO_TZ_BUILD_TIMEZONES
+and is a regular expression. It should be specified in your top-level build:
+
+```sh
+CHRONO_TZ_BUILD_TIMEZONES="(Europe/London|US/.*)" cargo build
+```
+
+This can significantly reduce the size of the generated database, depending
+on how many timezones you are interested in. Wikipedia has an [article
+listing the timezone names](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones).
+
+The filtering applied is liberal; if you use a pattern such as "US/.*" then `chrono-tz` will
+include all the zones that are linked, such as 'America/Denver', not just 'US/Mountain'.
+
 ## Future Improvements
 
 - Handle leap seconds

diff --git a/build.rs b/build.rs
@@ -1,6 +1,7 @@
 extern crate parse_zoneinfo;
+extern crate regex;
 
-use std::collections::BTreeSet;
+use std::collections::{BTreeSet, HashSet};
 use std::env;
 use std::fs::File;
 use std::io::{self, BufRead, BufReader, Write};
@@ -12,6 +13,8 @@ use parse_zoneinfo::table::{Table, TableBuilder};
 use parse_zoneinfo::transitions::FixedTimespan;
 use parse_zoneinfo::transitions::TableTransitions;
 
+use regex::Regex;
+
 // This function is needed until zoneinfo_parse handles comments correctly.
 // Technically a '#' symbol could occur between double quotes and should be
 // ignored in this case, however this never happens in the tz database as it
@@ -274,7 +277,101 @@ fn write_directory_file(directory_file: &mut File, table: &Table) -> io::Result<
     Ok(())
 }
 
+/// The name of the environment variable which possibly holds the filter regex.
+const FILTER_ENV_VAR_NAME: &str = "CHRONO_TZ_BUILD_TIMEZONES";
+
+/// Checks the CHRONO_TZ_BUILD_TIMEZONES environment variable.
+/// Converts it to a regex if set. Panics if the regex is not valid, as we want
+/// to fail the build if that happens.
+fn get_filter_regex() -> Option<Regex> {
+    match env::var(FILTER_ENV_VAR_NAME) {
+        Ok(val) => {
+            let val = val.trim();
+            if val.is_empty() {
+                return None;
+            }
+            match Regex::new(val) {
+                Ok(regex) => return Some(regex),
+                Err(err) => panic!(
+                    "The value '{:?}' for environment variable {} is not a valid regex, err={}",
+                    val, FILTER_ENV_VAR_NAME, err
+                ),
+            }
+        }
+        Err(env::VarError::NotPresent) => return None,
+        Err(env::VarError::NotUnicode(s)) => panic!(
+            "The value '{:?}' for environment variable {} is not valid Unicode",
+            s, FILTER_ENV_VAR_NAME
+        ),
+    }
+}
+
+/// Insert a new name in the list of names to keep. If the name has 3
+/// parts, then also insert the 2-part prefix. If we don't do this we will lose
+/// half of Indiana in `directory.rs`. But we *don't* want to keep one-part names,
+/// otherwise we will inevitably end up with 'America' and include too much as
+/// a consequence.
+fn insert_keep_entry(keep: &mut HashSet<String>, new_value: &str) {
+    let mut parts = new_value.split('/');
+    if let (Some(p1), Some(p2), Some(_), None) =
+        (parts.next(), parts.next(), parts.next(), parts.next())
+    {
+        keep.insert(format!("{}/{}", p1, p2));
+    }
+
+    keep.insert(new_value.to_string());
+}
+
+/// Filter `table` by applying `filter_regex`.
+fn filter_timezone_table(table: &mut Table, filter_regex: Regex) {
+    // Compute the transitive closure of things to keep.
+    // Doing this, instead of just filtering `zonesets` and `links` by the
+    // regiex, helps to keep the `structure()` intact.
+    let mut keep = HashSet::new();
+    for (k, v) in &table.links {
+        if filter_regex.is_match(k) {
+            insert_keep_entry(&mut keep, k);
+        }
+        if filter_regex.is_match(v) {
+            insert_keep_entry(&mut keep, v);
+        }
+    }
+
+    let mut n = 0;
+    loop {
+        let len = keep.len();
+
+        for (k, v) in &table.links {
+            if keep.contains(k) && !keep.contains(v) {
+                insert_keep_entry(&mut keep, v);
+            }
+            if keep.contains(v) && !keep.contains(k) {
+                insert_keep_entry(&mut keep, k);
+            }
+        }
+
+        if keep.len() == len {
+            break;
+        }
+
+        n += 1;
+        if n == 50 {
+            println!("cargo:warning=Recursion limit reached while building filter list");
+            break;
+        }
+    }
+
+    // Actually do the filtering.
+    table.links.retain(|k, v| keep.contains(k) || keep.contains(v));
+
+    table
+        .zonesets
+        .retain(|k, _| filter_regex.is_match(&k) || keep.iter().any(|s| k.starts_with(s)));
+}
+
 fn main() {
+    println!("cargo:rerun-if-env-changed={}", FILTER_ENV_VAR_NAME);
+
     let parser = LineParser::new();
     let mut table = TableBuilder::new();
 
@@ -310,10 +407,16 @@ fn main() {
             Line::Space => {}
         }
     }
-    let table = table.build();
+
+    let mut table = table.build();
+    if let Some(filter_regex) = get_filter_regex() {
+        filter_timezone_table(&mut table, filter_regex);
+    }
+
     let timezone_path = Path::new(&env::var("OUT_DIR").unwrap()).join("timezones.rs");
     let mut timezone_file = File::create(&timezone_path).unwrap();
     write_timezone_file(&mut timezone_file, &table).unwrap();
+
     let directory_path = Path::new(&env::var("OUT_DIR").unwrap()).join("directory.rs");
     let mut directory_file = File::create(&directory_path).unwrap();
     write_directory_file(&mut directory_file, &table).unwrap();