diff --git a/components/calendar/data/config.json b/components/calendar/data/config.json index 1b57ebf877a..031f5c59dd8 100644 --- a/components/calendar/data/config.json +++ b/components/calendar/data/config.json @@ -1,21 +1,21 @@ { - "keys": { - "Explicit": [ - "calendar/japanese@1", - "calendar/japanext@1", - "datetime/week_data@1" - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + "explicit": [ + "calendar/japanese@1", + "calendar/japanext@1", + "datetime/week_data@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/components/casemap/data/config.json b/components/casemap/data/config.json index 6adc1006ee3..69352207dfa 100644 --- a/components/casemap/data/config.json +++ b/components/casemap/data/config.json @@ -1,20 +1,20 @@ { - "keys": { - "Explicit": [ - "props/casemap@1", - "props/casemap_unfold@1" - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + "explicit": [ + "props/casemap@1", + "props/casemap_unfold@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/components/collator/data/config.json b/components/collator/data/config.json index c5443df8daf..d6d7ed8762c 100644 --- a/components/collator/data/config.json +++ 
b/components/collator/data/config.json @@ -1,25 +1,24 @@ { - "keys": { - "Explicit": [ - "collator/data@1", - "collator/dia@1", - "collator/jamo@1", - "collator/meta@1", - "collator/prim@1", - "collator/reord@1" - - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + "explicit": [ + "collator/data@1", + "collator/dia@1", + "collator/jamo@1", + "collator/meta@1", + "collator/prim@1", + "collator/reord@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/components/datetime/data/config.json b/components/datetime/data/config.json index e435107a7c6..0af81dbb6cd 100644 --- a/components/datetime/data/config.json +++ b/components/datetime/data/config.json @@ -1,54 +1,53 @@ { - "keys": { - "Explicit": [ - "datetime/buddhist/datelengths@1", - "datetime/buddhist/datesymbols@1", - "datetime/chinese/datelengths@1", - "datetime/chinese/datesymbols@1", - "datetime/coptic/datelengths@1", - "datetime/coptic/datesymbols@1", - "datetime/dangi/datelengths@1", - "datetime/dangi/datesymbols@1", - "datetime/ethiopic/datelengths@1", - "datetime/ethiopic/datesymbols@1", - "datetime/gregory/datelengths@1", - "datetime/gregory/datesymbols@1", - "datetime/hebrew/datelengths@1", - "datetime/hebrew/datesymbols@1", - "datetime/indian/datelengths@1", - "datetime/indian/datesymbols@1", - "datetime/islamic/datelengths@1", - "datetime/islamic/datesymbols@1", - "datetime/japanese/datelengths@1", - "datetime/japanese/datesymbols@1", - "datetime/japanext/datelengths@1", - "datetime/japanext/datesymbols@1", - "datetime/persian/datelengths@1", - "datetime/persian/datesymbols@1", - 
"datetime/roc/datelengths@1", - "datetime/roc/datesymbols@1", - "datetime/skeletons@1", - "datetime/timelengths@1", - "datetime/timesymbols@1", - "time_zone/exemplar_cities@1", - "time_zone/formats@1", - "time_zone/generic_long@1", - "time_zone/generic_short@1", - "time_zone/specific_long@1", - "time_zone/specific_short@1" - - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + "explicit": [ + "datetime/buddhist/datelengths@1", + "datetime/buddhist/datesymbols@1", + "datetime/chinese/datelengths@1", + "datetime/chinese/datesymbols@1", + "datetime/coptic/datelengths@1", + "datetime/coptic/datesymbols@1", + "datetime/dangi/datelengths@1", + "datetime/dangi/datesymbols@1", + "datetime/ethiopic/datelengths@1", + "datetime/ethiopic/datesymbols@1", + "datetime/gregory/datelengths@1", + "datetime/gregory/datesymbols@1", + "datetime/hebrew/datelengths@1", + "datetime/hebrew/datesymbols@1", + "datetime/indian/datelengths@1", + "datetime/indian/datesymbols@1", + "datetime/islamic/datelengths@1", + "datetime/islamic/datesymbols@1", + "datetime/japanese/datelengths@1", + "datetime/japanese/datesymbols@1", + "datetime/japanext/datelengths@1", + "datetime/japanext/datesymbols@1", + "datetime/persian/datelengths@1", + "datetime/persian/datesymbols@1", + "datetime/roc/datelengths@1", + "datetime/roc/datesymbols@1", + "datetime/skeletons@1", + "datetime/timelengths@1", + "datetime/timesymbols@1", + "time_zone/exemplar_cities@1", + "time_zone/formats@1", + "time_zone/generic_long@1", + "time_zone/generic_short@1", + "time_zone/specific_long@1", + "time_zone/specific_short@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": 
true +} \ No newline at end of file diff --git a/components/datetime/tests/data/blob.json b/components/datetime/tests/data/blob.json index a24fb853ef2..196e09c06dd 100644 --- a/components/datetime/tests/data/blob.json +++ b/components/datetime/tests/data/blob.json @@ -1,26 +1,28 @@ { - "keys": { - "Explicit": [ - "datetime/gregory/datelengths@1", - "datetime/gregory/datesymbols@1", - "datetime/timelengths@1", - "datetime/timesymbols@1", - "decimal/symbols@1", - "time_zone/formats@1", - "time_zone/specific_short@1" - ] - }, - "locales": { - "Explicit": [ - "en" - ] - }, - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Blob": "blob.postcard" - }, - "fallback": "Hybrid", - "overwrite": true + "keys": { + "explicit": [ + "datetime/gregory/datelengths@1", + "datetime/gregory/datesymbols@1", + "datetime/timelengths@1", + "datetime/timesymbols@1", + "decimal/symbols@1", + "time_zone/formats@1", + "time_zone/specific_short@1" + ] + }, + "fallback": "hybrid", + "locales": { + "explicit": [ + "en" + ] + }, + "cldr": "latest", + "icuExport": "none", + "segmenterLstm": "none", + "export": { + "blob": { + "path": "blob.postcard" + } + }, + "overwrite": true } \ No newline at end of file diff --git a/components/datetime/tests/data/blob.postcard b/components/datetime/tests/data/blob.postcard index a1e02a68eb2..4ab10b1ebbf 100644 Binary files a/components/datetime/tests/data/blob.postcard and b/components/datetime/tests/data/blob.postcard differ diff --git a/components/decimal/data/config.json b/components/decimal/data/config.json index dfb8e26cc03..a1d88598c78 100644 --- a/components/decimal/data/config.json +++ b/components/decimal/data/config.json @@ -1,19 +1,19 @@ { - "keys": { - "Explicit": [ - "decimal/symbols@1" - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - 
"overwrite": true -} + "keys": { + "explicit": [ + "decimal/symbols@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/components/list/data/config.json b/components/list/data/config.json index aacf4ad76a7..749e9227430 100644 --- a/components/list/data/config.json +++ b/components/list/data/config.json @@ -1,22 +1,21 @@ { - "keys": { - "Explicit": [ - "list/and@1", - "list/or@1", - "list/unit@1" - - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + "explicit": [ + "list/and@1", + "list/or@1", + "list/unit@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/components/locid_transform/data/config.json b/components/locid_transform/data/config.json index cb7e223ebbf..78e3d8b841f 100644 --- a/components/locid_transform/data/config.json +++ b/components/locid_transform/data/config.json @@ -1,26 +1,26 @@ { - "keys": { - "Explicit": [ - "fallback/likelysubtags@1", - "fallback/parents@1", - "fallback/supplement/co@1", - "locid_transform/aliases@1", - "locid_transform/likelysubtags_ext@1", - "locid_transform/likelysubtags_l@1", - "locid_transform/likelysubtags_sr@1", - "locid_transform/script_dir@1" - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + 
"explicit": [ + "fallback/likelysubtags@1", + "fallback/parents@1", + "fallback/supplement/co@1", + "locid_transform/aliases@1", + "locid_transform/likelysubtags_ext@1", + "locid_transform/likelysubtags_l@1", + "locid_transform/likelysubtags_sr@1", + "locid_transform/script_dir@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/components/normalizer/data/config.json b/components/normalizer/data/config.json index 1821929c297..4c59b569fd2 100644 --- a/components/normalizer/data/config.json +++ b/components/normalizer/data/config.json @@ -1,25 +1,25 @@ { - "keys": { - "Explicit": [ - "normalizer/comp@1", - "normalizer/decomp@1", - "normalizer/nfd@1", - "normalizer/nfdex@1", - "normalizer/nfkd@1", - "normalizer/nfkdex@1", - "normalizer/uts46d@1" - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + "explicit": [ + "normalizer/comp@1", + "normalizer/decomp@1", + "normalizer/nfd@1", + "normalizer/nfdex@1", + "normalizer/nfkd@1", + "normalizer/nfkdex@1", + "normalizer/uts46d@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/components/plurals/data/config.json b/components/plurals/data/config.json index 30fab3cd116..086f061dbb9 100644 --- a/components/plurals/data/config.json +++ b/components/plurals/data/config.json @@ -1,20 +1,20 @@ { - "keys": { - "Explicit": [ - "plurals/ordinal@1", - "plurals/cardinal@1" - ] - }, - "locales": "Recommended", 
- "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + "explicit": [ + "plurals/cardinal@1", + "plurals/ordinal@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/components/properties/data/config.json b/components/properties/data/config.json index 81c68003bf4..ddf3a3ee47d 100644 --- a/components/properties/data/config.json +++ b/components/properties/data/config.json @@ -1,128 +1,128 @@ { - "keys": { - "Explicit": [ - "propnames/from/bc@1", - "propnames/from/ccc@1", - "propnames/from/ea@1", - "propnames/from/gc@1", - "propnames/from/GCB@1", - "propnames/from/gcm@1", - "propnames/from/lb@1", - "propnames/from/SB@1", - "propnames/from/sc@1", - "propnames/from/WB@1", - "propnames/to/long/linear/bc@1", - "propnames/to/long/linear/ea@1", - "propnames/to/long/linear/gc@1", - "propnames/to/long/linear/GCB@1", - "propnames/to/long/linear/lb@1", - "propnames/to/long/linear/SB@1", - "propnames/to/long/linear/sc@1", - "propnames/to/long/linear/WB@1", - "propnames/to/long/sparse/ccc@1", - "propnames/to/short/linear/bc@1", - "propnames/to/short/linear/ea@1", - "propnames/to/short/linear/gc@1", - "propnames/to/short/linear/GCB@1", - "propnames/to/short/linear/lb@1", - "propnames/to/short/linear/SB@1", - "propnames/to/short/linear/WB@1", - "propnames/to/short/linear4/sc@1", - "propnames/to/short/sparse/ccc@1", - "props/AHex@1", - "props/alnum@1", - "props/Alpha@1", - "props/Basic_Emoji@1", - "props/bc@1", - "props/Bidi_C@1", - "props/Bidi_M@1", - "props/bidiauxiliaryprops@1", - "props/blank@1", - "props/Cased@1", - "props/ccc@1", - "props/CI@1", - "props/Comp_Ex@1", - "props/CWCF@1", - "props/CWCM@1", - 
"props/CWKCF@1", - "props/CWL@1", - "props/CWT@1", - "props/CWU@1", - "props/Dash@1", - "props/Dep@1", - "props/DI@1", - "props/Dia@1", - "props/ea@1", - "props/EBase@1", - "props/EComp@1", - "props/EMod@1", - "props/Emoji@1", - "props/EPres@1", - "props/exemplarchars/auxiliary@1", - "props/exemplarchars/index@1", - "props/exemplarchars/main@1", - "props/exemplarchars/numbers@1", - "props/exemplarchars/punctuation@1", - "props/Ext@1", - "props/ExtPict@1", - "props/gc@1", - "props/GCB@1", - "props/Gr_Base@1", - "props/Gr_Ext@1", - "props/Gr_Link@1", - "props/graph@1", - "props/Hex@1", - "props/Hyphen@1", - "props/IDC@1", - "props/Ideo@1", - "props/IDS@1", - "props/IDSB@1", - "props/IDST@1", - "props/Join_C@1", - "props/lb@1", - "props/LOE@1", - "props/Lower@1", - "props/Math@1", - "props/NChar@1", - "props/nfcinert@1", - "props/nfdinert@1", - "props/nfkcinert@1", - "props/nfkdinert@1", - "props/Pat_Syn@1", - "props/Pat_WS@1", - "props/PCM@1", - "props/print@1", - "props/QMark@1", - "props/Radical@1", - "props/RI@1", - "props/SB@1", - "props/sc@1", - "props/scx@1", - "props/SD@1", - "props/segstart@1", - "props/Sensitive@1", - "props/STerm@1", - "props/Term@1", - "props/UIdeo@1", - "props/Upper@1", - "props/VS@1", - "props/WB@1", - "props/WSpace@1", - "props/xdigit@1", - "props/XIDC@1", - "props/XIDS@1" - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + "explicit": [ + "propnames/from/GCB@1", + "propnames/from/SB@1", + "propnames/from/WB@1", + "propnames/from/bc@1", + "propnames/from/ccc@1", + "propnames/from/ea@1", + "propnames/from/gc@1", + "propnames/from/gcm@1", + "propnames/from/lb@1", + "propnames/from/sc@1", + "propnames/to/long/linear/GCB@1", + "propnames/to/long/linear/SB@1", + "propnames/to/long/linear/WB@1", + "propnames/to/long/linear/bc@1", + 
"propnames/to/long/linear/ea@1", + "propnames/to/long/linear/gc@1", + "propnames/to/long/linear/lb@1", + "propnames/to/long/linear/sc@1", + "propnames/to/long/sparse/ccc@1", + "propnames/to/short/linear/GCB@1", + "propnames/to/short/linear/SB@1", + "propnames/to/short/linear/WB@1", + "propnames/to/short/linear/bc@1", + "propnames/to/short/linear/ea@1", + "propnames/to/short/linear/gc@1", + "propnames/to/short/linear/lb@1", + "propnames/to/short/linear4/sc@1", + "propnames/to/short/sparse/ccc@1", + "props/AHex@1", + "props/Alpha@1", + "props/Basic_Emoji@1", + "props/Bidi_C@1", + "props/Bidi_M@1", + "props/CI@1", + "props/CWCF@1", + "props/CWCM@1", + "props/CWKCF@1", + "props/CWL@1", + "props/CWT@1", + "props/CWU@1", + "props/Cased@1", + "props/Comp_Ex@1", + "props/DI@1", + "props/Dash@1", + "props/Dep@1", + "props/Dia@1", + "props/EBase@1", + "props/EComp@1", + "props/EMod@1", + "props/EPres@1", + "props/Emoji@1", + "props/Ext@1", + "props/ExtPict@1", + "props/GCB@1", + "props/Gr_Base@1", + "props/Gr_Ext@1", + "props/Gr_Link@1", + "props/Hex@1", + "props/Hyphen@1", + "props/IDC@1", + "props/IDS@1", + "props/IDSB@1", + "props/IDST@1", + "props/Ideo@1", + "props/Join_C@1", + "props/LOE@1", + "props/Lower@1", + "props/Math@1", + "props/NChar@1", + "props/PCM@1", + "props/Pat_Syn@1", + "props/Pat_WS@1", + "props/QMark@1", + "props/RI@1", + "props/Radical@1", + "props/SB@1", + "props/SD@1", + "props/STerm@1", + "props/Sensitive@1", + "props/Term@1", + "props/UIdeo@1", + "props/Upper@1", + "props/VS@1", + "props/WB@1", + "props/WSpace@1", + "props/XIDC@1", + "props/XIDS@1", + "props/alnum@1", + "props/bc@1", + "props/bidiauxiliaryprops@1", + "props/blank@1", + "props/ccc@1", + "props/ea@1", + "props/exemplarchars/auxiliary@1", + "props/exemplarchars/index@1", + "props/exemplarchars/main@1", + "props/exemplarchars/numbers@1", + "props/exemplarchars/punctuation@1", + "props/gc@1", + "props/graph@1", + "props/lb@1", + "props/nfcinert@1", + "props/nfdinert@1", + 
"props/nfkcinert@1", + "props/nfkdinert@1", + "props/print@1", + "props/sc@1", + "props/scx@1", + "props/segstart@1", + "props/xdigit@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/components/segmenter/data/config.json b/components/segmenter/data/config.json index d6bf5a71ef2..624cd593485 100644 --- a/components/segmenter/data/config.json +++ b/components/segmenter/data/config.json @@ -1,25 +1,25 @@ { - "keys": { - "Explicit": [ - "segmenter/dictionary/w_auto@1", - "segmenter/dictionary/wl_ext@1", - "segmenter/grapheme@1", - "segmenter/line@1", - "segmenter/lstm/wl_auto@1", - "segmenter/sentence@1", - "segmenter/word@1" - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Hybrid", - "overwrite": true -} + "keys": { + "explicit": [ + "segmenter/dictionary/w_auto@1", + "segmenter/dictionary/wl_ext@1", + "segmenter/grapheme@1", + "segmenter/line@1", + "segmenter/lstm/wl_auto@1", + "segmenter/sentence@1", + "segmenter/word@1" + ] + }, + "fallback": "hybrid", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/components/timezone/data/config.json b/components/timezone/data/config.json index e574f0d31ff..937b0cf1069 100644 --- a/components/timezone/data/config.json +++ b/components/timezone/data/config.json @@ -1,19 +1,19 @@ { - "keys": { - "Explicit": [ - "time_zone/metazone_period@1" - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { 
- "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + "explicit": [ + "time_zone/metazone_period@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/experimental/compactdecimal/data/config.json b/experimental/compactdecimal/data/config.json index f2f88cb58c3..b0c5ebce36b 100644 --- a/experimental/compactdecimal/data/config.json +++ b/experimental/compactdecimal/data/config.json @@ -1,21 +1,20 @@ { - "keys": { - "Explicit": [ - "compactdecimal/long@1", - "compactdecimal/short@1" - - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + "explicit": [ + "compactdecimal/long@1", + "compactdecimal/short@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/experimental/displaynames/data/config.json b/experimental/displaynames/data/config.json index 0987bab1e4d..897a70af555 100644 --- a/experimental/displaynames/data/config.json +++ b/experimental/displaynames/data/config.json @@ -1,23 +1,23 @@ { - "keys": { - "Explicit": [ - "displaynames/languages@1", - "displaynames/locales@1", - "displaynames/regions@1", - "displaynames/scripts@1", - "displaynames/variants@1" - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true + "keys": { 
+ "explicit": [ + "displaynames/languages@1", + "displaynames/locales@1", + "displaynames/regions@1", + "displaynames/scripts@1", + "displaynames/variants@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true } \ No newline at end of file diff --git a/experimental/relativetime/data/config.json b/experimental/relativetime/data/config.json index f854c55d8bb..31e1d25782d 100644 --- a/experimental/relativetime/data/config.json +++ b/experimental/relativetime/data/config.json @@ -1,42 +1,42 @@ { - "keys": { - "Explicit": [ - "relativetime/long/day@1", - "relativetime/long/hour@1", - "relativetime/long/minute@1", - "relativetime/long/month@1", - "relativetime/long/quarter@1", - "relativetime/long/second@1", - "relativetime/long/week@1", - "relativetime/long/year@1", - "relativetime/narrow/day@1", - "relativetime/narrow/hour@1", - "relativetime/narrow/minute@1", - "relativetime/narrow/month@1", - "relativetime/narrow/quarter@1", - "relativetime/narrow/second@1", - "relativetime/narrow/week@1", - "relativetime/narrow/year@1", - "relativetime/short/day@1", - "relativetime/short/hour@1", - "relativetime/short/minute@1", - "relativetime/short/month@1", - "relativetime/short/quarter@1", - "relativetime/short/second@1", - "relativetime/short/week@1", - "relativetime/short/year@1" - ] - }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", - "export": { - "Baked": { - "path": "data", - "pretty": true - } - }, - "fallback": "Runtime", - "overwrite": true -} + "keys": { + "explicit": [ + "relativetime/long/day@1", + "relativetime/long/hour@1", + "relativetime/long/minute@1", + "relativetime/long/month@1", + "relativetime/long/quarter@1", + "relativetime/long/second@1", + "relativetime/long/week@1", + "relativetime/long/year@1", + 
"relativetime/narrow/day@1", + "relativetime/narrow/hour@1", + "relativetime/narrow/minute@1", + "relativetime/narrow/month@1", + "relativetime/narrow/quarter@1", + "relativetime/narrow/second@1", + "relativetime/narrow/week@1", + "relativetime/narrow/year@1", + "relativetime/short/day@1", + "relativetime/short/hour@1", + "relativetime/short/minute@1", + "relativetime/short/month@1", + "relativetime/short/quarter@1", + "relativetime/short/second@1", + "relativetime/short/week@1", + "relativetime/short/year@1" + ] + }, + "fallback": "runtime", + "locales": "recommended", + "cldr": "latest", + "icuExport": "latest", + "segmenterLstm": "latest", + "export": { + "baked": { + "path": "data", + "pretty": true + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/experimental/single_number_formatter/data/config.json b/experimental/single_number_formatter/data/config.json index 5c3033a0e71..38d3320ea4d 100644 --- a/experimental/single_number_formatter/data/config.json +++ b/experimental/single_number_formatter/data/config.json @@ -1,19 +1,16 @@ { "keys": { - "Explicit": [ + "explicit": [ "currency/essentials@1" ] }, - "locales": "Recommended", - "cldr": "Latest", - "icu_export": "Latest", - "segmenter_lstm": "Latest", + "fallback": "runtime", + "locales": "recommended", "export": { - "Baked": { + "baked": { "path": "data", "pretty": true } }, - "fallback": "Runtime", "overwrite": true } diff --git a/provider/adapters/tests/data/config.json b/provider/adapters/tests/data/config.json index da1bbfad88e..db3a07ac122 100644 --- a/provider/adapters/tests/data/config.json +++ b/provider/adapters/tests/data/config.json @@ -1,21 +1,21 @@ { - "keys": { - "Explicit": [ - "fallback/likelysubtags@1", - "fallback/parents@1", - "fallback/supplement/co@1", - "core/helloworld@1" - ] - }, - "locales": "All", - "cldr": "Latest", - "icu_export": "None", - "segmenter_lstm": "None", - "export": { - "Blob": { - "path": "blob.postcard" - } - }, - "fallback": "RuntimeManual", - 
"overwrite": true + "keys": { + "explicit": [ + "core/helloworld@1", + "fallback/likelysubtags@1", + "fallback/parents@1", + "fallback/supplement/co@1" + ] + }, + "fallback": "runtimeManual", + "locales": "all", + "cldr": "latest", + "icuExport": "none", + "segmenterLstm": "none", + "export": { + "blob": { + "path": "blob.postcard" + } + }, + "overwrite": true } \ No newline at end of file diff --git a/provider/adapters/tests/data/langtest/de.json b/provider/adapters/tests/data/langtest/de.json index 7fd1fbac05f..8c69fa42518 100644 --- a/provider/adapters/tests/data/langtest/de.json +++ b/provider/adapters/tests/data/langtest/de.json @@ -1,23 +1,23 @@ { - "keys": { - "Explicit": [ - "core/helloworld@1" - ] - }, - "locales": { - "Explicit": [ - "de" - ] - }, - "cldr": "None", - "icu_export": "None", - "segmenter_lstm": "None", - "export": { - "Fs": { - "path": "de", - "syntax": "Json" - } - }, - "fallback": "Preresolved", - "overwrite": true - } \ No newline at end of file + "keys": { + "explicit": [ + "core/helloworld@1" + ] + }, + "fallback": "preresolved", + "locales": { + "explicit": [ + "de" + ] + }, + "cldr": "none", + "icuExport": "none", + "segmenterLstm": "none", + "export": { + "fileSystem": { + "path": "de", + "syntax": "json" + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/provider/adapters/tests/data/langtest/ro.json b/provider/adapters/tests/data/langtest/ro.json index bed32e9e24e..afae9e85ad8 100644 --- a/provider/adapters/tests/data/langtest/ro.json +++ b/provider/adapters/tests/data/langtest/ro.json @@ -1,23 +1,23 @@ { - "keys": { - "Explicit": [ - "core/helloworld@1" - ] - }, - "locales": { - "Explicit": [ - "ro" - ] - }, - "cldr": "None", - "icu_export": "None", - "segmenter_lstm": "None", - "export": { - "Fs": { - "path": "ro", - "syntax": "Json" - } - }, - "fallback": "Preresolved", - "overwrite": true - } \ No newline at end of file + "keys": { + "explicit": [ + "core/helloworld@1" + ] + }, + "fallback": "preresolved", 
+ "locales": { + "explicit": [ + "ro" + ] + }, + "cldr": "none", + "icuExport": "none", + "segmenterLstm": "none", + "export": { + "fileSystem": { + "path": "ro", + "syntax": "json" + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/provider/blob/src/export/mod.rs b/provider/blob/src/export/mod.rs index 7686af33d9c..415a31cab2c 100644 --- a/provider/blob/src/export/mod.rs +++ b/provider/blob/src/export/mod.rs @@ -18,14 +18,10 @@ //! let mut exporter = BlobExporter::new_with_sink(Box::new(&mut blob)); //! //! // Export something -//! DatagenProvider::default() -//! .export({ -//! let mut options = options::Options::default(); -//! options.keys = [icu_provider::hello_world::HelloWorldV1Marker::KEY].into_iter().collect(); -//! options -//! }, -//! exporter -//! ).unwrap(); +//! DatagenDriver::new() +//! .with_keys([icu_provider::hello_world::HelloWorldV1Marker::KEY]) +//! .export(&DatagenProvider::latest_tested(), exporter) +//! .unwrap(); //! //! // communicate the blob to the client application (network, disk, etc.) //! ``` diff --git a/provider/blob/tests/data/config.json b/provider/blob/tests/data/config.json index 00bedf6627a..5a1fab6c277 100644 --- a/provider/blob/tests/data/config.json +++ b/provider/blob/tests/data/config.json @@ -1,15 +1,16 @@ { "keys": { - "Explicit": [ + "explicit": [ "core/helloworld@1" ] }, - "locales": "All", - "cldr": "None", - "icu_export": "None", - "segmenter_lstm": "None", + "fallback": "hybrid", + "locales": "all", + "cldr": "none", + "icuExport": "none", + "segmenterLstm": "none", "export": { - "Blob": { + "blob": { "path": "hello_world.postcard" } }, diff --git a/provider/core/src/datagen/mod.rs b/provider/core/src/datagen/mod.rs index 98757ce2b89..75c6a080328 100644 --- a/provider/core/src/datagen/mod.rs +++ b/provider/core/src/datagen/mod.rs @@ -90,9 +90,18 @@ pub trait DataExporter: Sync { /// A [`DynamicDataProvider`] that can be used for exporting data. 
/// /// Use [`make_exportable_provider`](crate::make_exportable_provider) to implement this. -pub trait ExportableProvider: IterableDynamicDataProvider + Sync {} +pub trait ExportableProvider: IterableDynamicDataProvider + Sync { + /// Returns a struct implementing `DataProvider` by downcasting + fn as_downcasting(&self) -> DowncastingExportableDataProvider { + DowncastingExportableDataProvider(self) + } +} impl ExportableProvider for T where T: IterableDynamicDataProvider + Sync {} +#[derive(Debug)] +#[doc(hidden)] +pub struct DowncastingExportableDataProvider<'a, P: ?Sized>(&'a P); + /// This macro can be used on a data provider to allow it to be used for data generation. /// /// Data generation 'compiles' data by using this data provider (which usually translates data from diff --git a/provider/core/src/datagen/payload.rs b/provider/core/src/datagen/payload.rs index 97e540b0741..906a8edefc5 100644 --- a/provider/core/src/datagen/payload.rs +++ b/provider/core/src/datagen/payload.rs @@ -95,6 +95,28 @@ where } } +impl<'a, P: super::ExportableProvider, M: KeyedDataMarker> DataProvider + for super::DowncastingExportableDataProvider<'a, P> +where + DataPayload: Clone, +{ + fn load(&self, req: DataRequest) -> Result, DataError> { + let (metadata, payload) = self.0.load_data(M::KEY, req)?.take_metadata_and_payload()?; + Ok(DataResponse { + payload: Some( + payload + .get() + .payload + .as_any() + .downcast_ref::>() + .ok_or_else(|| DataError::for_type::())? + .clone(), + ), + metadata, + }) + } +} + impl DataPayload { /// Serializes this [`DataPayload`] into a serializer using Serde. 
/// diff --git a/provider/datagen/README.md b/provider/datagen/README.md index 6360418f58e..6c5f3bc6b4e 100644 --- a/provider/datagen/README.md +++ b/provider/datagen/README.md @@ -14,21 +14,13 @@ Also see our [datagen tutorial](https://github.com/unicode-org/icu4x/blob/main/d ```rust use icu_datagen::prelude::*; -use icu_provider_blob::export::*; +use icu_datagen::blob_exporter::*; use std::fs::File; -fn main() { - DatagenProvider::default() - .export( - { - let mut options = options::Options::default(); - options.keys = [icu::list::provider::AndListV1Marker::KEY].into_iter().collect(); - options - }, - BlobExporter::new_with_sink(Box::new(File::create("data.postcard").unwrap())), - ) - .unwrap(); -} +DatagenDriver::new() + .with_keys([icu::list::provider::AndListV1Marker::KEY]) + .export(&DatagenProvider::latest_tested(), BlobExporter::new_with_sink(Box::new(File::create("data.postcard").unwrap()))) + .unwrap(); ``` ### Command line diff --git a/provider/datagen/data/segmenter/rules/grapheme.toml b/provider/datagen/data/segmenter_rules/grapheme.toml similarity index 100% rename from provider/datagen/data/segmenter/rules/grapheme.toml rename to provider/datagen/data/segmenter_rules/grapheme.toml diff --git a/provider/datagen/data/segmenter/rules/line.toml b/provider/datagen/data/segmenter_rules/line.toml similarity index 100% rename from provider/datagen/data/segmenter/rules/line.toml rename to provider/datagen/data/segmenter_rules/line.toml diff --git a/provider/datagen/data/segmenter/rules/sentence.toml b/provider/datagen/data/segmenter_rules/sentence.toml similarity index 100% rename from provider/datagen/data/segmenter/rules/sentence.toml rename to provider/datagen/data/segmenter_rules/sentence.toml diff --git a/provider/datagen/data/segmenter/rules/word.toml b/provider/datagen/data/segmenter_rules/word.toml similarity index 100% rename from provider/datagen/data/segmenter/rules/word.toml rename to provider/datagen/data/segmenter_rules/word.toml diff --git 
a/provider/datagen/src/baked_exporter.rs b/provider/datagen/src/baked_exporter.rs index 8f31e3ca016..15e04b6eff4 100644 --- a/provider/datagen/src/baked_exporter.rs +++ b/provider/datagen/src/baked_exporter.rs @@ -19,14 +19,10 @@ //! let mut exporter = BakedExporter::new(demo_path.clone(), Default::default()).unwrap(); //! //! // Export something -//! DatagenProvider::default() -//! .export({ -//! let mut options = options::Options::default(); -//! options.keys = [icu_provider::hello_world::HelloWorldV1Marker::KEY].into_iter().collect(); -//! options -//! }, -//! exporter -//! ).unwrap(); +//! DatagenDriver::new() +//! .with_keys([icu_provider::hello_world::HelloWorldV1Marker::KEY]) +//! .export(&DatagenProvider::latest_tested(), exporter) +//! .unwrap(); //! # //! # let _ = std::fs::remove_dir_all(&demo_path); //! ``` diff --git a/provider/datagen/src/bin/datagen/args.rs b/provider/datagen/src/bin/datagen/args.rs index b0075a996a0..74a67539e46 100644 --- a/provider/datagen/src/bin/datagen/args.rs +++ b/provider/datagen/src/bin/datagen/args.rs @@ -272,7 +272,7 @@ impl Cli { } } match &mut config.export { - config::Export::Fs { path, .. } => { + config::Export::FileSystem { path, .. 
} => { if path.is_relative() { *path = parent.join(path.clone()); } @@ -418,7 +418,7 @@ impl Cli { }) } - fn make_segmenter_models(&self) -> eyre::Result { + fn make_segmenter_models(&self) -> eyre::Result { Ok(if self.segmenter_models.as_slice() == ["none"] { config::SegmenterModelInclude::None } else if self.segmenter_models.as_slice() == ["recommended"] { @@ -437,7 +437,7 @@ impl Cli { #[cfg(not(feature = "provider_fs"))] eyre::bail!("FsDataProvider export requires the provider_fs Cargo feature."); #[cfg(feature = "provider_fs")] - Ok(config::Export::Fs { + Ok(config::Export::FileSystem { path: if let Some(root) = self.output.as_ref() { root.clone() } else { diff --git a/provider/datagen/src/bin/datagen/config.rs b/provider/datagen/src/bin/datagen/config.rs index 2aad863c614..ab93bd58fb7 100644 --- a/provider/datagen/src/bin/datagen/config.rs +++ b/provider/datagen/src/bin/datagen/config.rs @@ -2,34 +2,40 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
-pub use icu_datagen::options::*; -pub use icu_datagen::{CollationHanDatabase, TrieType}; - +pub use icu_datagen::{CollationHanDatabase, CoverageLevel, FallbackMode, TrieType}; +pub use icu_locid::LanguageIdentifier; use icu_provider::prelude::*; -use std::collections::HashSet; +use std::collections::{BTreeSet, HashSet}; use std::path::{Path, PathBuf}; #[derive(Debug, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "camelCase")] pub struct Config { - #[serde(default, skip_serializing_if = "is_default")] pub keys: KeyInclude, - #[serde(default, skip_serializing_if = "is_default")] + pub fallback: FallbackMode, pub locales: LocaleInclude, + #[serde( + default, + skip_serializing_if = "is_default", + serialize_with = "sorted_set" + )] + pub collations: HashSet, + #[serde(default, skip_serializing_if = "is_default")] + pub segmenter_models: SegmenterModelInclude, + + #[serde(default)] pub cldr: PathOrTag, + #[serde(default)] pub icu_export: PathOrTag, + #[serde(default)] pub segmenter_lstm: PathOrTag, #[serde(default, skip_serializing_if = "is_default")] pub trie_type: TrieType, #[serde(default, skip_serializing_if = "is_default")] pub collation_han_database: CollationHanDatabase, - #[serde(default, skip_serializing_if = "is_default")] - pub collations: HashSet, - #[serde(default, skip_serializing_if = "is_default")] - pub segmenter_models: SegmenterModelInclude, + pub export: Export, #[serde(default, skip_serializing_if = "is_default")] - pub fallback: FallbackMode, - #[serde(default, skip_serializing_if = "is_default")] pub overwrite: bool, } @@ -39,6 +45,7 @@ fn is_default(value: &T) -> bool { #[non_exhaustive] #[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "camelCase")] pub enum KeyInclude { None, All, @@ -46,12 +53,6 @@ pub enum KeyInclude { ForBinary(PathBuf), } -impl Default for KeyInclude { - fn default() -> Self { - Self::All - } -} - mod data_key_as_str { use super::*; use serde::{de::*, ser::*}; @@ 
-61,7 +62,7 @@ mod data_key_as_str { selff .iter() .map(|k| k.path().get()) - .collect::>() + .collect::>() .serialize(ser) } @@ -74,17 +75,53 @@ mod data_key_as_str { } } -#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum LocaleInclude { + Recommended, + All, + None, + Explicit(#[serde(serialize_with = "sorted_set")] HashSet), + CldrSet(#[serde(serialize_with = "sorted_set")] HashSet), +} + +pub fn sorted_set( + selff: &HashSet, + ser: S, +) -> Result { + use serde::Serialize; + let mut sorted = selff.iter().collect::>(); + sorted.sort_by_key(|l| format!("{l:?}")); + sorted.serialize(ser) +} + +#[non_exhaustive] +#[derive(Debug, PartialEq, Clone, serde::Serialize, serde::Deserialize, Default)] +#[serde(rename_all = "camelCase")] +pub enum SegmenterModelInclude { + #[default] + /// Set this data driver to generate the recommended set of segmenter models. This will cover + /// all languages supported by ICU4X: Thai, Burmese, Khmer, Lao, Chinese, and Japanese. + /// Both dictionary and LSTM models will be included, to the extent required by the chosen data keys. 
+ Recommended, + None, + Explicit(Vec), +} + +#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq, Default)] +#[serde(rename_all = "camelCase")] pub enum PathOrTag { Path(PathBuf), Tag(String), + #[default] Latest, None, } #[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] pub enum Export { - Fs { + FileSystem { path: PathBuf, syntax: FsSyntax, #[serde(default, skip_serializing_if = "is_default")] @@ -97,15 +134,20 @@ pub enum Export { path: PathBuf, #[serde(default, skip_serializing_if = "is_default")] pretty: bool, - #[serde(default, skip_serializing_if = "is_default")] + #[serde( + default, + skip_serializing_if = "is_default", + rename = "useSeparateCrates" + )] use_separate_crates: bool, - #[doc(hidden)] + #[doc(hidden)] // we don't want this on the JSON API, but the CLI API goes through this struct #[serde(default, skip_serializing, skip_deserializing)] insert_feature_gates: bool, }, } #[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] pub enum FsSyntax { Postcard, Json, diff --git a/provider/datagen/src/bin/datagen/mod.rs b/provider/datagen/src/bin/datagen/mod.rs index 7060f35c4b1..8108e08fe5f 100644 --- a/provider/datagen/src/bin/datagen/mod.rs +++ b/provider/datagen/src/bin/datagen/mod.rs @@ -35,68 +35,92 @@ fn main() -> eyre::Result<()> { let config = matches.as_config()?; - let mut options = options::Options::default(); - options.keys = match config.keys { - config::KeyInclude::None => Default::default(), - config::KeyInclude::All => icu_datagen::all_keys().into_iter().collect(), - config::KeyInclude::Explicit(set) => set, - config::KeyInclude::ForBinary(path) => { - icu_datagen::keys_from_bin(path)?.into_iter().collect() - } - }; - options.locales = config.locales; - options.collations = config.collations; - options.segmenter_models = config.segmenter_models; - options.fallback = config.fallback; - - let mut source_data = 
SourceData::offline(); - source_data = source_data.with_collation_han_database(config.collation_han_database); + let mut provider = DatagenProvider::default(); + provider = provider.with_collation_han_database(config.collation_han_database); if config.trie_type == crate::config::TrieType::Fast { - source_data = source_data.with_fast_tries(); + provider = provider.with_fast_tries(); } - source_data = match config.cldr { - config::PathOrTag::Path(path) => source_data.with_cldr(path, Default::default())?, + provider = match config.cldr { + config::PathOrTag::Path(path) => provider.with_cldr(path)?, #[cfg(feature = "networking")] config::PathOrTag::Latest => { - source_data.with_cldr_for_tag(SourceData::LATEST_TESTED_CLDR_TAG, Default::default())? + provider.with_cldr_for_tag(DatagenProvider::LATEST_TESTED_CLDR_TAG) } #[cfg(feature = "networking")] - config::PathOrTag::Tag(tag) => source_data.with_cldr_for_tag(&tag, Default::default())?, - config::PathOrTag::None => source_data, + config::PathOrTag::Tag(tag) => provider.with_cldr_for_tag(&tag), + config::PathOrTag::None => provider, #[cfg(not(feature = "networking"))] _ => eyre::bail!("Download data from tags requires the `networking` Cargo feature"), }; - source_data = match config.icu_export { - config::PathOrTag::Path(path) => source_data.with_icuexport(path)?, + provider = match config.icu_export { + config::PathOrTag::Path(path) => provider.with_icuexport(path)?, #[cfg(feature = "networking")] config::PathOrTag::Latest => { - source_data.with_icuexport_for_tag(SourceData::LATEST_TESTED_ICUEXPORT_TAG)? 
+ provider.with_icuexport_for_tag(DatagenProvider::LATEST_TESTED_ICUEXPORT_TAG) } #[cfg(feature = "networking")] - config::PathOrTag::Tag(tag) => source_data.with_icuexport_for_tag(&tag)?, - config::PathOrTag::None => source_data, + config::PathOrTag::Tag(tag) => provider.with_icuexport_for_tag(&tag), + config::PathOrTag::None => provider, #[cfg(not(feature = "networking"))] _ => eyre::bail!("Download data from tags requires the `networking` Cargo feature"), }; - source_data = match config.segmenter_lstm { - config::PathOrTag::Path(path) => source_data.with_icuexport(path)?, + provider = match config.segmenter_lstm { + config::PathOrTag::Path(path) => provider.with_icuexport(path)?, #[cfg(feature = "networking")] config::PathOrTag::Latest => { - source_data.with_segmenter_lstm_for_tag(SourceData::LATEST_TESTED_SEGMENTER_LSTM_TAG)? + provider.with_segmenter_lstm_for_tag(DatagenProvider::LATEST_TESTED_SEGMENTER_LSTM_TAG) } #[cfg(feature = "networking")] - config::PathOrTag::Tag(tag) => source_data.with_segmenter_lstm_for_tag(&tag)?, - config::PathOrTag::None => source_data, + config::PathOrTag::Tag(tag) => provider.with_segmenter_lstm_for_tag(&tag), + config::PathOrTag::None => provider, #[cfg(not(feature = "networking"))] _ => eyre::bail!("Download data from tags requires the `networking` Cargo feature"), }; - let provider = DatagenProvider::new(source_data); + let mut driver = DatagenDriver::new(); + driver = match config.keys { + config::KeyInclude::None => driver.with_keys([]), + config::KeyInclude::All => driver.with_keys(icu_datagen::all_keys()), + config::KeyInclude::Explicit(set) => driver.with_keys(set), + config::KeyInclude::ForBinary(path) => driver.with_keys(icu_datagen::keys_from_bin(path)?), + }; + driver = driver.with_fallback_mode(config.fallback); + driver = driver.with_collations(config.collations); + driver = match config.locales { + config::LocaleInclude::All => driver.with_all_locales(), + config::LocaleInclude::None => driver.with_locales([]), + 
config::LocaleInclude::Explicit(set) => driver.with_locales(set), + config::LocaleInclude::CldrSet(levels) => { + driver.with_locales(provider.locales_for_coverage_levels(levels.iter().copied())?) + } + config::LocaleInclude::Recommended => { + driver.with_locales(provider.locales_for_coverage_levels([ + CoverageLevel::Modern, + CoverageLevel::Moderate, + CoverageLevel::Basic, + ])?) + } + }; + driver = match config.segmenter_models { + config::SegmenterModelInclude::None => driver.with_segmenter_models([]), + config::SegmenterModelInclude::Recommended => driver.with_segmenter_models([ + "Burmese_codepoints_exclusive_model4_heavy".into(), + "burmesedict".into(), + "cjdict".into(), + "Khmer_codepoints_exclusive_model4_heavy".into(), + "khmerdict".into(), + "Lao_codepoints_exclusive_model4_heavy".into(), + "laodict".into(), + "Thai_codepoints_exclusive_model4_heavy".into(), + "thaidict".into(), + ]), + config::SegmenterModelInclude::Explicit(models) => driver.with_segmenter_models(models), + }; match config.export { - config::Export::Fs { + config::Export::FileSystem { path, syntax, fingerprint, @@ -126,7 +150,7 @@ fn main() -> eyre::Result<()> { options }, )?; - Ok(provider.export(options, exporter)?) + Ok(driver.export(&provider, exporter)?) } } config::Export::Blob { ref path } => { @@ -146,7 +170,7 @@ fn main() -> eyre::Result<()> { ) }, ); - Ok(provider.export(options, exporter)?) + Ok(driver.export(&provider, exporter)?) } } config::Export::Baked { @@ -172,7 +196,7 @@ fn main() -> eyre::Result<()> { options })?; - Ok(provider.export(options, exporter)?) + Ok(driver.export(&provider, exporter)?) } } } diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs new file mode 100644 index 00000000000..fa20af4c05a --- /dev/null +++ b/provider/datagen/src/driver.rs @@ -0,0 +1,553 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::rayon_prelude::*; +use crate::FallbackMode; +use icu_locid::extensions::unicode::key; +use icu_locid::LanguageIdentifier; +use icu_locid_transform::fallback::LocaleFallbackIterator; +use icu_locid_transform::fallback::LocaleFallbacker; +use icu_provider::datagen::*; +use icu_provider::prelude::*; +use once_cell::sync::Lazy; +use std::borrow::Cow; +use std::collections::HashMap; +use std::collections::HashSet; +use writeable::Writeable; + +/// Configuration for a data export operation. +/// +/// # Examples +/// +/// ```no_run +/// use icu_datagen::prelude::*; +/// use icu_datagen::blob_exporter::*; +/// +/// DatagenDriver::new() +/// .with_keys([icu::list::provider::AndListV1Marker::KEY]) +/// .export(&DatagenProvider::latest_tested(), BlobExporter::new_with_sink(Box::new(&mut Vec::new()))) +/// .unwrap(); +/// ``` +#[derive(Debug, Clone, Default)] +pub struct DatagenDriver { + keys: HashSet, + // `None` means all + locales: Option>, + fallback: FallbackMode, + collations: HashSet, + segmenter_models: Vec, +} + +impl DatagenDriver { + /// Creates an empty [`DatagenDriver`]. + pub fn new() -> Self { + Self::default() + } + + /// Sets this driver to generate the given keys. See [`icu_datagen::keys`], + /// [`icu_datagen::all_keys`], [`icu_datagen::key`] and [`icu_datagen::keys_from_bin`]. + /// + /// [`icu_datagen::keys`]: crate::keys + /// [`icu_datagen::all_keys`]: crate::all_keys + /// [`icu_datagen::key`]: crate::key + /// [`icu_datagen::keys_from_bin`]: crate::keys_from_bin + pub fn with_keys(self, keys: impl IntoIterator) -> Self { + Self { + keys: keys.into_iter().collect(), + ..self + } + } + + /// Sets the fallback type that the data should be generated for. If locale fallback is + /// used at runtime, smaller data can be generated. 
+ pub fn with_fallback_mode(self, fallback: FallbackMode) -> Self { + Self { fallback, ..self } + } + + /// Sets the locales to generate. + pub fn with_locales(self, locales: impl IntoIterator) -> Self { + Self { + locales: Some(locales.into_iter().collect()), + ..self + } + } + + /// Sets this driver to generate all available locales. + pub fn with_all_locales(self) -> Self { + Self { + locales: None, + ..self + } + } + + /// By default, the collations `big5han`, `gb2312`, and those starting with `search` + /// are excluded. This method can be used to re-enable them. + /// + /// The special string `"search*"` causes all search collation tables to be included. + pub fn with_collations(self, collations: impl IntoIterator) -> Self { + Self { + collations: collations.into_iter().collect(), + ..self + } + } + + /// Sets this driver to generate the given segmentation models, to the extent required by the + /// chosen data keys. + /// + /// The currently supported dictionary models are + /// * `cjdict` + /// * `burmesedict` + /// * `khmerdict` + /// * `laodict` + /// * `thaidict` + /// + /// The currently supported LSTM models are + /// * `Burmese_codepoints_exclusive_model4_heavy` + /// * `Khmer_codepoints_exclusive_model4_heavy` + /// * `Lao_codepoints_exclusive_model4_heavy` + /// * `Thai_codepoints_exclusive_model4_heavy` + /// + /// If a model is not included, the resulting line or word segmenter will apply rule-based + /// segmentation when encountering text in a script that requires the model, which will be + /// incorrect. + /// + /// If multiple models for the same language and segmentation type (dictionary/LSTM) are + /// listed, the first one will be used. + pub fn with_segmenter_models(self, models: impl IntoIterator) -> Self { + Self { + segmenter_models: models.into_iter().collect(), + ..self + } + } + + /// Exports data from the given provider to the given exporter.
+ /// + /// See + /// [`BlobExporter`](icu_provider_blob::export), + /// [`FileSystemExporter`](icu_provider_fs::export), + /// and [`BakedExporter`](crate::baked_exporter). + pub fn export( + self, + provider: &impl ExportableProvider, + mut sink: impl DataExporter, + ) -> Result<(), DataError> { + self.export_dyn(provider, &mut sink) + } + + // Avoids multiple monomorphizations + fn export_dyn( + mut self, + provider: &impl ExportableProvider, + sink: &mut dyn DataExporter, + ) -> Result<(), DataError> { + if self.keys.is_empty() { + log::warn!("No keys selected"); + } + + if matches!(self.fallback, FallbackMode::Preresolved) && self.locales.is_none() { + return Err(DataError::custom( + "FallbackMode::Preresolved requires an explicit locale set", + )); + } + + self.fallback = match self.fallback { + FallbackMode::PreferredForExporter => { + if sink.supports_built_in_fallback() { + FallbackMode::Runtime + } else { + FallbackMode::Hybrid + } + } + f => f, + }; + + log::info!( + "Datagen configured with fallback mode {:?} and these locales: {}", + self.fallback, + match self.locales { + None => "ALL".to_string(), + Some(ref set) => { + let mut list: Vec> = + set.iter().map(Writeable::write_to_string).collect(); + list.sort(); + format!("{:?}", list) + } + } + ); + + let fallbacker = once_cell::sync::Lazy::new(|| { + LocaleFallbacker::try_new_unstable(&provider.as_downcasting()) + }); + + let load_with_fallback = |key, locale: &_| { + log::trace!("Generating key/locale: {key}/{locale:}"); + let mut metadata = DataRequestMetadata::default(); + metadata.silent = true; + // Lazy-compute the fallback iterator so that we don't always require CLDR data + let mut locale_iter: Option = None; + loop { + let req = DataRequest { + locale: locale_iter.as_ref().map(|i| i.get()).unwrap_or(locale), + metadata, + }; + match provider.load_data(key, req) { + Ok(data_response) => { + if let Some(iter) = locale_iter.as_ref() { + if iter.get().is_empty() && !locale.is_empty() { + 
log::debug!("Falling back to und: {key}/{locale}"); + } + } + return Some( + data_response + .take_payload() + .map_err(|e| e.with_req(key, req)), + ); + } + Err(DataError { + kind: DataErrorKind::MissingLocale, + .. + }) => { + if let Some(iter) = locale_iter.as_mut() { + if iter.get().is_empty() { + log::debug!("Could not find data for: {key}/{locale}"); + return None; + } + iter.step(); + } else { + match fallbacker.as_ref() { + Ok(fallbacker) => { + locale_iter = Some( + fallbacker + .for_config(key.fallback_config()) + .fallback_for(locale.clone()), + ) + } + Err(e) => return Some(Err(*e)), + } + } + } + Err(e) => return Some(Err(e.with_req(key, req))), + } + } + }; + + self.keys.clone().into_par_iter().try_for_each(|key| { + log::info!("Generating key {key}"); + + if key.metadata().singleton { + let payload = provider + .load_data(key, Default::default()) + .and_then(DataResponse::take_payload) + .map_err(|e| e.with_req(key, Default::default()))?; + + return sink + .flush_singleton(key, &payload) + .map_err(|e| e.with_req(key, Default::default())); + } + + let locales_to_export = self.select_locales_for_key(provider, key, &fallbacker)?; + + match self.fallback { + FallbackMode::Runtime | FallbackMode::RuntimeManual => { + let payloads = locales_to_export + .into_par_iter() + .filter_map(|locale| { + load_with_fallback(key, &locale) + .map(|r| r.map(|payload| (locale, payload))) + }) + .collect::, _>>()?; + let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; + let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); + payloads.iter().try_for_each(|(locale, payload)| { + let mut iter = fallbacker_with_config.fallback_for(locale.clone()); + while !iter.get().is_empty() { + iter.step(); + if payloads.get(iter.get()) == Some(payload) { + // Found a match: don't need to write anything + log::trace!( + "Deduplicating {key}/{locale} (inherits from {})", + iter.get() + ); + return Ok(()); + } + } + // Did not find a match: export this payload + 
sink.put_payload(key, locale, payload).map_err(|e| { + e.with_req( + key, + DataRequest { + locale, + metadata: Default::default(), + }, + ) + }) + })? + } + FallbackMode::Hybrid | FallbackMode::Preresolved => { + locales_to_export.into_par_iter().try_for_each(|locale| { + if let Some(payload) = load_with_fallback(key, &locale) { + sink.put_payload(key, &locale, &payload?) + } else { + Ok(()) + } + .map_err(|e| { + e.with_req( + key, + DataRequest { + locale: &locale, + metadata: Default::default(), + }, + ) + }) + })? + } + FallbackMode::PreferredForExporter => unreachable!("resolved"), + }; + + if self.fallback == FallbackMode::Runtime { + sink.flush_with_built_in_fallback(key, BuiltInFallbackMode::Standard) + } else { + sink.flush(key) + } + .map_err(|e| e.with_key(key)) + })?; + + sink.close() + } + + /// Selects the maximal set of locales to export based on a [`DataKey`] and this datagen + /// provider's options bag. The locales may be later optionally deduplicated for fallback. + fn select_locales_for_key( + &self, + provider: &impl ExportableProvider, + key: DataKey, + fallbacker: &Lazy< + Result, + impl FnOnce() -> Result, + >, + ) -> Result, DataError> { + let mut locales = provider + .supported_locales_for_key(key) + .map_err(|e| e.with_key(key))? 
+ .into_iter() + .collect::>(); + + if key == icu_segmenter::provider::DictionaryForWordOnlyAutoV1Marker::KEY + || key == icu_segmenter::provider::DictionaryForWordLineExtendedV1Marker::KEY + { + locales.retain(|locale| { + let model = + crate::transform::segmenter::dictionary::data_locale_to_model_name(locale); + self.segmenter_models + .iter() + .any(|m| Some(m.as_ref()) == model) + }); + // Don't perform additional locale filtering + return Ok(locales); + } else if key == icu_segmenter::provider::LstmForWordLineAutoV1Marker::KEY { + locales.retain(|locale| { + let model = crate::transform::segmenter::lstm::data_locale_to_model_name(locale); + self.segmenter_models + .iter() + .any(|m| Some(m.as_ref()) == model) + }); + // Don't perform additional locale filtering + return Ok(locales); + } else if key == icu_collator::provider::CollationDataV1Marker::KEY + || key == icu_collator::provider::CollationDiacriticsV1Marker::KEY + || key == icu_collator::provider::CollationJamoV1Marker::KEY + || key == icu_collator::provider::CollationMetadataV1Marker::KEY + || key == icu_collator::provider::CollationReorderingV1Marker::KEY + || key == icu_collator::provider::CollationSpecialPrimariesV1Marker::KEY + { + locales.retain(|locale| { + let Some(collation) = locale + .get_unicode_ext(&key!("co")) + .and_then(|co| co.as_single_subtag().copied()) + else { return true }; + self.collations.contains(collation.as_str()) + || if collation.starts_with("search") { + self.collations.contains("search*") + } else { + !["big5han", "gb2312"].contains(&collation.as_str()) + } + }); + } + + locales = match (&self.locales, self.fallback) { + // Case 1: `None` simply exports all supported locales for this key. + (None, _) => locales, + // Case 2: `FallbackMode::Preresolved` exports all supported locales whose langid matches + // one of the explicit locales. This ensures extensions are included. 
In addition, any + // explicit locales are added to the list, even if they themselves don't contain data; + // fallback should be performed upon exporting. + (Some(explicit), FallbackMode::Preresolved) => locales + .into_iter() + .chain(explicit.iter().map(|langid| langid.into())) + .filter(|locale| explicit.contains(&locale.get_langid())) + .collect(), + // Case 3: All other modes resolve to the "ancestors and descendants" strategy. + (Some(explicit), _) => { + let include_und = explicit.contains(&LanguageIdentifier::UND); + let explicit: HashSet = explicit.iter().map(DataLocale::from).collect(); + let mut implicit = HashSet::new(); + // TODO: Make including the default locale configurable + implicit.insert(DataLocale::default()); + let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; + let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); + + for locale in explicit.iter() { + let mut iter = fallbacker_with_config.fallback_for(locale.clone()); + while !iter.get().is_empty() { + implicit.insert(iter.get().clone()); + iter.step(); + } + } + + locales + .into_iter() + .chain(explicit.iter().cloned()) + .filter(|locale| { + if implicit.contains(locale) { + return true; + } + if explicit.contains(locale) { + return true; + } + if locale.is_langid_und() && include_und { + return true; + } + if locale.language().is_empty() + && matches!( + key.fallback_config().priority, + icu_provider::FallbackPriority::Region + ) + { + return true; + } + // Special case: skeletons *require* the -u-ca keyword, so don't export locales that don't have it + // This would get caught later on, but it makes datagen faster and quieter to catch it here + if key + == icu_datetime::provider::calendar::DateSkeletonPatternsV1Marker::KEY + && !locale.has_unicode_ext() + { + return false; + } + let mut iter = fallbacker_with_config.fallback_for(locale.clone()); + while !iter.get().is_empty() { + if explicit.contains(iter.get()) { + return true; + } + iter.step(); + } + 
log::trace!("Filtered out: {key}/{locale}"); + false + }) + .collect() + } + }; + + Ok(locales) + } +} + +#[test] +fn test_collation_filtering() { + use icu_locid::langid; + use std::collections::BTreeSet; + + #[derive(Debug)] + struct TestCase<'a> { + include_collations: &'a [&'a str], + language: LanguageIdentifier, + expected: &'a [&'a str], + } + let cases = [ + TestCase { + include_collations: &[], + language: langid!("zh"), + expected: &["zh", "zh-u-co-stroke", "zh-u-co-unihan", "zh-u-co-zhuyin"], + }, + TestCase { + include_collations: &["gb2312"], + language: langid!("zh"), + expected: &[ + "zh", + "zh-u-co-gb2312", + "zh-u-co-stroke", + "zh-u-co-unihan", + "zh-u-co-zhuyin", + ], + }, + TestCase { + include_collations: &["big5han"], + language: langid!("zh"), + expected: &[ + "zh", + "zh-u-co-big5han", + "zh-u-co-stroke", + "zh-u-co-unihan", + "zh-u-co-zhuyin", + ], + }, + TestCase { + include_collations: &["gb2312", "search*"], + language: langid!("zh"), + expected: &[ + "zh", + "zh-u-co-gb2312", + "zh-u-co-stroke", + "zh-u-co-unihan", + "zh-u-co-zhuyin", + ], + }, + TestCase { + include_collations: &[], + language: langid!("ko"), + expected: &["ko", "ko-u-co-unihan"], + }, + TestCase { + include_collations: &["search"], + language: langid!("ko"), + expected: &["ko", "ko-u-co-search", "ko-u-co-unihan"], + }, + TestCase { + include_collations: &["searchjl"], + language: langid!("ko"), + expected: &["ko", "ko-u-co-searchjl", "ko-u-co-unihan"], + }, + TestCase { + include_collations: &["search", "searchjl"], + language: langid!("ko"), + expected: &["ko", "ko-u-co-search", "ko-u-co-searchjl", "ko-u-co-unihan"], + }, + TestCase { + include_collations: &["search*", "big5han"], + language: langid!("ko"), + expected: &["ko", "ko-u-co-search", "ko-u-co-searchjl", "ko-u-co-unihan"], + }, + ]; + for cas in cases { + let resolved_locales = DatagenDriver::new() + .with_collations(cas.include_collations.iter().copied().map(String::from)) + 
.with_locales([cas.language.clone()]) + .with_fallback_mode(FallbackMode::Preresolved) + .select_locales_for_key( + &crate::DatagenProvider::latest_tested_offline_subset(), + icu_collator::provider::CollationDataV1Marker::KEY, + &once_cell::sync::Lazy::new(|| unreachable!()), + ) + .unwrap() + .into_iter() + .map(|l| l.to_string()) + .collect::>(); + let expected_locales = cas + .expected + .iter() + .copied() + .map(String::from) + .collect::>(); + assert_eq!(resolved_locales, expected_locales, "{cas:?}"); + } +} diff --git a/provider/datagen/src/error.rs b/provider/datagen/src/error.rs index de3f9652467..d5aea3e36e6 100644 --- a/provider/datagen/src/error.rs +++ b/provider/datagen/src/error.rs @@ -10,6 +10,9 @@ pub(crate) const MISSING_CLDR_ERROR: DataError = pub(crate) const MISSING_ICUEXPORT_ERROR: DataError = DataErrorKind::MissingSourceData.with_str_context("icuexport"); +pub(crate) const MISSING_SEGMENTER_LSTM_ERROR: DataError = + DataErrorKind::MissingSourceData.with_str_context("segmenter"); + /// Identifies errors that are due to missing CLDR data. pub fn is_missing_cldr_error(mut e: DataError) -> bool { e.key = None; @@ -21,3 +24,9 @@ pub fn is_missing_icuexport_error(mut e: DataError) -> bool { e.key = None; e == MISSING_ICUEXPORT_ERROR } + +/// Identifies errors that are due to missing segmenter LSTM data. +pub fn is_missing_segmenter_lstm_error(mut e: DataError) -> bool { + e.key = None; + e == MISSING_SEGMENTER_LSTM_ERROR +} diff --git a/provider/datagen/src/lib.rs b/provider/datagen/src/lib.rs index e45e9e11294..5319bec8440 100644 --- a/provider/datagen/src/lib.rs +++ b/provider/datagen/src/lib.rs @@ -17,21 +17,13 @@ //! //! ```no_run //! use icu_datagen::prelude::*; -//! use icu_provider_blob::export::*; +//! use icu_datagen::blob_exporter::*; //! use std::fs::File; //! -//! fn main() { -//! DatagenProvider::default() -//! .export( -//! { -//! let mut options = options::Options::default(); -//! 
options.keys = [icu::list::provider::AndListV1Marker::KEY].into_iter().collect(); -//! options -//! }, -//! BlobExporter::new_with_sink(Box::new(File::create("data.postcard").unwrap())), -//! ) -//! .unwrap(); -//! } +//! DatagenDriver::new() +//! .with_keys([icu::list::provider::AndListV1Marker::KEY]) +//! .export(&DatagenProvider::latest_tested(), BlobExporter::new_with_sink(Box::new(File::create("data.postcard").unwrap()))) +//! .unwrap(); //! ``` //! //! ## Command line @@ -68,18 +60,22 @@ )] #![warn(missing_docs)] +mod driver; mod error; +mod provider; mod registry; mod source; mod transform; -pub use error::{is_missing_cldr_error, is_missing_icuexport_error}; +pub use driver::DatagenDriver; +pub use error::{ + is_missing_cldr_error, is_missing_icuexport_error, is_missing_segmenter_lstm_error, +}; +pub use provider::DatagenProvider; +#[doc(hidden)] // for CLI serde +pub use provider::TrieType; #[allow(deprecated)] // ugh pub use registry::{all_keys, all_keys_with_experimental, deserialize_and_measure, key}; -pub use source::CollationHanDatabase; -pub use source::SourceData; -#[doc(hidden)] // for CLI serde -pub use source::TrieType; #[cfg(feature = "provider_baked")] pub mod baked_exporter; @@ -88,12 +84,12 @@ pub use icu_provider_blob::export as blob_exporter; #[cfg(feature = "provider_fs")] pub use icu_provider_fs::export as fs_exporter; -pub mod options; - /// A prelude for using the datagen API pub mod prelude { #[doc(no_inline)] - pub use crate::{options, DatagenProvider, SourceData}; + pub use crate::{ + CollationHanDatabase, CoverageLevel, DatagenDriver, DatagenProvider, FallbackMode, + }; #[doc(no_inline)] pub use icu_locid::{langid, LanguageIdentifier}; #[doc(no_inline)] @@ -101,30 +97,13 @@ pub mod prelude { // SEMVER GRAVEYARD #[cfg(feature = "legacy_api")] - #[doc(hidden)] - pub use crate::options::CoverageLevel; - #[cfg(feature = "legacy_api")] - #[doc(hidden)] - pub use crate::source::CollationHanDatabase; - #[cfg(feature = "legacy_api")] 
#[allow(deprecated)] #[doc(hidden)] - pub use crate::{syntax, BakedOptions, CldrLocaleSubset, Out}; + pub use crate::{syntax, BakedOptions, CldrLocaleSubset, Out, SourceData}; } -use icu_locid::LanguageIdentifier; -use icu_locid_transform::fallback::LocaleFallbackIterator; -use icu_locid_transform::fallback::LocaleFallbacker; -use icu_provider::datagen::*; use icu_provider::prelude::*; -use memchr::memmem; -use once_cell::sync::Lazy; -use options::{FallbackMode, LocaleInclude}; -use std::borrow::Cow; -use std::collections::HashMap; -use std::collections::HashSet; use std::path::Path; -use writeable::Writeable; #[cfg(feature = "rayon")] pub(crate) use rayon::prelude as rayon_prelude; @@ -139,400 +118,139 @@ pub(crate) mod rayon_prelude { impl IntoParallelIterator for T {} } -/// [`DataProvider`] backed by [`SourceData`] +/// Defines how fallback will apply to the generated data. /// -/// If `source` does not contain a specific data source, `DataProvider::load` will -/// error ([`is_missing_cldr_error`](crate::is_missing_cldr_error) / -/// [`is_missing_icuexport_error`](crate::is_missing_icuexport_error)) if the data is -/// required for that key. -#[derive(Debug, Clone)] -#[cfg_attr(feature = "networking", derive(Default))] -#[cfg_attr(not(doc), allow(clippy::exhaustive_structs))] -#[cfg_attr(doc, non_exhaustive)] -pub struct DatagenProvider { - #[doc(hidden)] - pub source: SourceData, +/// If in doubt, use [`FallbackMode::PreferredForExporter`], which selects the best mode for your +/// chosen data provider. +/// +/// # Fallback Mode Comparison +/// +/// The modes differ primarily in their approaches to runtime fallback and data size. 
+/// +/// | Mode | Runtime Fallback | Data Size | +/// |---|---|---| +/// | [`Runtime`] | Yes, Automatic | Smallest | +/// | [`RuntimeManual`] | Yes, Manual | Smallest | +/// | [`Preresolved`] | No | Small | +/// | [`Hybrid`] | Optional | Medium | +/// +/// If you are not 100% certain of the closed set of locales you need at runtime, you should +/// use a provider with runtime fallback enabled. +/// +/// [`Runtime`]: FallbackMode::Runtime +/// [`RuntimeManual`]: FallbackMode::RuntimeManual +/// [`Preresolved`]: FallbackMode::Preresolved +/// [`Hybrid`]: FallbackMode::Hybrid +#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)] +#[non_exhaustive] +#[serde(rename_all = "camelCase")] +pub enum FallbackMode { + /// Selects the fallback mode based on [`DataExporter::supports_built_in_fallback()`]( + /// icu_provider::datagen::DataExporter::supports_built_in_fallback()), resolving to either + /// [`Runtime`] or [`Hybrid`]. + /// + /// [`Runtime`]: Self::Runtime + /// [`Hybrid`]: Self::Hybrid + #[default] + PreferredForExporter, + /// This mode generates the minimal set of locales that cover the requested locales when + /// fallback is used at runtime. For example, if "en" and "en-US" are both requested but + /// they contain the same value, only "en" will be included, since "en-US" falls back to + /// "en" at runtime. + /// + /// If an explicit list of locales is used, this mode includes all ancestors and descendants + /// (usually regional variants) of the explicitly listed locales. For example, if "pt-PT" is + /// requested, then "pt", "pt-PT", and children like "pt-MO" will be included. Note that the + /// children of "pt-PT" usually inherit from it and therefore don't take up a significant + /// amount of space in the data file. + /// + /// This mode is only supported with the baked data provider, and it builds fallback logic + /// into the generated code. 
To use this mode with other providers that don't bundle fallback + /// logic, use [`FallbackMode::RuntimeManual`] or [`FallbackMode::Hybrid`]. + /// + /// This is the default fallback mode for the baked provider. + Runtime, + /// Same as [`FallbackMode::Runtime`] except that the fallback logic is not included in the + /// generated code. It must be enabled manually with a [`LocaleFallbackProvider`]. + /// + /// This mode is supported on all data provider implementations. + /// + /// [`LocaleFallbackProvider`]: icu_provider_adapters::fallback::LocaleFallbackProvider + RuntimeManual, + /// This mode generates data for exactly the supplied locales. If data doesn't exist for a + /// locale, fallback will be performed and the fallback value will be exported. + /// + /// Requires using an explicit list of locales. + /// + /// Note: in data exporters that deduplicate values (such as `BakedExporter` and + /// `BlobDataExporter`), the impact on data size as compared to [`FallbackMode::Runtime`] + /// is limited to the pointers in the explicitly listed locales. + /// + /// Data generated in this mode can be used without runtime fallback and guarantees that all + /// locales are present. If you wish to also support locales that were not explicitly listed + /// with runtime fallback, see [`FallbackMode::Hybrid`]. + Preresolved, + /// This mode passes through CLDR data without performing locale deduplication. + /// + /// If an explicit list of locales is used, this mode includes all ancestors and descendants + /// (usually regional variants) of the explicitly listed locales. For example, if "pt-PT" is + /// requested, then "pt", "pt-PT", and children like "pt-MO" will be included. + /// + /// Note: in data exporters that deduplicate values (such as `BakedExporter` and + /// `BlobDataExporter`), the impact on data size as compared to [`FallbackMode::Runtime`] + /// is limited to the pointers in the explicitly listed locales. 
+ /// + /// Data generated in this mode is suitable for use with or without runtime fallback. To + /// enable runtime fallback, use a [`LocaleFallbackProvider`]. + /// + /// This is the default fallback mode for the blob and filesystem providers. + /// + /// [`LocaleFallbackProvider`]: icu_provider_adapters::fallback::LocaleFallbackProvider + Hybrid, } -impl DatagenProvider { - /// Creates a new data provider with the given `source`. - pub fn new(source: SourceData) -> Self { - Self { source } - } - - #[cfg(test)] - pub fn for_test() -> Self { - use once_cell::sync::OnceCell; - - static TEST_PROVIDER: OnceCell = OnceCell::new(); - // Singleton so that all instantiations share the same cache. - TEST_PROVIDER - .get_or_init(|| { - let data_root = - std::path::Path::new(core::env!("CARGO_MANIFEST_DIR")).join("tests/data"); - DatagenProvider { - // This is equivalent to `latest_tested` for the files defined in - // `tools/testdata-scripts/globs.rs.data`. - source: SourceData::offline() - .with_cldr(data_root.join("cldr"), Default::default()) - .unwrap() - .with_icuexport(data_root.join("icuexport")) - .unwrap(), - } - }) - .clone() - } - - /// Selects the maximal set of locales to export based on a [`DataKey`] and this datagen - /// provider's options bag. The locales may be later optionally deduplicated for fallback. - pub(crate) fn select_locales_for_key( - &self, - key: DataKey, - options: &options::Options, - fallbacker: &Lazy< - Result, - impl FnOnce() -> Result, - >, - ) -> Result, DataError> { - let mut locales = self - .supported_locales_for_key(key) - .map_err(|e| e.with_key(key))? 
- .into_iter() - .collect::>(); - - if key == icu_segmenter::provider::DictionaryForWordOnlyAutoV1Marker::KEY - || key == icu_segmenter::provider::DictionaryForWordLineExtendedV1Marker::KEY - { - // Segmenter: filter only by segmenter_models - return Ok(transform::segmenter::dictionary::filter_data_locales( - locales, - &options.segmenter_models, - )); - } else if key == icu_segmenter::provider::LstmForWordLineAutoV1Marker::KEY { - // Segmenter: filter only by segmenter_models - return Ok(transform::segmenter::lstm::filter_data_locales( - locales, - &options.segmenter_models, - )); - } else if key == icu_collator::provider::CollationDataV1Marker::KEY - || key == icu_collator::provider::CollationDiacriticsV1Marker::KEY - || key == icu_collator::provider::CollationJamoV1Marker::KEY - || key == icu_collator::provider::CollationMetadataV1Marker::KEY - || key == icu_collator::provider::CollationReorderingV1Marker::KEY - || key == icu_collator::provider::CollationSpecialPrimariesV1Marker::KEY - { - // Collator: filter by collations, but also by locales/fallback - locales = - transform::icuexport::collator::filter_data_locales(locales, &options.collations); - } - - locales = match (&options.locales, options.fallback) { - // Case 1: `LocaleInclude::All` simply exports all supported locales for this key. - (LocaleInclude::All, _) => locales, - // Case 2: `FallbackMode::Preresolved` exports all supported locales whose langid matches - // one of the explicit locales. This ensures extensions are included. In addition, any - // explicit locales are added to the list, even if they themselves don't contain data; - // fallback should be performed upon exporting. - (LocaleInclude::Explicit(explicit), FallbackMode::Preresolved) => locales - .into_iter() - .chain(explicit.iter().map(|langid| langid.into())) - .filter(|locale| explicit.contains(&locale.get_langid())) - .collect(), - // Case 3: All other modes resolve to the "ancestors and descendants" strategy. 
- (LocaleInclude::Explicit(explicit), _) => { - let include_und = explicit.contains(&LanguageIdentifier::UND); - let explicit: HashSet = explicit.iter().map(DataLocale::from).collect(); - let mut implicit = HashSet::new(); - // TODO: Make including the default locale configurable - implicit.insert(DataLocale::default()); - let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; - let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); - - for locale in explicit.iter() { - let mut iter = fallbacker_with_config.fallback_for(locale.clone()); - while !iter.get().is_empty() { - implicit.insert(iter.get().clone()); - iter.step(); - } - } - - locales - .into_iter() - .chain(explicit.iter().cloned()) - .filter(|locale| { - if implicit.contains(locale) { - return true; - } - if explicit.contains(locale) { - return true; - } - if locale.is_langid_und() && include_und { - return true; - } - if locale.language().is_empty() - && matches!( - key.fallback_config().priority, - icu_provider::FallbackPriority::Region - ) - { - return true; - } - // Special case: skeletons *require* the -u-ca keyword, so don't export locales that don't have it - // This would get caught later on, but it makes datagen faster and quieter to catch it here - if key - == icu_datetime::provider::calendar::DateSkeletonPatternsV1Marker::KEY - && !locale.has_unicode_ext() - { - return false; - } - let mut iter = fallbacker_with_config.fallback_for(locale.clone()); - while !iter.get().is_empty() { - if explicit.contains(iter.get()) { - return true; - } - iter.step(); - } - log::trace!("Filtered out: {key}/{locale}"); - false - }) - .collect() - } - _ => unreachable!("Pre-processed LocaleInclude has only 2 variants"), - }; - - Ok(locales) - } +/// Specifies the collation Han database to use. +/// +/// Unihan is more precise but significantly increases data size. 
See +/// +#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)] +#[non_exhaustive] +pub enum CollationHanDatabase { + /// Implicit + #[serde(rename = "implicit")] + #[default] + Implicit, + /// Unihan + #[serde(rename = "unihan")] + Unihan, +} - /// Loads a `DataPayload` with locale fallback enabled. - fn load_with_fallback( - &self, - key: DataKey, - locale: &DataLocale, - fallbacker: &Lazy< - Result, - impl FnOnce() -> Result, - >, - ) -> Result>, DataError> { - log::trace!("Generating key/locale: {key}/{locale:}"); - let mut metadata = DataRequestMetadata::default(); - metadata.silent = true; - // Lazy-compute the fallback iterator so that we don't always require CLDR data - let mut option_iter: Option = None; - loop { - let req = DataRequest { - locale: match option_iter.as_ref() { - Some(iter) => iter.get(), - None => locale, - }, - metadata, - }; - let result = self.load_data(key, req); - match result { - Ok(data_response) => { - if let Some(iter) = option_iter.as_ref() { - if iter.get().is_empty() && !locale.is_empty() { - log::debug!("Falling back to und: {key}/{locale}"); - } - } - return Ok(Some(data_response.take_payload()?)); - } - Err(DataError { - kind: DataErrorKind::MissingLocale, - .. 
- }) => { - if let Some(iter) = option_iter.as_mut() { - if iter.get().is_empty() { - log::debug!("Could not find data for: {key}/{locale}"); - return Ok(None); - } - iter.step(); - } else { - let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; - let iter = fallbacker - .for_config(key.fallback_config()) - .fallback_for(locale.clone()); - option_iter.replace(iter); - } - } - Err(e) => return Err(e.with_req(key, req)), - } +impl std::fmt::Display for CollationHanDatabase { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + CollationHanDatabase::Implicit => write!(f, "implicithan"), + CollationHanDatabase::Unihan => write!(f, "unihan"), } } +} - /// Exports data for the given options to the given exporter. +/// A language's CLDR coverage level. +#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)] +#[non_exhaustive] +#[serde(rename_all = "camelCase")] +pub enum CoverageLevel { + /// Locales listed as modern coverage targets by the CLDR subcomittee. /// - /// See - /// [`BlobExporter`](icu_provider_blob::export), - /// [`FileSystemExporter`](icu_provider_fs::export), - /// and [`BakedExporter`](crate::baked_exporter). 
- pub fn export( - &self, - mut options: options::Options, - mut exporter: impl DataExporter, - ) -> Result<(), DataError> { - if options.keys.is_empty() { - log::warn!("No keys selected"); - } - - if !self.source.collations.is_empty() - && options.collations - != self - .source - .collations - .iter() - .cloned() - .collect::>() - { - log::warn!("SourceData::with_collations was used and differs from Options#collations (which will be used).") - } - - if matches!(options.fallback, options::FallbackMode::Preresolved) - && !matches!(options.locales, options::LocaleInclude::Explicit(_)) - { - return Err(DataError::custom( - "FallbackMode::Preresolved requires LocaleInclude::Explicit", - )); - } - - options.locales = match core::mem::take(&mut options.locales) { - options::LocaleInclude::None => options::LocaleInclude::Explicit(Default::default()), - options::LocaleInclude::CldrSet(levels) => options::LocaleInclude::Explicit( - self.source - .locales(levels.iter().copied().collect::>().as_slice())? - .into_iter() - .chain(core::iter::once(LanguageIdentifier::UND)) - .collect(), - ), - options::LocaleInclude::Explicit(set) => options::LocaleInclude::Explicit(set), - options::LocaleInclude::All => options::LocaleInclude::All, - options::LocaleInclude::Recommended => options::LocaleInclude::Explicit( - self.source - .locales(&[ - options::CoverageLevel::Modern, - options::CoverageLevel::Moderate, - options::CoverageLevel::Basic, - ])? 
- .into_iter() - .chain(core::iter::once(LanguageIdentifier::UND)) - .collect(), - ), - }; - - options.fallback = match options.fallback { - options::FallbackMode::PreferredForExporter => { - if exporter.supports_built_in_fallback() { - options::FallbackMode::Runtime - } else { - options::FallbackMode::Hybrid - } - } - f => f, - }; - - log::info!( - "Datagen configured with fallback mode {:?} and these locales: {}", - options.fallback, - match options.locales { - options::LocaleInclude::All => "ALL".to_string(), - options::LocaleInclude::Explicit(ref set) => { - let mut list: Vec> = - set.iter().map(Writeable::write_to_string).collect(); - list.sort(); - format!("{:?}", list) - } - _ => unreachable!(), - } - ); - - // Avoid multiple monomorphizations - fn internal( - provider: &DatagenProvider, - mut options: options::Options, - exporter: &mut dyn DataExporter, - ) -> Result<(), DataError> { - use rayon_prelude::*; - - let fallbacker = - once_cell::sync::Lazy::new(|| LocaleFallbacker::try_new_unstable(provider)); - - core::mem::take(&mut options.keys) - .into_par_iter() - .try_for_each(|key| { - log::info!("Generating key {key}"); - - if key.metadata().singleton { - let payload = provider - .load_data(key, Default::default()) - .and_then(DataResponse::take_payload) - .map_err(|e| e.with_req(key, Default::default()))?; - - return exporter - .flush_singleton(key, &payload) - .map_err(|e| e.with_req(key, Default::default())); - } - - let locales_to_export = - provider.select_locales_for_key(key, &options, &fallbacker)?; - - match options.fallback { - options::FallbackMode::Runtime | options::FallbackMode::RuntimeManual => { - let payloads = locales_to_export - .into_par_iter() - .flat_map(|locale| { - match provider.load_with_fallback(key, &locale, &fallbacker) { - Ok(Some(payload)) => Some(Ok((locale, Box::new(payload)))), - Ok(None) => None, - Err(e) => Some(Err(e)), - } - }) - .collect::, _>>()?; - let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; - let 
fallbacker_with_config = - fallbacker.for_config(key.fallback_config()); - 'outer: for (locale, payload) in payloads.iter() { - let mut iter = fallbacker_with_config.fallback_for(locale.clone()); - while !iter.get().is_empty() { - iter.step(); - if let Some(parent_payload) = payloads.get(iter.get()) { - if parent_payload == payload && locale != iter.get() { - // Found a match: don't need to write anything - log::trace!( - "Deduplicating {key}/{locale} (inherits from {})", - iter.get() - ); - continue 'outer; - } - } - } - // Did not find a match: export this payload - exporter.put_payload(key, locale, payload)?; - } - } - options::FallbackMode::Hybrid | options::FallbackMode::Preresolved => { - locales_to_export.into_par_iter().try_for_each(|locale| { - let payload = - provider.load_with_fallback(key, &locale, &fallbacker)?; - if let Some(payload) = payload { - exporter.put_payload(key, &locale, &payload)?; - } - Ok::<(), DataError>(()) - })?; - } - options::FallbackMode::PreferredForExporter => unreachable!("resolved"), - }; - - match options.fallback { - options::FallbackMode::Runtime => exporter - .flush_with_built_in_fallback(key, BuiltInFallbackMode::Standard), - _ => exporter.flush(key), - } - .map_err(|e| e.with_key(key)) - })?; - - exporter.close() - } - internal(self, options, &mut exporter) - } + /// This is the highest level of coverage. + Modern, + /// Locales listed as moderate coverage targets by the CLDR subcomittee. + /// + /// This is a medium level of coverage. + Moderate, + /// Locales listed as basic coverage targets by the CLDR subcomittee. + /// + /// This is the lowest level of coverage. 
+ Basic, } /// Parses a list of human-readable key identifiers and returns a @@ -612,15 +330,17 @@ pub fn keys_from_file>(path: P) -> std::io::Result> /// # } /// ``` pub fn keys_from_bin>(path: P) -> std::io::Result> { + use memchr::memmem::*; + let file = std::fs::read(path.as_ref())?; let file = file.as_slice(); const LEADING_TAG: &[u8] = icu_provider::leading_tag!().as_bytes(); const TRAILING_TAG: &[u8] = icu_provider::trailing_tag!().as_bytes(); - let trailing_tag = memmem::Finder::new(TRAILING_TAG); + let trailing_tag = Finder::new(TRAILING_TAG); - let mut result: Vec = memmem::find_iter(file, LEADING_TAG) + let mut result: Vec = find_iter(file, LEADING_TAG) .map(|tag_position| tag_position + LEADING_TAG.len()) .map(|key_start| &file[key_start..]) .filter_map(move |key_fragment| { @@ -639,13 +359,15 @@ pub fn keys_from_bin>(path: P) -> std::io::Result> { Ok(result) } +#[deprecated(since = "1.3.0", note = "use `DatagenDriver`")] +#[allow(deprecated)] +#[cfg(feature = "legacy_api")] +pub use provider::SourceData; + /// Requires `legacy_api` Cargo feature /// /// The output format. 
-#[deprecated( - since = "1.3.0", - note = "use `DatagenProvider::export` with self-constructed `DataExporter`s" -)] +#[deprecated(since = "1.3.0", note = "use `DatagenDriver`")] #[non_exhaustive] #[cfg(feature = "legacy_api")] pub enum Out { @@ -723,7 +445,7 @@ impl core::fmt::Debug for Out { } } -#[deprecated(since = "1.3.0", note = "use `DatagenProvider::export`")] +#[deprecated(since = "1.3.0", note = "use `DatagenDriver`")] #[cfg(feature = "legacy_api")] #[allow(deprecated)] /// Requires `legacy_api` Cargo feature @@ -743,90 +465,89 @@ pub fn datagen( source: &SourceData, outs: Vec, ) -> Result<(), DataError> { - use options::*; - - DatagenProvider::new(source.clone()).export( - Options { - keys: keys.iter().cloned().collect(), - locales: locales - .map(|ls| { - LocaleInclude::Explicit( - ls.iter() - .cloned() - .chain([icu_locid::LanguageIdentifier::UND]) - .collect(), - ) - }) - .unwrap_or(options::LocaleInclude::All), - segmenter_models: match locales { - None => options::SegmenterModelInclude::Recommended, - Some(list) => options::SegmenterModelInclude::Explicit({ - let mut models = vec![]; - for locale in list { - let locale = locale.into(); - if let Some(model) = - transform::segmenter::lstm::data_locale_to_model_name(&locale) - { - models.push(model.into()); - } - if let Some(model) = - transform::segmenter::dictionary::data_locale_to_model_name(&locale) - { - models.push(model.into()); - } + let exporter = DatagenDriver::new() + .with_keys(keys.iter().cloned()) + .with_fallback_mode(FallbackMode::Hybrid) + .with_collations(source.collations.clone()); + match locales { + Some(locales) => exporter + .with_locales( + locales + .iter() + .cloned() + .chain([icu_locid::LanguageIdentifier::UND]), + ) + .with_segmenter_models({ + let mut models = vec![]; + for locale in locales { + let locale = locale.into(); + if let Some(model) = + transform::segmenter::lstm::data_locale_to_model_name(&locale) + { + models.push(model.into()); } - models - }), - }, - 
collations: source.collations.iter().cloned().collect(), - fallback: FallbackMode::Hybrid, + if let Some(model) = + transform::segmenter::dictionary::data_locale_to_model_name(&locale) + { + models.push(model.into()); + } + } + models + }), + _ => exporter.with_all_locales(), + } + .export( + &DatagenProvider { + source: source.clone(), }, - MultiExporter::new( + icu_provider::datagen::MultiExporter::new( outs.into_iter() - .map(|out| -> Result, DataError> { - use baked_exporter::*; - use icu_provider_blob::export::*; - use icu_provider_fs::export::*; - - Ok(match out { - Out::Fs { - output_path, - serializer, - overwrite, - fingerprint, - } => { - let mut options = ExporterOptions::default(); - options.root = output_path; - if overwrite { - options.overwrite = OverwriteOption::RemoveAndReplace + .map( + |out| -> Result, DataError> { + use baked_exporter::*; + use icu_provider_blob::export::*; + use icu_provider_fs::export::*; + + Ok(match out { + Out::Fs { + output_path, + serializer, + overwrite, + fingerprint, + } => { + let mut options = ExporterOptions::default(); + options.root = output_path; + if overwrite { + options.overwrite = OverwriteOption::RemoveAndReplace + } + options.fingerprint = fingerprint; + Box::new(FilesystemExporter::try_new(serializer, options)?) } - options.fingerprint = fingerprint; - Box::new(FilesystemExporter::try_new(serializer, options)?) 
- } - Out::Blob(write) => Box::new(BlobExporter::new_with_sink(write)), - Out::Baked { - mod_directory, - options, - } => Box::new(BakedExporter::new(mod_directory, options)?), - #[allow(deprecated)] - Out::Module { - mod_directory, - pretty, - insert_feature_gates, - use_separate_crates, - } => Box::new(BakedExporter::new( - mod_directory, - Options { + Out::Blob(write) => Box::new(BlobExporter::new_with_sink(write)), + Out::Baked { + mod_directory, + options, + } => Box::new(BakedExporter::new(mod_directory, options)?), + #[allow(deprecated)] + Out::Module { + mod_directory, pretty, insert_feature_gates, use_separate_crates, - // Note: overwrite behavior was `true` in 1.0 but `false` in 1.1; - // 1.1.2 made it an option in Options. - overwrite: false, - }, - )?), - }) - }) + } => Box::new(BakedExporter::new( + mod_directory, + Options { + pretty, + insert_feature_gates, + use_separate_crates, + // Note: overwrite behavior was `true` in 1.0 but `false` in 1.1; + // 1.1.2 made it an option in Options. + overwrite: false, + }, + )?), + }) + }, + ) .collect::>()?, ), ) @@ -892,10 +613,6 @@ fn test_keys_from_bin() { // SEMVER GRAVEYARD -#[cfg(feature = "legacy_api")] -#[doc(hidden)] -pub use source::CoverageLevel; - #[cfg(feature = "legacy_api")] #[doc(hidden)] pub use baked_exporter::Options as BakedOptions; diff --git a/provider/datagen/src/options.rs b/provider/datagen/src/options.rs deleted file mode 100644 index b8f0556ae32..00000000000 --- a/provider/datagen/src/options.rs +++ /dev/null @@ -1,189 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -//! Options bag for [`DatagenProvider`](crate::DatagenProvider). - -pub use crate::transform::cldr::source::CoverageLevel; - -use icu_locid::LanguageIdentifier; -use std::collections::HashSet; - -/// Defines how fallback will apply to the generated data. 
If in doubt, use -/// [`FallbackMode::PreferredForExporter`], which selects the best mode for your -/// chosen data provider. -/// -/// # Fallback Mode Comparison -/// -/// The modes differ primarily in their approaches to runtime fallback and data size. -/// -/// | Mode | Runtime Fallback | Data Size | -/// |---|---|---| -/// | [`Runtime`] | Yes, Automatic | Smallest | -/// | [`RuntimeManual`] | Yes, Manual | Smallest | -/// | [`Preresolved`] | No | Small | -/// | [`Hybrid`] | Optional | Medium | -/// -/// If you are not 100% certain of the closed set of locales you need at runtime, you should -/// use a provider with runtime fallback enabled. -/// -/// [`Runtime`]: FallbackMode::Runtime -/// [`RuntimeManual`]: FallbackMode::RuntimeManual -/// [`Preresolved`]: FallbackMode::Preresolved -/// [`Hybrid`]: FallbackMode::Hybrid -#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)] -#[non_exhaustive] -pub enum FallbackMode { - /// Selects the fallback mode based on [`DataExporter::supports_built_in_fallback()`]( - /// icu_provider::datagen::DataExporter::supports_built_in_fallback()), resolving to either - /// [`Runtime`] or [`Hybrid`]. - /// - /// [`Runtime`]: Self::Runtime - /// [`Hybrid`]: Self::Hybrid - #[default] - PreferredForExporter, - /// This mode generates the minimal set of locales that cover the requested locales when - /// fallback is used at runtime. For example, if "en" and "en-US" are both requested but - /// they contain the same value, only "en" will be included, since "en-US" falls back to - /// "en" at runtime. - /// - /// If [`LocaleInclude::Explicit`] is used, this mode includes all ancestors and descendants - /// (usually regional variants) of the explicitly listed locales. For example, if "pt-PT" is - /// requested, then "pt", "pt-PT", and children like "pt-MO" will be included. 
Note that the - /// children of "pt-PT" usually inherit from it and therefore don't take up a significant - /// amount of space in the data file. - /// - /// This mode is only supported with the baked data provider, and it builds fallback logic - /// into the generated code. To use this mode with other providers that don't bundle fallback - /// logic, use [`FallbackMode::RuntimeManual`] or [`FallbackMode::Hybrid`]. - /// - /// This is the default fallback mode for the baked provider. - Runtime, - /// Same as [`FallbackMode::Runtime`] except that the fallback logic is not included in the - /// generated code. It must be enabled manually with a [`LocaleFallbackProvider`]. - /// - /// This mode is supported on all data provider implementations. - /// - /// [`LocaleFallbackProvider`]: icu_provider_adapters::fallback::LocaleFallbackProvider - RuntimeManual, - /// This mode generates data for exactly the supplied locales. If data doesn't exist for a - /// locale, fallback will be performed and the fallback value will be exported. - /// - /// Requires using [`LocaleInclude::Explicit`]. - /// - /// Note: in data exporters that deduplicate values (such as `BakedExporter` and - /// `BlobDataExporter`), the impact on data size as compared to [`FallbackMode::Runtime`] - /// is limited to the pointers in the explicitly listed locales. - /// - /// Data generated in this mode can be used without runtime fallback and guarantees that all - /// locales are present. If you wish to also support locales that were not explicitly listed - /// with runtime fallback, see [`FallbackMode::Hybrid`]. - Preresolved, - /// This mode passes through CLDR data without performing locale deduplication. - /// - /// If [`LocaleInclude::Explicit`] is used, this mode includes all ancestors and descendants - /// (usually regional variants) of the explicitly listed locales. For example, if "pt-PT" is - /// requested, then "pt", "pt-PT", and children like "pt-MO" will be included. 
- /// - /// Note: in data exporters that deduplicate values (such as `BakedExporter` and - /// `BlobDataExporter`), the impact on data size as compared to [`FallbackMode::Runtime`] - /// is limited to the pointers in the explicitly listed locales. - /// - /// Data generated in this mode is suitable for use with or without runtime fallback. To - /// enable runtime fallback, use a [`LocaleFallbackProvider`]. - /// - /// This is the default fallback mode for the blob and filesystem providers. - /// - /// [`LocaleFallbackProvider`]: icu_provider_adapters::fallback::LocaleFallbackProvider - Hybrid, -} - -/// Options bag for [`DatagenProvider`](crate::DatagenProvider). -#[non_exhaustive] -#[derive(Debug, Clone, PartialEq, Default)] -pub struct Options { - /// The set of keys to generate. See [`icu_datagen::keys`], - /// [`icu_datagen::all_keys`], [`icu_datagen::key`] and [`icu_datagen::keys_from_bin`]. - /// - /// [`icu_datagen::keys`]: crate::keys - /// [`icu_datagen::all_keys`]: crate::all_keys - /// [`icu_datagen::key`]: crate::key - /// [`icu_datagen::keys_from_bin`]: crate::keys_from_bin - pub keys: HashSet, - /// Defines the locales to include - pub locales: LocaleInclude, - /// The collation types to include. - /// - /// The special string `"search*"` causes all search collation tables to be included. - pub collations: HashSet, - /// The type of fallback that the data should be generated for. If locale fallback is - /// used at runtime, smaller data can be generated. - pub fallback: FallbackMode, - /// The segmentation models to include - pub segmenter_models: SegmenterModelInclude, -} - -/// Defines the locales to include -#[non_exhaustive] -#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)] -pub enum LocaleInclude { - /// All locales - All, - /// No locales - None, - /// An explicit set of locales - Explicit(HashSet), - /// All locales with the given CLDR coverage levels - CldrSet(HashSet), - /// A recommended set of locales. 
- /// - /// This currently resolves to `CldrSet({Modern, Moderate, Basic})` but - /// might change in future releases. - Recommended, -} - -impl Default for LocaleInclude { - fn default() -> Self { - Self::All - } -} - -#[non_exhaustive] -#[derive(Debug, PartialEq, Clone, serde::Serialize, serde::Deserialize)] -/// The segmentation models to include -pub enum SegmenterModelInclude { - /// Include the recommended set of models. This will cover all languages supported - /// by ICU4X: Thai, Burmese, Khmer, Lao, Chinese, and Japanese. Both dictionary - /// and LSTM models will be included, to the extent required by the chosen data keys. - Recommended, - /// Include no dictionary or LSTM models. This will make line and word segmenters - /// behave like simple rule-based segmenters, which will be incorrect when handling text - /// that contains Thai, Burmese, Khmer, Lao, Chinese, or Japanese. - None, - /// Include an explicit list of LSTM or dictionary models, to the extent required by the - /// chosen data keys. - /// - /// The currently supported dictionary models are - /// * `cjdict` - /// * `burmesedict` - /// * `khmerdict` - /// * `laodict` - /// * `thaidict` - /// - /// The currently supported LSTM models are - /// * `Burmese_codepoints_exclusive_model4_heavy` - /// * `Khmer_codepoints_exclusive_model4_heavy` - /// * `Lao_codepoints_exclusive_model4_heavy` - /// * `Thai_codepoints_exclusive_model4_heavy` - /// - /// If a model is not included, the resulting line or word segmenter will apply rule-based - /// segmentation when encountering text in a script that requires the model, which will be - /// incorrect. - Explicit(Vec), -} - -impl Default for SegmenterModelInclude { - fn default() -> Self { - Self::Recommended - } -} diff --git a/provider/datagen/src/provider.rs b/provider/datagen/src/provider.rs new file mode 100644 index 00000000000..5660892844a --- /dev/null +++ b/provider/datagen/src/provider.rs @@ -0,0 +1,478 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +#![allow(deprecated)] + +use crate::source::*; +use crate::transform::cldr::source::CldrCache; +use crate::{CollationHanDatabase, CoverageLevel}; +use icu_provider::prelude::*; +use std::fmt::Debug; +use std::path::PathBuf; +use std::sync::Arc; + +/// A [`DataProvider`] backed by raw CLDR and ICU data. +/// +/// This provider covers all keys that are used by ICU4X. It is intended as the canonical +/// provider for [`DatagenDriver::export`](crate::DatagenDriver::export). +/// +/// If a specific data source has not been set, `DataProvider::load` will +/// error ([`is_missing_cldr_error`](crate::is_missing_cldr_error) / +/// [`is_missing_icuexport_error`](crate::is_missing_icuexport_error)) / +/// [`is_missing_segmenter_lstm_error`](crate::is_missing_segmenter_lstm_error)) +/// if the data is required for that key. +#[allow(clippy::exhaustive_structs)] // any information will be added to SourceData +#[derive(Debug, Clone)] +pub struct DatagenProvider { + #[doc(hidden)] // semver + pub source: SourceData, +} + +impl Default for DatagenProvider { + fn default() -> Self { + Self { + source: SourceData { + cldr_paths: None, + icuexport_paths: None, + segmenter_lstm_paths: None, + trie_type: Default::default(), + collation_han_database: Default::default(), + #[cfg(feature = "legacy_api")] + icuexport_dictionary_fallback: None, + #[cfg(feature = "legacy_api")] + collations: Default::default(), + }, + } + } +} + +impl DatagenProvider { + /// The latest CLDR JSON tag that has been verified to work with this version of `icu_datagen`. + pub const LATEST_TESTED_CLDR_TAG: &'static str = "43.1.0"; + + /// The latest ICU export tag that has been verified to work with this version of `icu_datagen`. 
+ pub const LATEST_TESTED_ICUEXPORT_TAG: &'static str = "icu4x/2023-05-02/73.x"; + + /// The latest segmentation LSTM model tag that has been verified to work with this version of `icu_datagen`. + pub const LATEST_TESTED_SEGMENTER_LSTM_TAG: &'static str = "v0.1.0"; + + /// A provider using the latest data that has been verified to work with this version of `icu_datagen`. + /// + /// See [`DatagenProvider::LATEST_TESTED_CLDR_TAG`], [`DatagenProvider::LATEST_TESTED_ICUEXPORT_TAG`], + /// [`DatagenProvider::LATEST_TESTED_SEGMENTER_LSTM_TAG`]. + /// + /// ✨ *Enabled with the `networking` Cargo feature.* + #[cfg(feature = "networking")] + pub fn latest_tested() -> Self { + // Singleton so that all instantiations share the same cache. + static SINGLETON: once_cell::sync::OnceCell = + once_cell::sync::OnceCell::new(); + SINGLETON + .get_or_init(|| { + Self::default() + .with_cldr_for_tag(Self::LATEST_TESTED_CLDR_TAG) + .with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG) + .with_segmenter_lstm_for_tag(Self::LATEST_TESTED_SEGMENTER_LSTM_TAG) + }) + .clone() + } + + #[cfg(test)] + pub fn latest_tested_offline_subset() -> Self { + // Singleton so that all instantiations share the same cache. + static SINGLETON: once_cell::sync::OnceCell = + once_cell::sync::OnceCell::new(); + SINGLETON + .get_or_init(|| { + // This is equivalent for the files defined in `tools/testdata-scripts/globs.rs.data`. + let data_root = + std::path::Path::new(core::env!("CARGO_MANIFEST_DIR")).join("tests/data"); + Self::default() + .with_cldr(data_root.join("cldr")) + .unwrap() + .with_icuexport(data_root.join("icuexport")) + .unwrap() + .with_segmenter_lstm(data_root.join("lstm")) + .unwrap() + }) + .clone() + } + + /// Adds CLDR data to this `SourceData`. The root should point to a local + /// `cldr-{tag}-json-full.zip` directory or ZIP file (see + /// [GitHub releases](https://github.com/unicode-org/cldr-json/releases)). 
+ pub fn with_cldr(self, root: PathBuf) -> Result { + Ok(Self { + source: SourceData { + cldr_paths: Some(Arc::new(CldrCache::from_serde_cache(SerdeCache::new( + AbstractFs::new(root)?, + )))), + ..self.source + }, + }) + } + + /// Adds ICU export data to this `SourceData`. The path should point to a local + /// `icuexportdata_{tag}.zip` directory or ZIP file (see [GitHub releases]( + /// https://github.com/unicode-org/icu/releases)). + pub fn with_icuexport(self, root: PathBuf) -> Result { + Ok(Self { + source: SourceData { + icuexport_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new(root)?))), + ..self.source + }, + }) + } + + /// Adds segmenter LSTM data to this `SourceData`. The path should point to a local + /// `models.zip` directory or ZIP file (see [GitHub releases]( + /// https://github.com/unicode-org/lstm_word_segmentation/releases)). + pub fn with_segmenter_lstm(self, root: PathBuf) -> Result { + Ok(Self { + source: SourceData { + segmenter_lstm_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new(root)?))), + ..self.source + }, + }) + } + + /// Adds CLDR data to this `SourceData`. The data will be downloaded from GitHub + /// using the given tag (see [GitHub releases](https://github.com/unicode-org/cldr-json/releases)). + /// + /// Also see: [`LATEST_TESTED_CLDR_TAG`](Self::LATEST_TESTED_CLDR_TAG) + /// + /// ✨ *Enabled with the `networking` Cargo feature.* + #[cfg(feature = "networking")] + pub fn with_cldr_for_tag(self, tag: &str) -> Self { + Self { + source: SourceData { + cldr_paths: Some(Arc::new(CldrCache::from_serde_cache(SerdeCache::new(AbstractFs::new_from_url(format!( + "https://github.com/unicode-org/cldr-json/releases/download/{tag}/cldr-{tag}-json-full.zip", + )))))), + ..self.source + } + } + } + + /// Adds ICU export data to this `SourceData`. The data will be downloaded from GitHub + /// using the given tag. (see [GitHub releases](https://github.com/unicode-org/icu/releases)). 
+ /// + /// Also see: [`LATEST_TESTED_ICUEXPORT_TAG`](Self::LATEST_TESTED_ICUEXPORT_TAG) + /// + /// ✨ *Enabled with the `networking` Cargo feature.* + #[cfg(feature = "networking")] + pub fn with_icuexport_for_tag(self, mut tag: &str) -> Self { + if tag == "release-71-1" { + tag = "icu4x/2022-08-17/71.x"; + } + Self { + source: SourceData { + icuexport_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new_from_url(format!( + "https://github.com/unicode-org/icu/releases/download/{tag}/icuexportdata_{}.zip", + tag.replace('/', "-") + ))))), + ..self.source + } + } + } + + /// Adds segmenter LSTM data to this `SourceData`. The data will be downloaded from GitHub + /// using the given tag. (see [GitHub releases](https://github.com/unicode-org/lstm_word_segmentation/releases)). + /// + /// Also see: [`LATEST_TESTED_SEGMENTER_LSTM_TAG`](Self::LATEST_TESTED_SEGMENTER_LSTM_TAG) + /// + /// ✨ *Enabled with the `networking` Cargo feature.* + #[cfg(feature = "networking")] + pub fn with_segmenter_lstm_for_tag(self, tag: &str) -> Self { + Self { source: SourceData { + segmenter_lstm_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new_from_url(format!( + "https://github.com/unicode-org/lstm_word_segmentation/releases/download/{tag}/models.zip" + ))))), + ..self.source } + } + } + + /// Set this to use tries optimized for speed instead of data size + pub fn with_fast_tries(self) -> Self { + Self { + source: SourceData { + trie_type: TrieType::Fast, + ..self.source + }, + } + } + + /// Set the [`CollationHanDatabase`] version. 
+ pub fn with_collation_han_database(self, collation_han_database: CollationHanDatabase) -> Self { + Self { + source: SourceData { + collation_han_database, + ..self.source + }, + } + } + + pub(crate) fn cldr(&self) -> Result<&CldrCache, DataError> { + self.source + .cldr_paths + .as_deref() + .ok_or(crate::error::MISSING_CLDR_ERROR) + } + + pub(crate) fn icuexport(&self) -> Result<&SerdeCache, DataError> { + self.source + .icuexport_paths + .as_deref() + .ok_or(crate::error::MISSING_ICUEXPORT_ERROR) + } + + pub(crate) fn segmenter_lstm(&self) -> Result<&SerdeCache, DataError> { + self.source + .segmenter_lstm_paths + .as_deref() + .ok_or(crate::error::MISSING_SEGMENTER_LSTM_ERROR) + } + + pub(crate) fn trie_type(&self) -> TrieType { + self.source.trie_type + } + + pub(crate) fn collation_han_database(&self) -> CollationHanDatabase { + self.source.collation_han_database + } + + /// List the locales for the given CLDR coverage levels + pub fn locales_for_coverage_levels( + &self, + levels: impl IntoIterator, + ) -> Result, DataError> { + self.cldr()?.locales(levels) + } +} + +/// Specifies the trie type to use. +#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)] +#[doc(hidden)] +#[non_exhaustive] +pub enum TrieType { + /// Fast tries are optimized for speed + #[serde(rename = "fast")] + Fast, + /// Small tries are optimized for size + #[serde(rename = "small")] + #[default] + Small, +} + +impl std::fmt::Display for TrieType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + TrieType::Fast => write!(f, "fast"), + TrieType::Small => write!(f, "small"), + } + } +} + +// SEMVER GRAVEYARD + +/// Requires `legacy_api` Cargo feature +/// +/// Bag of options for datagen source data. +/// +/// Warning: this includes hardcoded segmentation data for backwards compatibility. 
+/// It is strongly discouraged to keep using this API, instead use [`DatagenProvider`] +/// and set segmentation data explicitly. +#[derive(Clone, Debug)] +#[non_exhaustive] +#[deprecated(since = "1.3.0", note = "use `DatagenProvider`")] +pub struct SourceData { + cldr_paths: Option>, + icuexport_paths: Option>, + segmenter_lstm_paths: Option>, + trie_type: TrieType, + collation_han_database: CollationHanDatabase, + #[cfg(feature = "legacy_api")] + // populated if constructed through `SourceData` constructor only + pub(crate) icuexport_dictionary_fallback: Option>, + #[cfg(feature = "legacy_api")] + pub(crate) collations: Vec, +} + +#[cfg(feature = "legacy_api")] +impl Default for SourceData { + fn default() -> Self { + Self { + icuexport_dictionary_fallback: Some(Arc::new(SerdeCache::new(AbstractFs::Memory( + [ + ( + "segmenter/dictionary/cjdict.toml", + include_bytes!("../tests/data/icuexport/segmenter/dictionary/cjdict.toml").as_slice(), + ), + ( + "segmenter/dictionary/khmerdict.toml", + include_bytes!("../tests/data/icuexport/segmenter/dictionary/khmerdict.toml").as_slice(), + ), + ( + "segmenter/dictionary/laodict.toml", + include_bytes!("../tests/data/icuexport/segmenter/dictionary/laodict.toml").as_slice(), + ), + ( + "segmenter/dictionary/burmesedict.toml", + include_bytes!("../tests/data/icuexport/segmenter/dictionary/burmesedict.toml").as_slice(), + ), + ( + "segmenter/dictionary/thaidict.toml", + include_bytes!("../tests/data/icuexport/segmenter/dictionary/thaidict.toml").as_slice(), + ), + ] + .into_iter() + .collect(), + )))), + segmenter_lstm_paths: Some(Arc::new(SerdeCache::new(AbstractFs::Memory( + [ + ( + "Khmer_codepoints_exclusive_model4_heavy/weights.json", + include_bytes!( + "../tests/data/lstm/Khmer_codepoints_exclusive_model4_heavy/weights.json" + ) + .as_slice(), + ), + ( + "Lao_codepoints_exclusive_model4_heavy/weights.json", + include_bytes!( + "../tests/data/lstm/Lao_codepoints_exclusive_model4_heavy/weights.json" + ) + .as_slice(), + 
), + ( + "Burmese_codepoints_exclusive_model4_heavy/weights.json", + include_bytes!( + "../tests/data/lstm/Burmese_codepoints_exclusive_model4_heavy/weights.json" + ) + .as_slice(), + ), + ( + "Thai_codepoints_exclusive_model4_heavy/weights.json", + include_bytes!( + "../tests/data/lstm/Thai_codepoints_exclusive_model4_heavy/weights.json" + ) + .as_slice(), + ), + ( + "Thai_graphclust_model4_heavy/weights.json", + include_bytes!("../tests/data/lstm/Thai_graphclust_model4_heavy/weights.json") + .as_slice(), + ), + ] + .into_iter() + .collect(), + )))), + ..DatagenProvider::default().source + } + } +} + +#[cfg(feature = "legacy_api")] +impl SourceData { + /// See [`DatagenProvider::LATEST_TESTED_CLDR_TAG`] + pub const LATEST_TESTED_CLDR_TAG: &'static str = DatagenProvider::LATEST_TESTED_CLDR_TAG; + + /// See [`DatagenProvider::LATEST_TESTED_ICUEXPORT_TAG`] + pub const LATEST_TESTED_ICUEXPORT_TAG: &'static str = + DatagenProvider::LATEST_TESTED_ICUEXPORT_TAG; + + #[cfg(feature = "networking")] + /// See [`DatagenProvider::latest_tested`] + pub fn latest_tested() -> Self { + DatagenProvider::latest_tested().source + } + + /// See [`DatagenProvider::with_cldr`] + pub fn with_cldr( + self, + root: PathBuf, + _use_default_here: crate::CldrLocaleSubset, + ) -> Result { + Ok(DatagenProvider { source: self }.with_cldr(root)?.source) + } + + /// See [`DatagenProvider::with_icuexport`] + pub fn with_icuexport(self, root: PathBuf) -> Result { + Ok(DatagenProvider { source: self } + .with_icuexport(root)? 
+ .source) + } + + #[cfg(feature = "networking")] + /// See [`DatagenProvider::with_cldr_for_tag`] + pub fn with_cldr_for_tag( + self, + tag: &str, + _use_default_here: crate::CldrLocaleSubset, + ) -> Result { + Ok(DatagenProvider { source: self } + .with_cldr_for_tag(tag) + .source) + } + + #[cfg(feature = "networking")] + /// See [`DatagenProvider::with_icuexport_for_tag`] + pub fn with_icuexport_for_tag(self, tag: &str) -> Result { + Ok(DatagenProvider { source: self } + .with_icuexport_for_tag(tag) + .source) + } + + #[deprecated( + since = "1.1.0", + note = "Use `DatagenProvider::with_cldr_for_tag(DatagenProvider::LATEST_TESTED_CLDR_TAG)`" + )] + #[cfg(feature = "networking")] + /// See [`DatagenProvider::with_cldr_for_tag`] + pub fn with_cldr_latest( + self, + _use_default_here: crate::CldrLocaleSubset, + ) -> Result { + self.with_cldr_for_tag(Self::LATEST_TESTED_CLDR_TAG, Default::default()) + } + + #[deprecated( + since = "1.1.0", + note = "Use `DatagenProvider::with_icuexport_for_tag(DatagenProvider::LATEST_TESTED_ICUEXPORT_TAG)`" + )] + #[cfg(feature = "networking")] + /// See [`DatagenProvider::with_icuexport_for_tag`] + pub fn with_icuexport_latest(self) -> Result { + self.with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG) + } + + /// See [`DatagenProvider::with_fast_tries`] + pub fn with_fast_tries(self) -> Self { + DatagenProvider { source: self }.with_fast_tries().source + } + + /// See [`DatagenProvider::with_collation_han_database`] + pub fn with_collation_han_database(self, collation_han_database: CollationHanDatabase) -> Self { + DatagenProvider { source: self } + .with_collation_han_database(collation_han_database) + .source + } + + #[cfg(feature = "legacy_api")] + /// See [`DatagenDriver::with_collations`](crate::DatagenDriver::with_collations) + pub fn with_collations(self, collations: Vec) -> Self { + Self { collations, ..self } + } + + /// List the locales for the given CLDR coverage levels + pub fn locales( + &self, + levels: 
&[CoverageLevel], + ) -> Result, DataError> { + self.cldr_paths + .as_deref() + .ok_or(crate::error::MISSING_CLDR_ERROR)? + .locales(levels.iter().copied()) + } +} diff --git a/provider/datagen/src/source.rs b/provider/datagen/src/source.rs index 68b5b71f316..28a692c9f85 100644 --- a/provider/datagen/src/source.rs +++ b/provider/datagen/src/source.rs @@ -2,12 +2,12 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use crate::transform::cldr::source::CldrCache; -pub use crate::transform::cldr::source::CoverageLevel; use elsa::sync::FrozenMap; use icu_provider::prelude::*; use std::any::Any; -use std::collections::{BTreeMap, HashSet}; +#[cfg(feature = "legacy_api")] +use std::collections::BTreeMap; +use std::collections::HashSet; use std::fmt::Debug; #[cfg(feature = "networking")] use std::fs::File; @@ -17,287 +17,9 @@ use std::io::Cursor; use std::io::Read; use std::path::Path; use std::path::PathBuf; -use std::sync::Arc; use std::sync::RwLock; use zip::ZipArchive; -/// Bag of options for datagen source data. -#[derive(Clone, Debug)] -#[non_exhaustive] -pub struct SourceData { - cldr_paths: Option>, - icuexport_paths: Option>, - icuexport_fallback_paths: Arc, - segmenter_lstm_paths: Arc, - pub(crate) trie_type: TrieType, - pub(crate) collation_han_database: CollationHanDatabase, - pub(crate) collations: Vec, -} - -#[cfg(feature = "networking")] -/// The default [`SourceData`] downloads the latest supported data. -/// -/// Requires `networking` Cargo feature. 
-impl Default for SourceData { - fn default() -> Self { - Self::offline() - .with_cldr_for_tag(Self::LATEST_TESTED_CLDR_TAG, Default::default()) - .unwrap() - .with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG) - .unwrap() - .with_segmenter_lstm_for_tag(Self::LATEST_TESTED_SEGMENTER_LSTM_TAG) - .unwrap() - } -} - -impl SourceData { - /// The latest CLDR JSON tag that has been verified to work with this version of `icu_datagen`. - pub const LATEST_TESTED_CLDR_TAG: &'static str = "43.1.0"; - - /// The latest ICU export tag that has been verified to work with this version of `icu_datagen`. - pub const LATEST_TESTED_ICUEXPORT_TAG: &'static str = "icu4x/2023-05-02/73.x"; - - /// The latest segmentation LSTM model tag that has been verified to work with this version of `icu_datagen`. - pub const LATEST_TESTED_SEGMENTER_LSTM_TAG: &'static str = "v0.1.0"; - - #[doc(hidden)] - #[cfg(feature = "networking")] - #[deprecated(since = "1.3.0", note = "use SourceData::default()")] - pub fn latest_tested() -> Self { - Self::default() - } - - /// Creates a `SourceData` that does not have CLDR or ICU export sources set. - pub fn offline() -> Self { - Self { - cldr_paths: None, - icuexport_paths: None, - icuexport_fallback_paths: Arc::new(SerdeCache::new( - AbstractFs::new_icuexport_fallback(), - )), - segmenter_lstm_paths: Arc::new(SerdeCache::new(AbstractFs::new_lstm_fallback())), - trie_type: Default::default(), - collation_han_database: Default::default(), - collations: Default::default(), - } - } - - /// Adds CLDR data to this `SourceData`. The root should point to a local - /// `cldr-{tag}-json-full.zip` directory or ZIP file (see - /// [GitHub releases](https://github.com/unicode-org/cldr-json/releases)). 
- pub fn with_cldr( - self, - root: PathBuf, - _use_default_here: crate::CldrLocaleSubset, - ) -> Result { - let root = AbstractFs::new(root)?; - Ok(Self { - cldr_paths: Some(Arc::new(CldrCache::from_serde_cache(SerdeCache::new(root)))), - ..self - }) - } - - /// Adds ICU export data to this `SourceData`. The path should point to a local - /// `icuexportdata_{tag}.zip` directory or ZIP file (see [GitHub releases]( - /// https://github.com/unicode-org/icu/releases)). - pub fn with_icuexport(self, root: PathBuf) -> Result { - Ok(Self { - icuexport_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new(root)?))), - ..self - }) - } - - /// Adds segmenter LSTM data to this `SourceData`. The path should point to a local - /// `models.zip` directory or ZIP file (see [GitHub releases]( - /// https://github.com/unicode-org/lstm_word_segmentation/releases)). - pub fn with_segmenter_lstm(self, root: PathBuf) -> Result { - Ok(Self { - segmenter_lstm_paths: Arc::new(SerdeCache::new(AbstractFs::new(root)?)), - ..self - }) - } - - /// Adds CLDR data to this `SourceData`. The data will be downloaded from GitHub - /// using the given tag (see [GitHub releases](https://github.com/unicode-org/cldr-json/releases)). - /// - /// Also see: [`LATEST_TESTED_CLDR_TAG`](Self::LATEST_TESTED_CLDR_TAG) - /// - /// Requires `networking` Cargo feature. - #[cfg(feature = "networking")] - pub fn with_cldr_for_tag( - self, - tag: &str, - _use_default_here: crate::CldrLocaleSubset, - ) -> Result { - Ok(Self { - cldr_paths: Some(Arc::new(CldrCache::from_serde_cache(SerdeCache::new(AbstractFs::new_from_url(format!( - "https://github.com/unicode-org/cldr-json/releases/download/{tag}/cldr-{tag}-json-full.zip", - )))) - )), - ..self - }) - } - - /// Adds ICU export data to this `SourceData`. The data will be downloaded from GitHub - /// using the given tag. (see [GitHub releases](https://github.com/unicode-org/icu/releases)). 
- /// - /// Also see: [`LATEST_TESTED_ICUEXPORT_TAG`](Self::LATEST_TESTED_ICUEXPORT_TAG) - /// - /// Requires `networking` Cargo feature. - #[cfg(feature = "networking")] - pub fn with_icuexport_for_tag(self, mut tag: &str) -> Result { - if tag == "release-71-1" { - tag = "icu4x/2022-08-17/71.x"; - } - Ok(Self { - icuexport_paths: Some(Arc::new(SerdeCache::new(AbstractFs::new_from_url( - format!( - "https://github.com/unicode-org/icu/releases/download/{tag}/icuexportdata_{}.zip", - tag.replace('/', "-") - ), - )))), - ..self - }) - } - - /// Adds segmenter LSTM data to this `SourceData`. The data will be downloaded from GitHub - /// using the given tag. (see [GitHub releases](https://github.com/unicode-org/lstm_word_segmentation/releases)). - /// - /// Also see: [`LATEST_TESTED_SEGMENTER_LSTM_TAG`](Self::LATEST_TESTED_SEGMENTER_LSTM_TAG) - /// - /// Requires `networking` Cargo feature. - #[cfg(feature = "networking")] - pub fn with_segmenter_lstm_for_tag(self, tag: &str) -> Result { - Ok(Self { - segmenter_lstm_paths: Arc::new(SerdeCache::new(AbstractFs::new_from_url(format!( - "https://github.com/unicode-org/lstm_word_segmentation/releases/download/{tag}/models.zip" - )))), - ..self - }) - } - - #[deprecated( - since = "1.1.0", - note = "Use `with_cldr_for_tag(SourceData::LATEST_TESTED_CLDR_TAG)`" - )] - #[cfg(feature = "networking")] - #[doc(hidden)] - pub fn with_cldr_latest( - self, - _use_default_here: crate::CldrLocaleSubset, - ) -> Result { - self.with_cldr_for_tag(Self::LATEST_TESTED_CLDR_TAG, Default::default()) - } - - #[deprecated( - since = "1.1.0", - note = "Use `with_icuexport_for_tag(SourceData::LATEST_TESTED_ICUEXPORT_TAG)`" - )] - #[cfg(feature = "networking")] - #[doc(hidden)] - pub fn with_icuexport_latest(self) -> Result { - self.with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG) - } - - /// Set this to use tries optimized for speed instead of data size - pub fn with_fast_tries(self) -> Self { - Self { - trie_type: TrieType::Fast, - 
..self - } - } - - /// Set the [`CollationHanDatabase`] version. - pub fn with_collation_han_database(self, collation_han_database: CollationHanDatabase) -> Self { - Self { - collation_han_database, - ..self - } - } - - #[deprecated(note = "use crate::Options", since = "1.3.0")] - #[doc(hidden)] - pub fn with_collations(self, collations: Vec) -> Self { - Self { collations, ..self } - } - - pub(crate) fn cldr(&self) -> Result<&CldrCache, DataError> { - self.cldr_paths - .as_deref() - .ok_or(crate::error::MISSING_CLDR_ERROR) - } - - pub(crate) fn icuexport(&self) -> Result<&SerdeCache, DataError> { - self.icuexport_paths - .as_deref() - .ok_or(crate::error::MISSING_ICUEXPORT_ERROR) - } - - pub(crate) fn icuexport_fallback(&self) -> &SerdeCache { - &self.icuexport_fallback_paths - } - - pub(crate) fn segmenter_lstm(&self) -> Result<&SerdeCache, DataError> { - Ok(&self.segmenter_lstm_paths) - } - - /// List the locales for the given CLDR coverage levels - pub fn locales( - &self, - levels: &[CoverageLevel], - ) -> Result, DataError> { - self.cldr()?.locales(levels) - } -} - -/// Specifies the trie type to use. -#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)] -#[doc(hidden)] -#[non_exhaustive] -pub enum TrieType { - /// Fast tries are optimized for speed - #[serde(rename = "fast")] - Fast, - /// Small tries are optimized for size - #[serde(rename = "small")] - #[default] - Small, -} - -impl std::fmt::Display for TrieType { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - TrieType::Fast => write!(f, "fast"), - TrieType::Small => write!(f, "small"), - } - } -} - -/// Specifies the collation Han database to use. -/// -/// Unihan is more precise but significantly increases data size. 
See -/// -#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)] -#[non_exhaustive] -pub enum CollationHanDatabase { - /// Implicit - #[serde(rename = "implicit")] - #[default] - Implicit, - /// Unihan - #[serde(rename = "unihan")] - Unihan, -} - -impl std::fmt::Display for CollationHanDatabase { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - CollationHanDatabase::Implicit => write!(f, "implicithan"), - CollationHanDatabase::Unihan => write!(f, "unihan"), - } - } -} - pub(crate) struct SerdeCache { root: AbstractFs, cache: FrozenMap>, @@ -379,6 +101,7 @@ pub(crate) struct ZipData { pub(crate) enum AbstractFs { Fs(PathBuf), Zip(RwLock>), + #[cfg(feature = "legacy_api")] Memory(BTreeMap<&'static str, &'static [u8]>), } @@ -389,7 +112,7 @@ impl Debug for AbstractFs { } impl AbstractFs { - fn new>(root: P) -> Result { + pub fn new>(root: P) -> Result { if std::fs::metadata(root.as_ref()) .map_err(|e| DataError::from(e).with_path_context(root.as_ref()))? 
.is_dir() @@ -406,79 +129,8 @@ impl AbstractFs { } } - fn new_icuexport_fallback() -> Self { - Self::Memory( - [ - ( - "segmenter/dictionary/cjdict.toml", - include_bytes!("../data/segmenter/dictionary/cjdict.toml").as_slice(), - ), - ( - "segmenter/dictionary/khmerdict.toml", - include_bytes!("../data/segmenter/dictionary/khmerdict.toml").as_slice(), - ), - ( - "segmenter/dictionary/laodict.toml", - include_bytes!("../data/segmenter/dictionary/laodict.toml").as_slice(), - ), - ( - "segmenter/dictionary/burmesedict.toml", - include_bytes!("../data/segmenter/dictionary/burmesedict.toml").as_slice(), - ), - ( - "segmenter/dictionary/thaidict.toml", - include_bytes!("../data/segmenter/dictionary/thaidict.toml").as_slice(), - ), - ] - .into_iter() - .collect(), - ) - } - - fn new_lstm_fallback() -> Self { - Self::Memory( - [ - ( - "Khmer_codepoints_exclusive_model4_heavy/weights.json", - include_bytes!( - "../data/lstm/Khmer_codepoints_exclusive_model4_heavy/weights.json" - ) - .as_slice(), - ), - ( - "Lao_codepoints_exclusive_model4_heavy/weights.json", - include_bytes!( - "../data/lstm/Lao_codepoints_exclusive_model4_heavy/weights.json" - ) - .as_slice(), - ), - ( - "Burmese_codepoints_exclusive_model4_heavy/weights.json", - include_bytes!( - "../data/lstm/Burmese_codepoints_exclusive_model4_heavy/weights.json" - ) - .as_slice(), - ), - ( - "Thai_codepoints_exclusive_model4_heavy/weights.json", - include_bytes!( - "../data/lstm/Thai_codepoints_exclusive_model4_heavy/weights.json" - ) - .as_slice(), - ), - ( - "Thai_graphclust_model4_heavy/weights.json", - include_bytes!("../data/lstm/Thai_graphclust_model4_heavy/weights.json") - .as_slice(), - ), - ] - .into_iter() - .collect(), - ) - } - #[cfg(feature = "networking")] - fn new_from_url(path: String) -> Self { + pub fn new_from_url(path: String) -> Self { Self::Zip(RwLock::new(Err(path))) } @@ -552,6 +204,7 @@ impl AbstractFs { .read_to_end(&mut buf)?; Ok(buf) } + #[cfg(feature = "legacy_api")] Self::Memory(map) => 
map.get(path).copied().map(Vec::from).ok_or_else(|| { DataError::custom("Not found in icu4x-datagen's data/").with_display_context(path) }), @@ -579,6 +232,7 @@ impl AbstractFs { .map(String::from) .collect::>() .into_iter(), + #[cfg(feature = "legacy_api")] Self::Memory(map) => map .keys() .copied() @@ -600,6 +254,7 @@ impl AbstractFs { .unwrap() // init called .file_list .contains(path), + #[cfg(feature = "legacy_api")] Self::Memory(map) => map.contains_key(path), }) } diff --git a/provider/datagen/src/transform/cldr/calendar/japanese.rs b/provider/datagen/src/transform/cldr/calendar/japanese.rs index 335a648a242..69f69657374 100644 --- a/provider/datagen/src/transform/cldr/calendar/japanese.rs +++ b/provider/datagen/src/transform/cldr/calendar/japanese.rs @@ -25,7 +25,6 @@ impl crate::DatagenProvider { // in the `en` locale. We load this data to construct era codes but // actual user code only needs to load the data for the locales it cares about. let era_name_map = &self - .source .cldr()? .dates("japanese") .read_and_parse::(&langid!("en"), "ca-japanese.json")? @@ -41,7 +40,6 @@ impl crate::DatagenProvider { .abbr; let era_dates_map = &self - .source .cldr()? .core() .read_and_parse::("supplemental/calendarData.json")? diff --git a/provider/datagen/src/transform/cldr/characters/mod.rs b/provider/datagen/src/transform/cldr/characters/mod.rs index a9dfbed9f61..204b893ae01 100644 --- a/provider/datagen/src/transform/cldr/characters/mod.rs +++ b/provider/datagen/src/transform/cldr/characters/mod.rs @@ -26,7 +26,6 @@ macro_rules! exemplar_chars_impls { let langid = req.locale.get_langid(); let data: &cldr_serde::exemplar_chars::Resource = self - .source .cldr()? .misc() .read_and_parse(&langid, "characters.json")?; @@ -50,7 +49,6 @@ macro_rules! exemplar_chars_impls { impl IterableDataProvider<$data_marker_name> for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { Ok(self - .source .cldr()? .misc() .list_langs()? 
@@ -467,7 +465,7 @@ mod tests { #[test] fn test_basic() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/cldr_serde/coverage_levels.rs b/provider/datagen/src/transform/cldr/cldr_serde/coverage_levels.rs index df47735d90e..caafef4a0b1 100644 --- a/provider/datagen/src/transform/cldr/cldr_serde/coverage_levels.rs +++ b/provider/datagen/src/transform/cldr/cldr_serde/coverage_levels.rs @@ -9,6 +9,5 @@ use std::collections::HashMap; #[derive(PartialEq, Debug, Deserialize)] pub struct Resource { #[serde(rename = "coverageLevels")] - pub coverage_levels: - HashMap, + pub coverage_levels: HashMap, } diff --git a/provider/datagen/src/transform/cldr/cldr_serde/mod.rs b/provider/datagen/src/transform/cldr/cldr_serde/mod.rs index 76a27ac0324..8a2bb1a6d52 100644 --- a/provider/datagen/src/transform/cldr/cldr_serde/mod.rs +++ b/provider/datagen/src/transform/cldr/cldr_serde/mod.rs @@ -10,10 +10,13 @@ pub mod aliases; pub mod ca; pub mod coverage_levels; +#[cfg(feature = "icu_singlenumberformatter")] pub mod currencies; pub mod currency_data; +#[cfg(feature = "icu_relativetime")] pub mod date_fields; pub mod directionality; +#[cfg(feature = "icu_displaynames")] pub mod displaynames; pub mod exemplar_chars; pub mod japanese; diff --git a/provider/datagen/src/transform/cldr/currency/mod.rs b/provider/datagen/src/transform/cldr/currency/mod.rs index bb458ea9f79..2bb3ed3ba4b 100644 --- a/provider/datagen/src/transform/cldr/currency/mod.rs +++ b/provider/datagen/src/transform/cldr/currency/mod.rs @@ -75,13 +75,11 @@ impl DataProvider for crate::DatagenProvider { let langid = req.locale.get_langid(); let currencies_resource: &cldr_serde::currencies::Resource = self - .source .cldr()? 
.numbers() .read_and_parse(&langid, "currencies.json")?; let numbers_resource: &cldr_serde::numbers::Resource = self - .source .cldr()? .numbers() .read_and_parse(&langid, "numbers.json")?; @@ -98,7 +96,6 @@ impl DataProvider for crate::DatagenProvider { impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { Ok(self - .source .cldr()? .numbers() .list_langs()? @@ -291,7 +288,7 @@ fn test_basic() { use icu_locid::locale; use icu_singlenumberformatter::provider::*; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let en: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/datetime/mod.rs b/provider/datagen/src/transform/cldr/datetime/mod.rs index 6ecd3c509d9..8df4176febb 100644 --- a/provider/datagen/src/transform/cldr/datetime/mod.rs +++ b/provider/datagen/src/transform/cldr/datetime/mod.rs @@ -70,7 +70,6 @@ macro_rules! impl_data_provider { .ok_or_else(|| DataErrorKind::MissingLocale.into_error())?; let resource: &cldr_serde::ca::Resource = self - .source .cldr()? .dates(cldr_cal) .read_and_parse(&langid, &format!("ca-{}.json", cldr_cal))?; @@ -89,7 +88,6 @@ macro_rules! impl_data_provider { // ethiopian. if calendar == value!("ethiopic") { let ethioaa: &cldr_serde::ca::Resource = self - .source .cldr()? .dates("ethiopic") .read_and_parse(&langid, "ca-ethiopic-amete-alem.json")?; @@ -130,16 +128,18 @@ macro_rules! impl_data_provider { // Filter out non-modern eras if calendar != value!("japanext") { let era_dates: &cldr_serde::japanese::Resource = self - .source .cldr()? 
.core() .read_and_parse("supplemental/calendarData.json")?; let mut set = HashSet::::new(); - for (era_index, date) in - era_dates.supplemental.calendar_data.japanese.eras.iter() - { + for (era_index, date) in era_dates.supplemental.calendar_data.japanese.eras.iter() { let start_date = - EraStartDate::from_str(if let Some(start_date) = date.start.as_ref() { start_date } else { continue }).map_err(|_| { + EraStartDate::from_str(if let Some(start_date) = date.start.as_ref() { + start_date + } else { + continue; + }) + .map_err(|_| { DataError::custom( "calendarData.json contains unparseable data for a japanese era", ) @@ -158,7 +158,6 @@ macro_rules! impl_data_provider { // Splice in gregorian data for pre-meiji let greg_resource: &cldr_serde::ca::Resource = self - .source .cldr()? .dates("gregorian") .read_and_parse(&langid, "ca-gregorian.json")?; @@ -238,41 +237,32 @@ macro_rules! impl_data_provider { let mut r = Vec::new(); if DateSkeletonPatternsV1Marker::KEY == $marker::KEY { for (cal_value, cldr_cal) in supported_cals() { - r.extend( - self.source - .cldr()? - .dates(cldr_cal) - .list_langs()? - .map(|lid| { - let mut locale: Locale = lid.into(); - locale - .extensions - .unicode - .keywords - .set(key!("ca"), cal_value.clone()); - DataLocale::from(locale) - }), - ); + r.extend(self.cldr()?.dates(cldr_cal).list_langs()?.map(|lid| { + let mut locale: Locale = lid.into(); + locale + .extensions + .unicode + .keywords + .set(key!("ca"), cal_value.clone()); + DataLocale::from(locale) + })); } } else { let cldr_cal = supported_cals() .get(&value!($calendar)) .ok_or_else(|| DataErrorKind::MissingLocale.into_error())?; - r.extend( - self.source - .cldr()? - .dates(cldr_cal) - .list_langs()? 
- .map(|lid| { - let locale: Locale = lid.into(); - DataLocale::from(locale) - }), - ); + r.extend(self.cldr()?.dates(cldr_cal).list_langs()?.map(|lid| { + let locale: Locale = lid.into(); + DataLocale::from(locale) + })); } // TODO(#3212): Remove if $marker::KEY == TimeLengthsV1Marker::KEY { - r.retain(|l| l.get_langid() != icu_locid::langid!("byn") && l.get_langid() != icu_locid::langid!("ssy")); + r.retain(|l| { + l.get_langid() != icu_locid::langid!("byn") + && l.get_langid() != icu_locid::langid!("ssy") + }); } Ok(r) @@ -416,7 +406,7 @@ mod test { #[test] fn test_basic_patterns() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let locale: Locale = locale!("cs"); let cs_dates: DataPayload = provider @@ -433,7 +423,7 @@ mod test { #[test] fn test_with_numbering_system() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let locale: Locale = locale!("haw"); let cs_dates: DataPayload = provider @@ -456,7 +446,7 @@ mod test { use icu_plurals::PluralCategory; use std::convert::TryFrom; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let locale: Locale = "fil-u-ca-gregory".parse().unwrap(); let skeletons: DataPayload = provider @@ -500,7 +490,7 @@ mod test { fn test_basic_symbols() { use icu_calendar::types::MonthCode; use tinystr::tinystr; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let locale: Locale = locale!("cs"); let cs_dates: DataPayload = provider @@ -531,7 +521,7 @@ mod test { #[test] fn unalias_contexts() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let locale: Locale = locale!("cs"); let cs_dates: DataPayload = provider diff --git 
a/provider/datagen/src/transform/cldr/datetime/week_data.rs b/provider/datagen/src/transform/cldr/datetime/week_data.rs index 33c1b11217a..ae4f1b39358 100644 --- a/provider/datagen/src/transform/cldr/datetime/week_data.rs +++ b/provider/datagen/src/transform/cldr/datetime/week_data.rs @@ -15,7 +15,6 @@ use std::collections::HashSet; impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { let week_data: &cldr_serde::week_data::Resource = self - .source .cldr()? .core() .read_and_parse("supplemental/weekData.json")?; @@ -47,7 +46,6 @@ impl DataProvider for crate::DatagenProvider { .unwrap_or_else(|| DEFAULT_TERRITORY.clone()); let week_data: &cldr_serde::week_data::Resource = self - .source .cldr()? .core() .read_and_parse("supplemental/weekData.json")?; @@ -82,7 +80,7 @@ fn basic_cldr_week_data() { use icu_calendar::types::IsoWeekday; use icu_locid::langid; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let default_week_data: DataPayload = provider .load(Default::default()) diff --git a/provider/datagen/src/transform/cldr/decimal/compact.rs b/provider/datagen/src/transform/cldr/decimal/compact.rs index 369c9c8f7d6..36fcd33ad16 100644 --- a/provider/datagen/src/transform/cldr/decimal/compact.rs +++ b/provider/datagen/src/transform/cldr/decimal/compact.rs @@ -18,7 +18,6 @@ impl DataProvider for crate::DatagenProvi let langid = req.locale.get_langid(); let resource: &cldr_serde::numbers::Resource = self - .source .cldr()? .numbers() .read_and_parse(&langid, "numbers.json")?; @@ -67,7 +66,6 @@ impl DataProvider for crate::DatagenProvid let langid = req.locale.get_langid(); let resource: &cldr_serde::numbers::Resource = self - .source .cldr()? 
.numbers() .read_and_parse(&langid, "numbers.json")?; @@ -131,7 +129,7 @@ mod tests { #[test] fn test_compact_long() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let fr_compact_long: DataPayload = provider .load(DataRequest { @@ -197,7 +195,7 @@ mod tests { #[test] fn test_compact_short() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let ja_compact_short: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/decimal/mod.rs b/provider/datagen/src/transform/cldr/decimal/mod.rs index 51f604510cd..3aa54a6e692 100644 --- a/provider/datagen/src/transform/cldr/decimal/mod.rs +++ b/provider/datagen/src/transform/cldr/decimal/mod.rs @@ -23,7 +23,6 @@ impl crate::DatagenProvider { nsname: TinyAsciiStr<8>, ) -> Result<[char; 10], DataError> { let resource: &cldr_serde::numbering_systems::Resource = self - .source .cldr()? .core() .read_and_parse("supplemental/numberingSystems.json")?; @@ -58,7 +57,6 @@ impl crate::DatagenProvider { langid: &LanguageIdentifier, ) -> Result>, DataError> { let resource: &cldr_serde::numbers::Resource = self - .source .cldr()? .numbers() .read_and_parse(langid, "numbers.json")?; @@ -76,7 +74,6 @@ impl crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { Ok(self - .source .cldr()? .numbers() .list_langs()? diff --git a/provider/datagen/src/transform/cldr/decimal/symbols.rs b/provider/datagen/src/transform/cldr/decimal/symbols.rs index 9ed0b891c37..f2eb6d5a542 100644 --- a/provider/datagen/src/transform/cldr/decimal/symbols.rs +++ b/provider/datagen/src/transform/cldr/decimal/symbols.rs @@ -17,7 +17,6 @@ impl DataProvider for crate::DatagenProvider { let langid = req.locale.get_langid(); let resource: &cldr_serde::numbers::Resource = self - .source .cldr()? 
.numbers() .read_and_parse(&langid, "numbers.json")?; @@ -96,7 +95,7 @@ impl TryFrom> for DecimalSymbolsV1<'static> { fn test_basic() { use icu_locid::locale; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let ar_decimal: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/displaynames/language.rs b/provider/datagen/src/transform/cldr/displaynames/language.rs index 9058902dd01..d6e26d0c6e1 100644 --- a/provider/datagen/src/transform/cldr/displaynames/language.rs +++ b/provider/datagen/src/transform/cldr/displaynames/language.rs @@ -20,7 +20,6 @@ impl DataProvider for crate::DatagenProvider { let langid = req.locale.get_langid(); let data: &cldr_serde::displaynames::language::Resource = self - .source .cldr()? .displaynames() .read_and_parse(&langid, "languages.json")?; @@ -44,7 +43,6 @@ impl DataProvider for crate::DatagenProvider { let langid = req.locale.get_langid(); let data: &cldr_serde::displaynames::language::Resource = self - .source .cldr()? .displaynames() .read_and_parse(&langid, "languages.json")?; @@ -63,14 +61,12 @@ impl DataProvider for crate::DatagenProvider { impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { Ok(self - .source .cldr()? .displaynames() .list_langs()? .filter(|langid| { // The directory might exist without languages.json - self.source - .cldr() + self.cldr() .unwrap() .displaynames() .file_exists(langid, "languages.json") @@ -84,14 +80,12 @@ impl IterableDataProvider for crate::DatagenProvid impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { Ok(self - .source .cldr()? .displaynames() .list_langs()? 
.filter(|langid| { // The directory might exist without languages.json - self.source - .cldr() + self.cldr() .unwrap() .displaynames() .file_exists(langid, "languages.json") @@ -218,7 +212,7 @@ mod tests { #[test] fn test_basic_lang_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { @@ -240,7 +234,7 @@ mod tests { #[test] fn test_basic_lang_short_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { @@ -262,7 +256,7 @@ mod tests { #[test] fn test_basic_lang_long_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { @@ -284,7 +278,7 @@ mod tests { #[test] fn test_basic_lang_menu_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { @@ -306,7 +300,7 @@ mod tests { #[test] fn test_basic_locale_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/displaynames/region.rs b/provider/datagen/src/transform/cldr/displaynames/region.rs index 993195de186..9453f7e7ca4 100644 --- a/provider/datagen/src/transform/cldr/displaynames/region.rs +++ b/provider/datagen/src/transform/cldr/displaynames/region.rs @@ -20,7 +20,6 @@ impl DataProvider for crate::DatagenProvider { let langid = req.locale.get_langid(); let data: &cldr_serde::displaynames::region::Resource = self - .source .cldr()? 
.displaynames() .read_and_parse(&langid, "territories.json")?; @@ -39,14 +38,12 @@ impl DataProvider for crate::DatagenProvider { impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { Ok(self - .source .cldr()? .displaynames() .list_langs()? .filter(|langid| { // The directory might exist without territories.json - self.source - .cldr() + self.cldr() .unwrap() .displaynames() .file_exists(langid, "territories.json") @@ -97,7 +94,7 @@ mod tests { #[test] fn test_basic() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { @@ -119,7 +116,7 @@ mod tests { #[test] fn test_basic_short_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/displaynames/script.rs b/provider/datagen/src/transform/cldr/displaynames/script.rs index e222244035b..c5f8aad7142 100644 --- a/provider/datagen/src/transform/cldr/displaynames/script.rs +++ b/provider/datagen/src/transform/cldr/displaynames/script.rs @@ -20,7 +20,6 @@ impl DataProvider for crate::DatagenProvider { let langid = req.locale.get_langid(); let data: &cldr_serde::displaynames::script::Resource = self - .source .cldr()? .displaynames() .read_and_parse(&langid, "scripts.json")?; @@ -39,14 +38,12 @@ impl DataProvider for crate::DatagenProvider { impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { Ok(self - .source .cldr()? .displaynames() .list_langs()? 
.filter(|langid| { // The directory might exist without scripts.json - self.source - .cldr() + self.cldr() .unwrap() .displaynames() .file_exists(langid, "scripts.json") @@ -99,7 +96,7 @@ mod tests { #[test] fn test_basic_script_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { @@ -121,7 +118,7 @@ mod tests { #[test] fn test_basic_script_short_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/displaynames/variant.rs b/provider/datagen/src/transform/cldr/displaynames/variant.rs index fc493e13cb8..14d6cec1a62 100644 --- a/provider/datagen/src/transform/cldr/displaynames/variant.rs +++ b/provider/datagen/src/transform/cldr/displaynames/variant.rs @@ -20,7 +20,6 @@ impl DataProvider for crate::DatagenProvider { let langid = req.locale.get_langid(); let data: &cldr_serde::displaynames::variant::Resource = self - .source .cldr()? .displaynames() .read_and_parse(&langid, "variants.json")?; @@ -39,14 +38,12 @@ impl DataProvider for crate::DatagenProvider { impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { Ok(self - .source .cldr()? .displaynames() .list_langs()? 
.filter(|langid| { // The directory might exist without variants.json - self.source - .cldr() + self.cldr() .unwrap() .displaynames() .file_exists(langid, "variants.json") @@ -89,7 +86,7 @@ mod tests { #[test] fn test_basic_variant_display_names() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/cldr/fallback/mod.rs b/provider/datagen/src/transform/cldr/fallback/mod.rs index 2f0ba1bae99..e31450ca228 100644 --- a/provider/datagen/src/transform/cldr/fallback/mod.rs +++ b/provider/datagen/src/transform/cldr/fallback/mod.rs @@ -24,7 +24,7 @@ impl DataProvider for crate::DatagenProvide req: DataRequest, ) -> Result, DataError> { self.check_req::(req)?; - let resources = LikelySubtagsResources::try_from_source_data(&self.source)?; + let resources = LikelySubtagsResources::try_from_cldr_cache(self.cldr()?)?; let metadata = DataResponseMetadata::default(); Ok(DataResponse { @@ -41,7 +41,6 @@ impl DataProvider for crate::DatagenProvider { ) -> Result, DataError> { self.check_req::(req)?; let parents_data: &cldr_serde::parent_locales::Resource = self - .source .cldr()? .core() .read_and_parse("supplemental/parentLocales.json")?; @@ -205,7 +204,7 @@ fn test_basic() { subtags::{language, region, script}, }; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let likely_subtags: DataPayload = provider .load(Default::default()) .unwrap() diff --git a/provider/datagen/src/transform/cldr/list/mod.rs b/provider/datagen/src/transform/cldr/list/mod.rs index 484cd8f957e..040ca1fa566 100644 --- a/provider/datagen/src/transform/cldr/list/mod.rs +++ b/provider/datagen/src/transform/cldr/list/mod.rs @@ -17,7 +17,6 @@ fn load>>( let langid = req.locale.get_langid(); let resource: &cldr_serde::list_patterns::Resource = selff - .source .cldr()? 
.misc() .read_and_parse(&langid, "listPatterns.json")?; @@ -129,7 +128,6 @@ macro_rules! implement { impl IterableDataProvider<$marker> for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { Ok(self - .source .cldr()? .misc() .list_langs()? diff --git a/provider/datagen/src/transform/cldr/locale_canonicalizer/aliases.rs b/provider/datagen/src/transform/cldr/locale_canonicalizer/aliases.rs index 7670faf953a..e291e8102a5 100644 --- a/provider/datagen/src/transform/cldr/locale_canonicalizer/aliases.rs +++ b/provider/datagen/src/transform/cldr/locale_canonicalizer/aliases.rs @@ -18,7 +18,6 @@ impl DataProvider for crate::DatagenProvider { fn load(&self, req: DataRequest) -> Result, DataError> { self.check_req::(req)?; let data: &cldr_serde::aliases::Resource = self - .source .cldr()? .core() .read_and_parse("supplemental/aliases.json")?; @@ -271,7 +270,7 @@ fn test_appendix_c_cmp() { fn test_basic() { use icu_locid::subtags::{language, region, script}; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(Default::default()) .unwrap() diff --git a/provider/datagen/src/transform/cldr/locale_canonicalizer/directionality.rs b/provider/datagen/src/transform/cldr/locale_canonicalizer/directionality.rs index 5dbaef1e814..70967e16df0 100644 --- a/provider/datagen/src/transform/cldr/locale_canonicalizer/directionality.rs +++ b/provider/datagen/src/transform/cldr/locale_canonicalizer/directionality.rs @@ -11,11 +11,8 @@ use icu_provider::prelude::*; impl DataProvider for crate::DatagenProvider { fn load(&self, req: DataRequest) -> Result, DataError> { self.check_req::(req)?; - let data: &cldr_serde::directionality::Resource = self - .source - .cldr()? 
- .core() - .read_and_parse("scriptMetadata.json")?; + let data: &cldr_serde::directionality::Resource = + self.cldr()?.core().read_and_parse("scriptMetadata.json")?; Ok(DataResponse { metadata: Default::default(), payload: Some(DataPayload::from_owned(ScriptDirectionV1::from(data))), @@ -54,7 +51,7 @@ impl From<&cldr_serde::directionality::Resource> for ScriptDirectionV1<'_> { fn test_basic() { use icu_locid::subtags::script; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(Default::default()) .unwrap() diff --git a/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs b/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs index 4b6b71adc74..374e926f07e 100644 --- a/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs +++ b/provider/datagen/src/transform/cldr/locale_canonicalizer/likely_subtags.rs @@ -2,8 +2,8 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
-use crate::transform::cldr::{cldr_serde, source::CoverageLevel}; -use crate::SourceData; +use crate::transform::cldr::cldr_serde; +use crate::CoverageLevel; use icu_locid::subtags::Language; use icu_locid::LanguageIdentifier; use icu_locid_transform::provider::*; @@ -14,7 +14,7 @@ use std::collections::{BTreeMap, HashSet}; impl DataProvider for crate::DatagenProvider { fn load(&self, req: DataRequest) -> Result, DataError> { self.check_req::(req)?; - let resources = LikelySubtagsResources::try_from_source_data(&self.source)?; + let resources = LikelySubtagsResources::try_from_cldr_cache(self.cldr()?)?; Ok(DataResponse { metadata: Default::default(), @@ -35,7 +35,7 @@ impl DataProvider for crate::DatagenProvider { req: DataRequest, ) -> Result, DataError> { self.check_req::(req)?; - let resources = LikelySubtagsResources::try_from_source_data(&self.source)?; + let resources = LikelySubtagsResources::try_from_cldr_cache(self.cldr()?)?; Ok(DataResponse { metadata: Default::default(), @@ -98,29 +98,19 @@ pub(crate) struct LikelySubtagsResources<'a> { } impl<'a> LikelySubtagsResources<'a> { - pub fn try_from_source_data( - source_data: &'a SourceData, + pub fn try_from_cldr_cache( + cache: &'a super::super::source::CldrCache, ) -> Result { - let likely_subtags: &cldr_serde::likely_subtags::Resource = source_data - .cldr()? + let likely_subtags: &cldr_serde::likely_subtags::Resource = cache .core() .read_and_parse("supplemental/likelySubtags.json")?; - let coverage_levels: &cldr_serde::coverage_levels::Resource = source_data - .cldr()? 
- .core() - .read_and_parse("coverageLevels.json")?; - Ok(Self::from_resources(likely_subtags, coverage_levels)) - } - - pub fn from_resources( - likely_subtags: &'a cldr_serde::likely_subtags::Resource, - coverage_levels: &'a cldr_serde::coverage_levels::Resource, - ) -> Self { + let coverage_levels: &cldr_serde::coverage_levels::Resource = + cache.core().read_and_parse("coverageLevels.json")?; let basic_plus_languages = Self::get_basic_plus_languages(coverage_levels); - Self { + Ok(Self { likely_subtags, basic_plus_languages, - } + }) } fn get_basic_plus_languages( @@ -278,7 +268,7 @@ pub(crate) fn transform<'x>( fn test_basic() { use icu_locid::subtags::{language, region, script}; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let result_common: DataPayload = provider .load(Default::default()) .unwrap() diff --git a/provider/datagen/src/transform/cldr/plurals/mod.rs b/provider/datagen/src/transform/cldr/plurals/mod.rs index d03cdd61643..8b884c803f8 100644 --- a/provider/datagen/src/transform/cldr/plurals/mod.rs +++ b/provider/datagen/src/transform/cldr/plurals/mod.rs @@ -11,16 +11,14 @@ use icu_provider::prelude::*; impl crate::DatagenProvider { fn get_rules_for(&self, key: DataKey) -> Result<&cldr_serde::plurals::Rules, DataError> { if key == CardinalV1Marker::KEY { - self.source - .cldr()? + self.cldr()? .core() .read_and_parse::("supplemental/plurals.json")? .supplemental .plurals_type_cardinal .as_ref() } else if key == OrdinalV1Marker::KEY { - self.source - .cldr()? + self.cldr()? .core() .read_and_parse::("supplemental/ordinals.json")? 
.supplemental @@ -89,7 +87,7 @@ impl From<&cldr_serde::plurals::LocalePluralRules> for PluralRulesV1<'static> { fn test_basic() { use icu_locid::langid; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); // Spot-check locale 'cs' since it has some interesting entries let cs_rules: DataPayload = provider diff --git a/provider/datagen/src/transform/cldr/relativetime/mod.rs b/provider/datagen/src/transform/cldr/relativetime/mod.rs index c0d33452d28..15ae1333f05 100644 --- a/provider/datagen/src/transform/cldr/relativetime/mod.rs +++ b/provider/datagen/src/transform/cldr/relativetime/mod.rs @@ -70,22 +70,14 @@ macro_rules! make_data_provider { ($($marker: ident),+ $(,)?) => { $( impl DataProvider<$marker> for crate::DatagenProvider { - fn load( - &self, - req: DataRequest, - ) -> Result, DataError> { + fn load(&self, req: DataRequest) -> Result, DataError> { self.check_req::<$marker>(req)?; let langid = req.locale.get_langid(); let resource: &cldr_serde::date_fields::Resource = self - .source .cldr()? .dates("gregorian") .read_and_parse(&langid, "dateFields.json")?; - let fields = &resource - .main - .value - .dates - .fields; + let fields = &resource.main.value.dates.fields; let field = datakey_filters() .get(&$marker::KEY) @@ -105,7 +97,6 @@ macro_rules! make_data_provider { impl IterableDataProvider<$marker> for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { Ok(self - .source .cldr()? .dates("gregorian") .list_langs()? @@ -113,7 +104,6 @@ macro_rules! 
make_data_provider { .collect()) } } - )+ }; } @@ -197,7 +187,7 @@ mod tests { #[test] fn test_basic() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { locale: &locale!("en").into(), @@ -217,7 +207,7 @@ mod tests { #[test] fn test_singular_sub_pattern() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let data: DataPayload = provider .load(DataRequest { locale: &locale!("ar").into(), diff --git a/provider/datagen/src/transform/cldr/source.rs b/provider/datagen/src/transform/cldr/source.rs index 4c90a0e1def..d984008a494 100644 --- a/provider/datagen/src/transform/cldr/source.rs +++ b/provider/datagen/src/transform/cldr/source.rs @@ -4,8 +4,8 @@ #![allow(dead_code)] // features -use super::locale_canonicalizer::likely_subtags::LikelySubtagsResources; use crate::source::SerdeCache; +use crate::CoverageLevel; use icu_locid::LanguageIdentifier; use icu_locid_transform::provider::LikelySubtagsForLanguageV1Marker; use icu_locid_transform::provider::LikelySubtagsForScriptRegionV1Marker; @@ -15,30 +15,10 @@ use icu_provider::DataError; use icu_provider_adapters::any_payload::AnyPayloadProvider; use icu_provider_adapters::fork::ForkByKeyProvider; use once_cell::sync::OnceCell; +use std::collections::HashSet; use std::fmt::Debug; use std::str::FromStr; -/// A language's CLDR coverage level. -#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)] -#[non_exhaustive] -pub enum CoverageLevel { - /// Locales listed as modern coverage targets by the CLDR subcomittee. - /// - /// This is the highest level of coverage. - #[serde(rename = "modern")] - Modern, - /// Locales listed as moderate coverage targets by the CLDR subcomittee. - /// - /// This is a medium level of coverage. 
- #[serde(rename = "moderate")] - Moderate, - /// Locales listed as basic coverage targets by the CLDR subcomittee. - /// - /// This is the lowest level of coverage. - #[serde(rename = "basic")] - Basic, -} - #[derive(Debug)] pub(crate) struct CldrCache { serde_cache: SerdeCache, @@ -88,8 +68,9 @@ impl CldrCache { pub fn locales( &self, - levels: &[CoverageLevel], + levels: impl IntoIterator, ) -> Result, DataError> { + let levels = levels.into_iter().collect::>(); Ok(self .serde_cache .read_and_parse_json::( @@ -99,6 +80,8 @@ impl CldrCache { .iter() .filter_map(|(locale, c)| levels.contains(c).then_some(locale)) .cloned() + // `und` needs to be part of every set + .chain([Default::default()]) .collect()) } @@ -115,15 +98,9 @@ impl CldrCache { } fn locale_expander(&self) -> Result<&LocaleExpander, DataError> { + use super::locale_canonicalizer::likely_subtags::*; self.locale_expander.get_or_try_init(|| { - let resources = LikelySubtagsResources::from_resources( - self.serde_cache - .read_and_parse_json("cldr-core/supplemental/likelySubtags.json")?, - self.serde_cache - .read_and_parse_json("cldr-core/coverageLevels.json")?, - ); - let data = - super::locale_canonicalizer::likely_subtags::transform(resources.get_common()); + let data = transform(LikelySubtagsResources::try_from_cldr_cache(self)?.get_common()); let provider = ForkByKeyProvider::new( AnyPayloadProvider::from_owned::( data.clone().into(), diff --git a/provider/datagen/src/transform/cldr/time_zones/mod.rs b/provider/datagen/src/transform/cldr/time_zones/mod.rs index 9692fa91f32..7bd390dcce7 100644 --- a/provider/datagen/src/transform/cldr/time_zones/mod.rs +++ b/provider/datagen/src/transform/cldr/time_zones/mod.rs @@ -33,34 +33,26 @@ macro_rules! impl_data_provider { let langid = req.locale.get_langid(); let resource: &cldr_serde::time_zones::time_zone_names::Resource = self - .source .cldr()? 
.dates("gregorian") .read_and_parse(&langid, "timeZoneNames.json")?; - let time_zone_names_resource = &resource - .main - .value - .dates - .time_zone_names; + let time_zone_names_resource = &resource.main.value.dates.time_zone_names; - let resource: &cldr_serde::time_zones::bcp47_tzid::Resource = self - .source - .cldr()? - .bcp47() - .read_and_parse("timezone.json")?; + let resource: &cldr_serde::time_zones::bcp47_tzid::Resource = + self.cldr()?.bcp47().read_and_parse("timezone.json")?; let bcp47_tzids_resource = &resource.keyword.u.time_zones.values; let resource: &cldr_serde::time_zones::meta_zones::Resource = self - .source .cldr()? .core() .read_and_parse("supplemental/metaZones.json")?; let meta_zone_ids_resource = &resource.supplemental.meta_zones.meta_zone_ids.0; - let meta_zone_periods_resource = &resource.supplemental.meta_zones.meta_zone_info.time_zone.0; + let meta_zone_periods_resource = + &resource.supplemental.meta_zones.meta_zone_info.time_zone.0; Ok(DataResponse { metadata: Default::default(), @@ -82,19 +74,16 @@ macro_rules! impl_data_provider { // MetazonePeriodV1 does not require localized time zone data Ok(vec![Default::default()]) } else { - - Ok(self - .source - .cldr()? - .dates("gregorian") - .list_langs()? - .map(DataLocale::from) - .collect()) + Ok(self + .cldr()? + .dates("gregorian") + .list_langs()? 
+ .map(DataLocale::from) + .collect()) } } } )+ - }; } @@ -119,7 +108,7 @@ mod tests { fn basic_cldr_time_zones() { use icu_locid::langid; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let time_zone_formats: DataPayload = provider .load(DataRequest { diff --git a/provider/datagen/src/transform/icuexport/collator/mod.rs b/provider/datagen/src/transform/icuexport/collator/mod.rs index 1fb44a4ed2a..b99f8dcb9c4 100644 --- a/provider/datagen/src/transform/icuexport/collator/mod.rs +++ b/provider/datagen/src/transform/icuexport/collator/mod.rs @@ -14,7 +14,6 @@ use icu_locid::LanguageIdentifier; use icu_locid::Locale; use icu_provider::datagen::IterableDataProvider; use icu_provider::prelude::*; -use std::collections::HashSet; use std::convert::TryFrom; use std::str::FromStr; use writeable::Writeable; @@ -22,20 +21,18 @@ use zerovec::ZeroVec; mod collator_serde; -// Collations removed by default from ICU4X data, plus all starting with "search". 
-static DEFAULT_REMOVED_COLLATIONS: &[&str] = &["big5han", "gb2312"]; - -/// Backward compatibility for https://unicode-org.atlassian.net/browse/CLDR-15603 -fn has_legacy_swedish_variants(source: &crate::SourceData) -> bool { - source - .icuexport() - .and_then(|i| { - i.file_exists(&format!( - "collation/{}/sv_reformed_meta.toml", - source.collation_han_database, - )) - }) - .unwrap_or(false) +impl crate::DatagenProvider { + /// Backward compatibility for https://unicode-org.atlassian.net/browse/CLDR-15603 + fn has_legacy_swedish_variants(&self) -> bool { + self.icuexport() + .and_then(|i| { + i.file_exists(&format!( + "collation/{}/sv_reformed_meta.toml", + self.collation_han_database(), + )) + }) + .unwrap_or(false) + } } fn locale_to_file_name(locale: &DataLocale, has_legacy_swedish_variants: bool) -> String { @@ -108,31 +105,6 @@ fn file_name_to_locale(file_name: &str, has_legacy_swedish_variants: bool) -> Op Some(locale) } -pub(crate) fn filter_data_locales( - locales: HashSet, - collations: &HashSet, -) -> HashSet { - locales - .into_iter() - .filter(|locale| { - locale - .get_unicode_ext(&key!("co")) - .and_then(|co| co.as_single_subtag().copied()) - .map(|collation| { - if collations.contains(collation.as_str()) { - true - } else if collation.starts_with("search") { - // Note: literal "search" and "searchjl" are handled above - collations.contains("search*") - } else { - !DEFAULT_REMOVED_COLLATIONS.contains(&collation.as_str()) - } - }) - .unwrap_or(true) - }) - .collect() -} - macro_rules! collation_provider { ($(($marker:ident, $serde_struct:ident, $suffix:literal, $conversion:expr)),+, $toml_data:ident) => { $( @@ -140,12 +112,11 @@ macro_rules! collation_provider { fn load(&self, req: DataRequest) -> Result, DataError> { self.check_req::<$marker>(req)?; let $toml_data: &collator_serde::$serde_struct = self - .source .icuexport()? 
.read_and_parse_toml(&format!( "collation/{}/{}{}.toml", - self.source.collation_han_database, - locale_to_file_name(&req.locale, has_legacy_swedish_variants(&self.source)), + self.collation_han_database(), + locale_to_file_name(&req.locale, self.has_legacy_swedish_variants()), $suffix )) .map_err(|e| match e.kind { @@ -172,11 +143,11 @@ macro_rules! collation_provider { return Ok(vec![Default::default()]) } Ok(self - .source + .icuexport()? .list(&format!( "collation/{}", - self.source.collation_han_database + self.collation_han_database() ))? .filter_map(|mut file_name| { file_name.truncate(file_name.len() - ".toml".len()); @@ -185,7 +156,7 @@ macro_rules! collation_provider { file_name }) }) - .filter_map(|s| file_name_to_locale(&s, has_legacy_swedish_variants(&self.source))) + .filter_map(|s| file_name_to_locale(&s, self.has_legacy_swedish_variants())) .map(DataLocale::from) .collect()) } @@ -252,113 +223,3 @@ collation_provider!( ), toml_data ); - -#[test] -fn test_collation_filtering() { - use crate::options; - use icu_locid::langid; - use std::collections::BTreeSet; - - #[derive(Debug)] - struct TestCase<'a> { - include_collations: &'a [&'a str], - language: LanguageIdentifier, - expected: &'a [&'a str], - } - let cases = [ - TestCase { - include_collations: &[], - language: langid!("zh"), - expected: &["zh", "zh-u-co-stroke", "zh-u-co-unihan", "zh-u-co-zhuyin"], - }, - TestCase { - include_collations: &["gb2312"], - language: langid!("zh"), - expected: &[ - "zh", - "zh-u-co-gb2312", - "zh-u-co-stroke", - "zh-u-co-unihan", - "zh-u-co-zhuyin", - ], - }, - TestCase { - include_collations: &["big5han"], - language: langid!("zh"), - expected: &[ - "zh", - "zh-u-co-big5han", - "zh-u-co-stroke", - "zh-u-co-unihan", - "zh-u-co-zhuyin", - ], - }, - TestCase { - include_collations: &["gb2312", "search*"], - language: langid!("zh"), - expected: &[ - "zh", - "zh-u-co-gb2312", - "zh-u-co-stroke", - "zh-u-co-unihan", - "zh-u-co-zhuyin", - ], - }, - TestCase { - 
include_collations: &[], - language: langid!("ko"), - expected: &["ko", "ko-u-co-unihan"], - }, - TestCase { - include_collations: &["search"], - language: langid!("ko"), - expected: &["ko", "ko-u-co-search", "ko-u-co-unihan"], - }, - TestCase { - include_collations: &["searchjl"], - language: langid!("ko"), - expected: &["ko", "ko-u-co-searchjl", "ko-u-co-unihan"], - }, - TestCase { - include_collations: &["search", "searchjl"], - language: langid!("ko"), - expected: &["ko", "ko-u-co-search", "ko-u-co-searchjl", "ko-u-co-unihan"], - }, - TestCase { - include_collations: &["search*", "big5han"], - language: langid!("ko"), - expected: &["ko", "ko-u-co-search", "ko-u-co-searchjl", "ko-u-co-unihan"], - }, - ]; - for cas in cases { - let provider = crate::DatagenProvider::for_test(); - let mut options = options::Options::default(); - options.collations = cas - .include_collations - .iter() - .copied() - .map(String::from) - .collect(); - options.locales = - crate::options::LocaleInclude::Explicit([cas.language.clone()].into_iter().collect()); - options.fallback = crate::options::FallbackMode::Preresolved; - - let resolved_locales = provider - .select_locales_for_key( - CollationDataV1Marker::KEY, - &options, - &once_cell::sync::Lazy::new(|| unreachable!()), - ) - .unwrap() - .into_iter() - .map(|l| l.to_string()) - .collect::>(); - let expected_locales = cas - .expected - .iter() - .copied() - .map(String::from) - .collect::>(); - assert_eq!(resolved_locales, expected_locales, "{cas:?}"); - } -} diff --git a/provider/datagen/src/transform/icuexport/normalizer/mod.rs b/provider/datagen/src/transform/icuexport/normalizer/mod.rs index dd964e64d3a..4d21087a859 100644 --- a/provider/datagen/src/transform/icuexport/normalizer/mod.rs +++ b/provider/datagen/src/transform/icuexport/normalizer/mod.rs @@ -23,9 +23,10 @@ macro_rules! 
normalization_provider { fn load(&self, req: DataRequest) -> Result, DataError> { self.check_req::<$marker>(req)?; let $toml_data: &normalizer_serde::$serde_struct = - self.source.icuexport()?.read_and_parse_toml(&format!( + self.icuexport()?.read_and_parse_toml(&format!( "norm/{}/{}.toml", - self.source.trie_type, $file_name + self.trie_type(), + $file_name ))?; $conversion diff --git a/provider/datagen/src/transform/icuexport/ucase/mod.rs b/provider/datagen/src/transform/icuexport/ucase/mod.rs index d3018d93d92..e7a90918fd9 100644 --- a/provider/datagen/src/transform/icuexport/ucase/mod.rs +++ b/provider/datagen/src/transform/icuexport/ucase/mod.rs @@ -17,11 +17,10 @@ impl DataProvider for crate::DatagenProvider { fn load(&self, req: DataRequest) -> Result, DataError> { self.check_req::(req)?; let toml = &self - .source .icuexport()? .read_and_parse_toml::(&format!( "ucase/{}/ucase.toml", - self.source.trie_type + self.trie_type() ))? .ucase; @@ -57,11 +56,10 @@ impl DataProvider for crate::DatagenProvider { fn load(&self, req: DataRequest) -> Result, DataError> { self.check_req::(req)?; let toml = &self - .source .icuexport()? .read_and_parse_toml::(&format!( "ucase/{}/ucase.toml", - self.source.trie_type + self.trie_type() ))? .ucase; diff --git a/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs b/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs index e7f14b39681..c63352ab6e8 100644 --- a/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs +++ b/provider/datagen/src/transform/icuexport/uprops/bidi_data.rs @@ -2,27 +2,26 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
-#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))] -use crate::SourceData; - use icu_properties::provider::bidi_data::BidiAuxiliaryPropertiesV1Marker; use icu_provider::datagen::*; use icu_provider::prelude::*; #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))] -fn get_code_point_prop_map<'a>( - source: &'a SourceData, - key: &str, -) -> Result<&'a super::uprops_serde::code_point_prop::CodePointPropertyMap, DataError> { - source - .icuexport()? - .read_and_parse_toml::(&format!( - "uprops/{}/{}.toml", - source.trie_type, key - ))? - .enum_property - .get(0) - .ok_or_else(|| DataErrorKind::MissingDataKey.into_error()) +impl crate::DatagenProvider { + fn get_code_point_prop_map<'a>( + &'a self, + key: &str, + ) -> Result<&'a super::uprops_serde::code_point_prop::CodePointPropertyMap, DataError> { + self.icuexport()? + .read_and_parse_toml::(&format!( + "uprops/{}/{}.toml", + self.trie_type(), + key + ))? + .enum_property + .get(0) + .ok_or_else(|| DataErrorKind::MissingDataKey.into_error()) + } } // implement data provider 2 different ways, based on whether or not @@ -33,7 +32,6 @@ impl DataProvider for crate::DatagenProvider { &self, req: DataRequest, ) -> Result, DataError> { - use crate::transform::icuexport::uprops::{bin_cp_set, enum_codepointtrie}; use icu_codepointtrie_builder::{CodePointTrieBuilder, CodePointTrieBuilderData}; use icu_collections::codepointinvlist::CodePointInversionListBuilder; use icu_collections::codepointtrie::CodePointTrie; @@ -45,7 +43,7 @@ impl DataProvider for crate::DatagenProvider { self.check_req::(req)?; // Bidi_M / Bidi_Mirrored - let bidi_m_data = bin_cp_set::get_binary_prop_for_code_point_set(&self.source, "Bidi_M")?; + let bidi_m_data = self.get_binary_prop_for_code_point_set("Bidi_M")?; let mut bidi_m_builder = CodePointInversionListBuilder::new(); for (start, end) in &bidi_m_data.ranges { bidi_m_builder.add_range_u32(&(start..=end)); @@ -53,14 +51,13 @@ impl DataProvider for crate::DatagenProvider { let 
bidi_m_cpinvlist = bidi_m_builder.build(); // bmg / Bidi_Mirroring_Glyph - let bmg_data = &get_code_point_prop_map(&self.source, "bmg")?.code_point_trie; + let bmg_data = &self.get_code_point_prop_map("bmg")?.code_point_trie; let bmg_trie = CodePointTrie::try_from(bmg_data).map_err(|e| { DataError::custom("Could not parse CodePointTrie TOML").with_display_context(&e) })?; // bpt / Bidi_Paired_Bracket_Type - let bpt_data = - &enum_codepointtrie::get_enumerated_prop(&self.source, "bpt")?.code_point_trie; + let bpt_data = &self.get_enumerated_prop("bpt")?.code_point_trie; let bpt_trie = CodePointTrie::try_from(bpt_data).map_err(|e| { DataError::custom("Could not parse CodePointTrie TOML").with_display_context(&e) })?; @@ -125,7 +122,7 @@ mod tests { #[test] fn test_bidi_data_provider() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let payload: DataPayload = provider .load(Default::default()) diff --git a/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs b/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs index b393b278e04..e5bfcc2c605 100644 --- a/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs +++ b/provider/datagen/src/transform/icuexport/uprops/bin_cp_set.rs @@ -2,26 +2,27 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use crate::SourceData; use icu_collections::codepointinvlist::CodePointInversionListBuilder; use icu_properties::provider::*; use icu_provider::datagen::*; use icu_provider::prelude::*; -// get the source data for a Unicode binary property that only defines values for code points -pub(crate) fn get_binary_prop_for_code_point_set<'a>( - source: &'a SourceData, - key: &str, -) -> Result<&'a super::uprops_serde::binary::BinaryProperty, DataError> { - source - .icuexport()? 
- .read_and_parse_toml::(&format!( - "uprops/{}/{}.toml", - source.trie_type, key - ))? - .binary_property - .get(0) - .ok_or_else(|| DataErrorKind::MissingDataKey.into_error()) +impl crate::DatagenProvider { + // get the source data for a Unicode binary property that only defines values for code points + pub(crate) fn get_binary_prop_for_code_point_set<'a>( + &'a self, + key: &str, + ) -> Result<&'a super::uprops_serde::binary::BinaryProperty, DataError> { + self.icuexport()? + .read_and_parse_toml::(&format!( + "uprops/{}/{}.toml", + self.trie_type(), + key + ))? + .binary_property + .get(0) + .ok_or_else(|| DataErrorKind::MissingDataKey.into_error()) + } } macro_rules! expand { @@ -33,7 +34,7 @@ macro_rules! expand { req: DataRequest, ) -> Result, DataError> { self.check_req::<$marker>(req)?; - let data = get_binary_prop_for_code_point_set(&self.source, $prop_name)?; + let data = self.get_binary_prop_for_code_point_set($prop_name)?; let mut builder = CodePointInversionListBuilder::new(); for (start, end) in &data.ranges { @@ -54,7 +55,7 @@ macro_rules! 
expand { fn supported_locales( &self, ) -> Result, DataError> { - get_binary_prop_for_code_point_set(&self.source, $prop_name)?; + self.get_binary_prop_for_code_point_set($prop_name)?; Ok(vec![Default::default()]) } @@ -137,7 +138,7 @@ fn test_basic() { use icu_properties::provider::PropertyCodePointSetV1; use icu_properties::provider::WhiteSpaceV1Marker; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let payload: DataPayload = provider .load(Default::default()) diff --git a/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs b/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs index 93d50ae2855..2afb8171f7c 100644 --- a/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs +++ b/provider/datagen/src/transform/icuexport/uprops/bin_uniset.rs @@ -2,7 +2,6 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use crate::SourceData; use icu_collections::codepointinvlist::CodePointInversionListBuilder; use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList; use icu_properties::provider::*; @@ -10,19 +9,21 @@ use icu_provider::datagen::*; use icu_provider::prelude::*; use zerovec::VarZeroVec; -fn get_binary_prop_for_unicodeset<'a>( - source: &'a SourceData, - key: &str, -) -> Result<&'a super::uprops_serde::binary::BinaryProperty, DataError> { - source - .icuexport()? - .read_and_parse_toml::(&format!( - "uprops/{}/{}.toml", - source.trie_type, key - ))? - .binary_property - .get(0) - .ok_or_else(|| DataErrorKind::MissingDataKey.into_error()) +impl crate::DatagenProvider { + fn get_binary_prop_for_unicodeset<'a>( + &'a self, + key: &str, + ) -> Result<&'a super::uprops_serde::binary::BinaryProperty, DataError> { + self.icuexport()? + .read_and_parse_toml::(&format!( + "uprops/{}/{}.toml", + self.trie_type(), + key + ))? 
+ .binary_property + .get(0) + .ok_or_else(|| DataErrorKind::MissingDataKey.into_error()) + } } macro_rules! expand { @@ -34,7 +35,7 @@ macro_rules! expand { req: DataRequest, ) -> Result, DataError> { self.check_req::<$marker>(req)?; - let data = get_binary_prop_for_unicodeset(&self.source, $prop_name)?; + let data = self.get_binary_prop_for_unicodeset($prop_name)?; let mut builder = CodePointInversionListBuilder::new(); for (start, end) in &data.ranges { @@ -61,7 +62,7 @@ macro_rules! expand { fn supported_locales( &self, ) -> Result, DataError> { - get_binary_prop_for_unicodeset(&self.source, $prop_name)?; + self.get_binary_prop_for_unicodeset($prop_name)?; Ok(vec![Default::default()]) } @@ -78,7 +79,7 @@ fn test_basic() { use icu_properties::provider::BasicEmojiV1Marker; use icu_properties::provider::PropertyUnicodeSetV1; - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let payload: DataPayload = provider .load(Default::default()) diff --git a/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs b/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs index 30bdb277272..11cc83326e6 100644 --- a/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs +++ b/provider/datagen/src/transform/icuexport/uprops/enum_codepointtrie.rs @@ -3,7 +3,6 @@ // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use crate::transform::icuexport::uprops::uprops_serde::enumerated::EnumeratedPropertyMap; -use crate::SourceData; use icu_collections::codepointtrie::CodePointTrie; use icu_properties::provider::{names::*, *}; use icu_provider::datagen::*; @@ -12,19 +11,38 @@ use std::collections::BTreeMap; use std::convert::TryFrom; use tinystr::TinyStr4; -pub(crate) fn get_enumerated_prop<'a>( - source: &'a SourceData, - key: &str, -) -> Result<&'a super::uprops_serde::enumerated::EnumeratedPropertyMap, DataError> { - source - .icuexport()? 
- .read_and_parse_toml::(&format!( - "uprops/{}/{}.toml", - source.trie_type, key - ))? - .enum_property - .get(0) - .ok_or_else(|| DataErrorKind::MissingDataKey.into_error()) +impl crate::DatagenProvider { + pub(crate) fn get_enumerated_prop<'a>( + &'a self, + key: &str, + ) -> Result<&'a super::uprops_serde::enumerated::EnumeratedPropertyMap, DataError> { + self.icuexport()? + .read_and_parse_toml::(&format!( + "uprops/{}/{}.toml", + self.trie_type(), + key + ))? + .enum_property + .get(0) + .ok_or_else(|| DataErrorKind::MissingDataKey.into_error()) + } + fn get_mask_prop<'a>( + &'a self, + key: &str, + ) -> Result<&'a super::uprops_serde::mask::MaskPropertyMap, DataError> { + self.icuexport()? + .read_and_parse_toml::(&format!( + "uprops/{}/{}.toml", + self.trie_type(), + key + ))? + .mask_property + .get(0) + .ok_or(DataError::custom( + "Loading icuexport property data failed: \ + Are you using a sufficiently recent icuexport? (Must be ⪈ 72.1)", + )) + } } fn get_prop_values_map( @@ -136,7 +154,7 @@ fn load_values_to_names_sparse( where M: DataMarker>, { - let data = get_enumerated_prop(&p.source, prop_name) + let data = p.get_enumerated_prop(prop_name) .map_err(|_| DataError::custom("Loading icuexport property data failed: \ Are you using a sufficiently recent icuexport? (Must be ⪈ 72.1)"))?; let map = load_values_to_names(data, is_short)?; @@ -157,7 +175,7 @@ fn load_values_to_names_linear( where M: DataMarker>, { - let data = get_enumerated_prop(&p.source, prop_name) + let data = p.get_enumerated_prop(prop_name) .map_err(|_| DataError::custom("Loading icuexport property data failed: \ Are you using a sufficiently recent icuexport? 
(Must be ⪈ 72.1)"))?; let map = load_values_to_names(data, is_short)?; @@ -179,7 +197,7 @@ fn load_values_to_names_linear4( where M: DataMarker>, { - let data = get_enumerated_prop(&p.source, prop_name) + let data = p.get_enumerated_prop(prop_name) .map_err(|_| DataError::custom("Loading icuexport property data failed: \ Are you using a sufficiently recent icuexport? (Must be ⪈ 72.1)"))?; let map = load_values_to_names(data, is_short)?; @@ -212,7 +230,7 @@ macro_rules! expand { { fn load(&self, req: DataRequest) -> Result, DataError> { self.check_req::<$marker>(req)?; - let source_cpt_data = &get_enumerated_prop(&self.source, $prop_name)?.code_point_trie; + let source_cpt_data = &self.get_enumerated_prop($prop_name)?.code_point_trie; let code_point_trie = CodePointTrie::try_from(source_cpt_data).map_err(|e| { DataError::custom("Could not parse CodePointTrie TOML").with_display_context(&e) @@ -229,7 +247,7 @@ macro_rules! expand { fn supported_locales( &self, ) -> Result, DataError> { - get_enumerated_prop(&self.source, $prop_name)?; + self.get_enumerated_prop($prop_name)?; Ok(vec![Default::default()]) } } @@ -238,7 +256,7 @@ macro_rules! expand { { fn load(&self, req: DataRequest) -> Result, DataError> { self.check_req::<$marker_n2e>(req)?; - let data = get_enumerated_prop(&self.source, $prop_name) + let data = self.get_enumerated_prop($prop_name) .map_err(|_| DataError::custom("Loading icuexport property data failed: \ Are you using a sufficiently recent icuexport? (Must be ⪈ 72.1)"))?; @@ -254,7 +272,7 @@ macro_rules! expand { fn supported_locales( &self, ) -> Result, DataError> { - get_enumerated_prop(&self.source, $prop_name)?; + self.get_enumerated_prop($prop_name)?; Ok(vec![Default::default()]) } } @@ -272,7 +290,7 @@ macro_rules! expand { fn supported_locales( &self, ) -> Result, DataError> { - get_enumerated_prop(&self.source, $prop_name)?; + self.get_enumerated_prop($prop_name)?; Ok(vec![Default::default()]) } } @@ -289,7 +307,7 @@ macro_rules! 
expand { fn supported_locales( &self, ) -> Result, DataError> { - get_enumerated_prop(&self.source, $prop_name)?; + self.get_enumerated_prop($prop_name)?; Ok(vec![Default::default()]) } } @@ -308,7 +326,7 @@ macro_rules! expand { fn supported_locales( &self, ) -> Result, DataError> { - get_enumerated_prop(&self.source, $prop_name)?; + self.get_enumerated_prop($prop_name)?; Ok(vec![Default::default()]) } } @@ -325,7 +343,7 @@ macro_rules! expand { fn supported_locales( &self, ) -> Result, DataError> { - get_enumerated_prop(&self.source, $prop_name)?; + self.get_enumerated_prop($prop_name)?; Ok(vec![Default::default()]) } } @@ -344,7 +362,7 @@ macro_rules! expand { fn supported_locales( &self, ) -> Result, DataError> { - get_enumerated_prop(&self.source, $prop_name)?; + self.get_enumerated_prop($prop_name)?; Ok(vec![Default::default()]) } } @@ -362,7 +380,7 @@ macro_rules! expand { fn supported_locales( &self, ) -> Result, DataError> { - get_enumerated_prop(&self.source, $prop_name)?; + self.get_enumerated_prop($prop_name)?; Ok(vec![Default::default()]) } } @@ -371,22 +389,6 @@ macro_rules! expand { }; } -fn get_mask_prop<'a>( - source: &'a SourceData, - key: &str, -) -> Result<&'a super::uprops_serde::mask::MaskPropertyMap, DataError> { - source - .icuexport()? - .read_and_parse_toml::(&format!( - "uprops/{}/{}.toml", - source.trie_type, - key - ))? - .mask_property - .get(0) - .ok_or(DataError::custom("Loading icuexport property data failed: \ - Are you using a sufficiently recent icuexport? 
(Must be ⪈ 72.1)")) -} // Special handling for GeneralCategoryMask impl DataProvider for crate::DatagenProvider { fn load( @@ -398,7 +400,7 @@ impl DataProvider for crate::DatagenProv self.check_req::(req)?; - let data = get_mask_prop(&self.source, "gcm")?; + let data = self.get_mask_prop("gcm")?; let data_struct = get_prop_values_map(&data.values, |v| { let value: GeneralCategoryGroup = v.into(); let ule = value.to_unaligned(); @@ -419,7 +421,7 @@ impl DataProvider for crate::DatagenProv impl IterableDataProvider for crate::DatagenProvider { fn supported_locales(&self) -> Result, DataError> { - get_mask_prop(&self.source, "gcm")?; + self.get_mask_prop("gcm")?; Ok(vec![Default::default()]) } } @@ -523,7 +525,7 @@ mod tests { // the ICU CodePointTrie that ICU4X is reading from. #[test] fn test_general_category() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let payload: DataPayload = provider .load(Default::default()) @@ -541,7 +543,7 @@ mod tests { #[test] fn test_script() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let payload: DataPayload = provider .load(Default::default()) diff --git a/provider/datagen/src/transform/icuexport/uprops/script.rs b/provider/datagen/src/transform/icuexport/uprops/script.rs index 5f281e26fa3..d18976be14f 100644 --- a/provider/datagen/src/transform/icuexport/uprops/script.rs +++ b/provider/datagen/src/transform/icuexport/uprops/script.rs @@ -21,11 +21,10 @@ impl DataProvider for crate::DatagenProvid ) -> Result, DataError> { self.check_req::(req)?; let scx_data = self - .source .icuexport()? .read_and_parse_toml::(&format!( "uprops/{}/scx.toml", - self.source.trie_type, + self.trie_type(), ))? 
.script_extensions .get(0) @@ -73,7 +72,7 @@ mod tests { #[test] fn test_script_val_from_script_extensions() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let payload: DataPayload = provider .load(Default::default()) @@ -93,7 +92,7 @@ mod tests { #[test] fn test_scx_array_from_script_extensions() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let payload: DataPayload = provider .load(Default::default()) @@ -167,7 +166,7 @@ mod tests { #[test] fn test_has_script() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let payload: DataPayload = provider .load(Default::default()) @@ -247,7 +246,7 @@ mod tests { #[test] fn test_get_script_extensions_set() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let payload: DataPayload = provider .load(Default::default()) diff --git a/provider/datagen/src/transform/mod.rs b/provider/datagen/src/transform/mod.rs index cda63311103..d514e83f7fa 100644 --- a/provider/datagen/src/transform/mod.rs +++ b/provider/datagen/src/transform/mod.rs @@ -43,7 +43,7 @@ impl DatagenProvider { #[test] fn test_missing_locale() { use icu_locid::langid; - let provider = DatagenProvider::for_test(); + let provider = DatagenProvider::latest_tested_offline_subset(); assert!(DataProvider::::load( &provider, DataRequest { diff --git a/provider/datagen/src/transform/segmenter/dictionary.rs b/provider/datagen/src/transform/segmenter/dictionary.rs index 0d784c97cd4..32e445d61fb 100644 --- a/provider/datagen/src/transform/segmenter/dictionary.rs +++ b/provider/datagen/src/transform/segmenter/dictionary.rs @@ -2,7 +2,6 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: 
https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use crate::options; use icu_locid::{langid, locale}; use icu_provider::datagen::IterableDataProvider; use icu_provider::prelude::*; @@ -38,23 +37,6 @@ pub(crate) fn data_locale_to_model_name(locale: &DataLocale) -> Option<&'static } } -pub(crate) fn filter_data_locales( - locales: HashSet, - segmenter_models: &options::SegmenterModelInclude, -) -> HashSet { - match segmenter_models { - options::SegmenterModelInclude::Recommended => locales, - options::SegmenterModelInclude::None => Default::default(), - options::SegmenterModelInclude::Explicit(list) => locales - .into_iter() - .filter(|locale| { - list.iter() - .any(|x| Some(x.as_str()) == data_locale_to_model_name(locale)) - }) - .collect(), - } -} - impl crate::DatagenProvider { fn load_dictionary_data( &self, @@ -65,19 +47,22 @@ impl crate::DatagenProvider { let filename = format!("segmenter/dictionary/{model}.toml"); - let toml_data: &SegmenterDictionaryData = self - .source + let toml_data = self .icuexport() - .and_then(|e| e.read_and_parse_toml(&filename)) - .or_else(|e| { - self.source - .icuexport_fallback() - .read_and_parse_toml(&filename) - .map_err(|_| e) - })?; + .and_then(|e| e.read_and_parse_toml::(&filename)); + + #[cfg(feature = "legacy_api")] + #[allow(deprecated)] + let toml_data = toml_data.or_else(|e| { + self.source + .icuexport_dictionary_fallback + .as_ref() + .ok_or(e)? + .read_and_parse_toml(&filename) + }); Ok(UCharDictionaryBreakDataV1 { - trie_data: ZeroVec::alloc_from_slice(&toml_data.trie_data), + trie_data: ZeroVec::alloc_from_slice(&toml_data?.trie_data), }) } } diff --git a/provider/datagen/src/transform/segmenter/lstm.rs b/provider/datagen/src/transform/segmenter/lstm.rs index 38a9ee3b6d3..1de36627ecf 100644 --- a/provider/datagen/src/transform/segmenter/lstm.rs +++ b/provider/datagen/src/transform/segmenter/lstm.rs @@ -4,7 +4,6 @@ //! This module contains provider implementations backed by LSTM segmentation data. 
-use crate::options; use icu_locid::langid; use icu_provider::datagen::IterableDataProvider; use icu_provider::prelude::*; @@ -203,23 +202,6 @@ pub(crate) fn data_locale_to_model_name(locale: &DataLocale) -> Option<&'static } } -pub(crate) fn filter_data_locales( - locales: HashSet, - segmenter_models: &options::SegmenterModelInclude, -) -> HashSet { - match &segmenter_models { - options::SegmenterModelInclude::Recommended => locales, - options::SegmenterModelInclude::None => Default::default(), - options::SegmenterModelInclude::Explicit(list) => locales - .into_iter() - .filter(|locale| { - list.iter() - .any(|x| Some(x.as_str()) == data_locale_to_model_name(locale)) - }) - .collect(), - } -} - impl DataProvider for crate::DatagenProvider { fn load( &self, @@ -231,7 +213,6 @@ impl DataProvider for crate::DatagenProvider { .ok_or(DataErrorKind::MissingLocale.with_req(LstmForWordLineAutoV1Marker::KEY, req))?; let lstm_data = self - .source .segmenter_lstm()? .read_and_parse_json::(&format!("{model}/weights.json")) .map_err(|_| DataErrorKind::MissingLocale.into_error())?; @@ -268,9 +249,8 @@ mod tests { #[test] fn thai_word_break_with_grapheme_model() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let raw_data = provider - .source .segmenter_lstm() .unwrap() .read_and_parse_json::("Thai_graphclust_model4_heavy/weights.json") diff --git a/provider/datagen/src/transform/segmenter/mod.rs b/provider/datagen/src/transform/segmenter/mod.rs index b7702712dc6..317a0dab42d 100644 --- a/provider/datagen/src/transform/segmenter/mod.rs +++ b/provider/datagen/src/transform/segmenter/mod.rs @@ -585,9 +585,9 @@ impl crate::DatagenProvider { data: CodePointTrieBuilderData::ValuesByCodePoint(&properties_map), default_value: 0, error_value: 0, - trie_type: match self.source.trie_type { - crate::source::TrieType::Fast => icu_collections::codepointtrie::TrieType::Fast, - crate::source::TrieType::Small => 
icu_collections::codepointtrie::TrieType::Small, + trie_type: match self.trie_type() { + crate::TrieType::Fast => icu_collections::codepointtrie::TrieType::Fast, + crate::TrieType::Small => icu_collections::codepointtrie::TrieType::Small, }, } .build(); @@ -654,7 +654,7 @@ macro_rules! implement { return Ok(DataResponse { metadata: DataResponseMetadata::default(), payload: Some(DataPayload::from_owned( - self.generate_rule_break_data(include_str!(concat!("../../../data/segmenter/rules/", $rules))), + self.generate_rule_break_data(include_str!(concat!("../../../data/segmenter_rules/", $rules))), )), }); } @@ -679,7 +679,7 @@ mod tests { #[test] fn load_grapheme_cluster_data() { - let provider = crate::DatagenProvider::for_test(); + let provider = crate::DatagenProvider::latest_tested_offline_subset(); let payload: DataPayload = provider .load(Default::default()) .expect("Loading should succeed!") diff --git a/provider/datagen/data/segmenter/dictionary/burmesedict.toml b/provider/datagen/tests/data/icuexport/segmenter/dictionary/burmesedict.toml similarity index 100% rename from provider/datagen/data/segmenter/dictionary/burmesedict.toml rename to provider/datagen/tests/data/icuexport/segmenter/dictionary/burmesedict.toml diff --git a/provider/datagen/data/segmenter/dictionary/cjdict.toml b/provider/datagen/tests/data/icuexport/segmenter/dictionary/cjdict.toml similarity index 100% rename from provider/datagen/data/segmenter/dictionary/cjdict.toml rename to provider/datagen/tests/data/icuexport/segmenter/dictionary/cjdict.toml diff --git a/provider/datagen/data/segmenter/dictionary/khmerdict.toml b/provider/datagen/tests/data/icuexport/segmenter/dictionary/khmerdict.toml similarity index 100% rename from provider/datagen/data/segmenter/dictionary/khmerdict.toml rename to provider/datagen/tests/data/icuexport/segmenter/dictionary/khmerdict.toml diff --git a/provider/datagen/data/segmenter/dictionary/laodict.toml 
b/provider/datagen/tests/data/icuexport/segmenter/dictionary/laodict.toml similarity index 100% rename from provider/datagen/data/segmenter/dictionary/laodict.toml rename to provider/datagen/tests/data/icuexport/segmenter/dictionary/laodict.toml diff --git a/provider/datagen/data/segmenter/dictionary/thaidict.toml b/provider/datagen/tests/data/icuexport/segmenter/dictionary/thaidict.toml similarity index 100% rename from provider/datagen/data/segmenter/dictionary/thaidict.toml rename to provider/datagen/tests/data/icuexport/segmenter/dictionary/thaidict.toml diff --git a/provider/datagen/data/lstm/Burmese_codepoints_exclusive_model4_heavy/weights.json b/provider/datagen/tests/data/lstm/Burmese_codepoints_exclusive_model4_heavy/weights.json similarity index 100% rename from provider/datagen/data/lstm/Burmese_codepoints_exclusive_model4_heavy/weights.json rename to provider/datagen/tests/data/lstm/Burmese_codepoints_exclusive_model4_heavy/weights.json diff --git a/provider/datagen/data/lstm/Khmer_codepoints_exclusive_model4_heavy/weights.json b/provider/datagen/tests/data/lstm/Khmer_codepoints_exclusive_model4_heavy/weights.json similarity index 100% rename from provider/datagen/data/lstm/Khmer_codepoints_exclusive_model4_heavy/weights.json rename to provider/datagen/tests/data/lstm/Khmer_codepoints_exclusive_model4_heavy/weights.json diff --git a/provider/datagen/data/lstm/Lao_codepoints_exclusive_model4_heavy/weights.json b/provider/datagen/tests/data/lstm/Lao_codepoints_exclusive_model4_heavy/weights.json similarity index 100% rename from provider/datagen/data/lstm/Lao_codepoints_exclusive_model4_heavy/weights.json rename to provider/datagen/tests/data/lstm/Lao_codepoints_exclusive_model4_heavy/weights.json diff --git a/provider/datagen/data/lstm/Thai_codepoints_exclusive_model4_heavy/weights.json b/provider/datagen/tests/data/lstm/Thai_codepoints_exclusive_model4_heavy/weights.json similarity index 100% rename from 
provider/datagen/data/lstm/Thai_codepoints_exclusive_model4_heavy/weights.json rename to provider/datagen/tests/data/lstm/Thai_codepoints_exclusive_model4_heavy/weights.json diff --git a/provider/datagen/data/lstm/Thai_graphclust_model4_heavy/weights.json b/provider/datagen/tests/data/lstm/Thai_graphclust_model4_heavy/weights.json similarity index 100% rename from provider/datagen/data/lstm/Thai_graphclust_model4_heavy/weights.json rename to provider/datagen/tests/data/lstm/Thai_graphclust_model4_heavy/weights.json diff --git a/provider/datagen/tests/make-testdata.rs b/provider/datagen/tests/make-testdata.rs index 4a1704d4a4f..4356b98aa6e 100644 --- a/provider/datagen/tests/make-testdata.rs +++ b/provider/datagen/tests/make-testdata.rs @@ -28,10 +28,12 @@ fn generate_json_and_verify_postcard() { let data_root = Path::new(concat!(core::env!("CARGO_MANIFEST_DIR"), "/tests/data/")); - let source = SourceData::offline() - .with_cldr(data_root.join("cldr"), Default::default()) + let source = DatagenProvider::default() + .with_cldr(data_root.join("cldr")) .unwrap() .with_icuexport(data_root.join("icuexport")) + .unwrap() + .with_segmenter_lstm(data_root.join("lstm")) .unwrap(); let json_out = Box::new( @@ -54,16 +56,14 @@ fn generate_json_and_verify_postcard() { ), }); - let mut options = options::Options::default(); - options.keys = icu_datagen::all_keys().into_iter().collect(); - options.locales = options::LocaleInclude::Explicit(LOCALES.iter().cloned().collect()); - options.segmenter_models = options::SegmenterModelInclude::Explicit(vec![ - "thaidict".into(), - "Thai_codepoints_exclusive_model4_heavy".into(), - ]); - - DatagenProvider::new(source) - .export(options, MultiExporter::new(vec![json_out, postcard_out])) + DatagenDriver::new() + .with_keys(icu_datagen::all_keys()) + .with_locales(LOCALES.iter().cloned()) + .with_segmenter_models(vec![ + "thaidict".into(), + "Thai_codepoints_exclusive_model4_heavy".into(), + ]) + .export(&source, 
MultiExporter::new(vec![json_out, postcard_out])) .unwrap(); } diff --git a/provider/datagen/tests/test-options.rs b/provider/datagen/tests/test-options.rs index f828e4de79f..9e9de5031a0 100644 --- a/provider/datagen/tests/test-options.rs +++ b/provider/datagen/tests/test-options.rs @@ -6,11 +6,9 @@ use std::collections::{BTreeMap, HashSet}; use std::path::Path; use elsa::sync::FrozenMap; -use icu_datagen::options::{FallbackMode, LocaleInclude, Options}; -use icu_datagen::{DatagenProvider, SourceData}; +use icu_datagen::prelude::*; use icu_decimal::provider::DecimalSymbolsV1Marker; -use icu_locid::{langid, LanguageIdentifier}; -use icu_provider::datagen::{DataExporter, ExportMarker}; +use icu_provider::datagen::ExportMarker; use icu_provider::prelude::*; use postcard::ser_flavors::{AllocVec, Flavor}; use writeable::Writeable; @@ -64,18 +62,15 @@ fn test_fallback_options() { let data_root = Path::new(concat!(core::env!("CARGO_MANIFEST_DIR"), "/tests/data/")); - let source = SourceData::offline() - .with_cldr(data_root.join("cldr"), Default::default()) + let provider = DatagenProvider::default() + .with_cldr(data_root.join("cldr")) .unwrap() .with_icuexport(data_root.join("icuexport")) .unwrap(); - let decimal_symbols_key: HashSet = [DecimalSymbolsV1Marker::KEY].into_iter().collect(); - let mut testing_exporter = TestingExporter::default(); - let mut options = Options::default(); - options.keys = decimal_symbols_key.clone(); + let driver = DatagenDriver::new().with_keys([DecimalSymbolsV1Marker::KEY]); let explicit_locales: HashSet = [ langid!("arc"), // Aramaic, not in CLDR @@ -91,10 +86,11 @@ fn test_fallback_options() { // // All+Hybrid // - options.locales = LocaleInclude::All; - options.fallback = FallbackMode::Hybrid; - DatagenProvider::new(source.clone()) - .export(options.clone(), &mut testing_exporter) + driver + .clone() + .with_all_locales() + .with_fallback_mode(FallbackMode::Hybrid) + .export(&provider, &mut testing_exporter) .unwrap(); let 
data_all_hybrid = testing_exporter.take_map_and_reset(); @@ -133,9 +129,11 @@ fn test_fallback_options() { // All+Runtime // - options.fallback = FallbackMode::RuntimeManual; - DatagenProvider::new(source.clone()) - .export(options.clone(), &mut testing_exporter) + driver + .clone() + .with_all_locales() + .with_fallback_mode(FallbackMode::RuntimeManual) + .export(&provider, &mut testing_exporter) .unwrap(); let data_all_runtime = testing_exporter.take_map_and_reset(); @@ -207,10 +205,11 @@ fn test_fallback_options() { // Explicit+Hybrid // - options.locales = LocaleInclude::Explicit(explicit_locales.clone()); - options.fallback = FallbackMode::Hybrid; - DatagenProvider::new(source.clone()) - .export(options.clone(), &mut testing_exporter) + driver + .clone() + .with_locales(explicit_locales.clone()) + .with_fallback_mode(FallbackMode::Hybrid) + .export(&provider, &mut testing_exporter) .unwrap(); let data_explicit_hybrid = testing_exporter.take_map_and_reset(); @@ -242,10 +241,11 @@ fn test_fallback_options() { // Explicit+Runtime // - options.locales = LocaleInclude::Explicit(explicit_locales.clone()); - options.fallback = FallbackMode::RuntimeManual; - DatagenProvider::new(source.clone()) - .export(options.clone(), &mut testing_exporter) + driver + .clone() + .with_locales(explicit_locales.clone()) + .with_fallback_mode(FallbackMode::RuntimeManual) + .export(&provider, &mut testing_exporter) .unwrap(); let data_explicit_runtime = testing_exporter.take_map_and_reset(); @@ -278,10 +278,11 @@ fn test_fallback_options() { // Explicit+Preresolved // - options.locales = LocaleInclude::Explicit(explicit_locales.clone()); - options.fallback = FallbackMode::Preresolved; - DatagenProvider { source } - .export(options, &mut testing_exporter) + driver + .clone() + .with_locales(explicit_locales.clone()) + .with_fallback_mode(FallbackMode::Preresolved) + .export(&provider, &mut testing_exporter) .unwrap(); let data_explicit_preresolved = 
testing_exporter.take_map_and_reset(); diff --git a/provider/fs/src/export/mod.rs b/provider/fs/src/export/mod.rs index b79a7407116..5459e2eb0f9 100644 --- a/provider/fs/src/export/mod.rs +++ b/provider/fs/src/export/mod.rs @@ -23,14 +23,10 @@ //! .expect("Should successfully initialize data output directory"); //! //! // Export something -//! DatagenProvider::default() -//! .export({ -//! let mut options = options::Options::default(); -//! options.keys = [icu_provider::hello_world::HelloWorldV1Marker::KEY].into_iter().collect(); -//! options -//! }, -//! exporter -//! ).unwrap(); +//! DatagenDriver::new() +//! .with_keys([icu_provider::hello_world::HelloWorldV1Marker::KEY]) +//! .export(&DatagenProvider::latest_tested(), exporter) +//! .unwrap(); //! # //! # let _ = std::fs::remove_dir_all(&demo_path); //! ``` diff --git a/provider/fs/tests/data/bincode.json b/provider/fs/tests/data/bincode.json index 20cec59afb1..69284821771 100644 --- a/provider/fs/tests/data/bincode.json +++ b/provider/fs/tests/data/bincode.json @@ -1,18 +1,19 @@ { - "keys": { - "Explicit": [ - "core/helloworld@1" - ] - }, - "locales": "All", - "cldr": "None", - "icu_export": "None", - "segmenter_lstm": "None", - "export": { - "Fs": { - "path": "bincode", - "syntax": "Bincode" - } - }, - "overwrite": true - } \ No newline at end of file + "keys": { + "explicit": [ + "core/helloworld@1" + ] + }, + "fallback": "hybrid", + "locales": "all", + "cldr": "none", + "icuExport": "none", + "segmenterLstm": "none", + "export": { + "fileSystem": { + "path": "bincode", + "syntax": "bincode" + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/provider/fs/tests/data/json.json b/provider/fs/tests/data/json.json index 6711841894f..256514211f8 100644 --- a/provider/fs/tests/data/json.json +++ b/provider/fs/tests/data/json.json @@ -1,18 +1,19 @@ { - "keys": { - "Explicit": [ - "core/helloworld@1" - ] - }, - "locales": "All", - "cldr": "None", - "icu_export": "None", - "segmenter_lstm": 
"None", - "export": { - "Fs": { - "path": "json", - "syntax": "Json" - } - }, - "overwrite": true - } \ No newline at end of file + "keys": { + "explicit": [ + "core/helloworld@1" + ] + }, + "fallback": "hybrid", + "locales": "all", + "cldr": "none", + "icuExport": "none", + "segmenterLstm": "none", + "export": { + "fileSystem": { + "path": "json", + "syntax": "json" + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/provider/fs/tests/data/postcard.json b/provider/fs/tests/data/postcard.json index 3e118721f8d..fee3f3117ae 100644 --- a/provider/fs/tests/data/postcard.json +++ b/provider/fs/tests/data/postcard.json @@ -1,18 +1,19 @@ { - "keys": { - "Explicit": [ - "core/helloworld@1" - ] - }, - "locales": "All", - "cldr": "None", - "icu_export": "None", - "segmenter_lstm": "None", - "export": { - "Fs": { - "path": "postcard", - "syntax": "Postcard" - } - }, - "overwrite": true - } \ No newline at end of file + "keys": { + "explicit": [ + "core/helloworld@1" + ] + }, + "fallback": "hybrid", + "locales": "all", + "cldr": "none", + "icuExport": "none", + "segmenterLstm": "none", + "export": { + "fileSystem": { + "path": "postcard", + "syntax": "postcard" + } + }, + "overwrite": true +} \ No newline at end of file diff --git a/provider/testdata/src/lib.rs b/provider/testdata/src/lib.rs index 6b5e6666383..8dfba69f112 100644 --- a/provider/testdata/src/lib.rs +++ b/provider/testdata/src/lib.rs @@ -100,18 +100,6 @@ pub mod versions { pub fn icu_tag() -> alloc::string::String { alloc::string::String::from(super::metadata::ICUEXPORT_TAG) } - - /// Gets the segmenter LSTM tag used as the test data source - /// - /// # Examples - /// - /// ``` - /// assert_eq!("v0.1.0", icu_testdata::versions::segmenter_lstm_tag()); - /// ``` - #[deprecated(since = "1.3.0", note = "use `compiled_data`")] - pub fn segmenter_lstm_tag() -> alloc::string::String { - alloc::string::String::from(super::metadata::SEGMENTER_LSTM_TAG) - } } /// Gets the locales supported by the 
test data. diff --git a/tools/testdata-scripts/globs.rs.data b/tools/testdata-scripts/globs.rs.data index d593790fbb3..d7cff13f007 100644 --- a/tools/testdata-scripts/globs.rs.data +++ b/tools/testdata-scripts/globs.rs.data @@ -85,6 +85,11 @@ const ICUEXPORTDATA_GLOB: &[&str] = &[ "norm/small/nfkd.toml", "norm/small/nfkdex.toml", "norm/small/uts46d.toml", + "segmenter/dictionary/burmesedict.toml", + "segmenter/dictionary/cjdict.toml", + "segmenter/dictionary/khmerdict.toml", + "segmenter/dictionary/laodict.toml", + "segmenter/dictionary/thaidict.toml", "ucase/small/ucase.toml", "uprops/small/AHex.toml", "uprops/small/alnum.toml", @@ -167,14 +172,6 @@ const ICUEXPORTDATA_GLOB: &[&str] = &[ "uprops/small/XIDS.toml", ]; -const ICUEXPORTDATA_SEGMENTER_GLOB: &[&str] = &[ - "segmenter/dictionary/burmesedict.toml", - "segmenter/dictionary/cjdict.toml", - "segmenter/dictionary/khmerdict.toml", - "segmenter/dictionary/laodict.toml", - "segmenter/dictionary/thaidict.toml" -]; - const LSTM_GLOB: &[&str] = &[ "Burmese_codepoints_exclusive_model4_heavy/weights.json", "Khmer_codepoints_exclusive_model4_heavy/weights.json", diff --git a/tools/testdata-scripts/src/bin/download-repo-sources.rs b/tools/testdata-scripts/src/bin/download-repo-sources.rs index 16565bbae01..9485560c6a9 100644 --- a/tools/testdata-scripts/src/bin/download-repo-sources.rs +++ b/tools/testdata-scripts/src/bin/download-repo-sources.rs @@ -4,7 +4,7 @@ use clap::{ArgAction, Parser}; use eyre::WrapErr; -use icu_datagen::SourceData; +use icu_datagen::DatagenProvider; use icu_locid::*; use icu_provider::DataError; use simple_logger::SimpleLogger; @@ -122,8 +122,8 @@ fn main() -> eyre::Result<()> { extract( cached(&format!( "https://github.com/unicode-org/cldr-json/releases/download/{}/cldr-{}-json-full.zip", - SourceData::LATEST_TESTED_CLDR_TAG, - SourceData::LATEST_TESTED_CLDR_TAG + DatagenProvider::LATEST_TESTED_CLDR_TAG, + DatagenProvider::LATEST_TESTED_CLDR_TAG )) .with_context(|| "Failed to download CLDR 
ZIP".to_owned())?, expand_paths(CLDR_JSON_GLOB, false), @@ -134,39 +134,23 @@ fn main() -> eyre::Result<()> { extract( cached(&format!( "https://github.com/unicode-org/icu/releases/download/{}/icuexportdata_{}.zip", - SourceData::LATEST_TESTED_ICUEXPORT_TAG, - SourceData::LATEST_TESTED_ICUEXPORT_TAG.replace('/', "-") + DatagenProvider::LATEST_TESTED_ICUEXPORT_TAG, + DatagenProvider::LATEST_TESTED_ICUEXPORT_TAG.replace('/', "-") )) .with_context(|| "Failed to download ICU ZIP".to_owned())?, expand_paths(ICUEXPORTDATA_GLOB, true), out_root.join("tests/data/icuexport"), )?; - std::fs::remove_dir_all(out_root.join("data/segmenter/dictionary"))?; - extract( - cached(&format!( - "https://github.com/unicode-org/icu/releases/download/{}/icuexportdata_{}.zip", - SourceData::LATEST_TESTED_ICUEXPORT_TAG, - SourceData::LATEST_TESTED_ICUEXPORT_TAG.replace('/', "-") - )) - .with_context(|| "Failed to download ICU ZIP".to_owned())?, - ICUEXPORTDATA_SEGMENTER_GLOB - .iter() - .copied() - .map(String::from) - .collect(), - out_root.join("data"), - )?; - - std::fs::remove_dir_all(out_root.join("data/lstm"))?; + std::fs::remove_dir_all(out_root.join("tests/data/lstm"))?; extract( cached(&format!( "https://github.com/unicode-org/lstm_word_segmentation/releases/download/{}/models.zip", - SourceData::LATEST_TESTED_SEGMENTER_LSTM_TAG, + DatagenProvider::LATEST_TESTED_SEGMENTER_LSTM_TAG, )) .with_context(|| "Failed to download LSTM ZIP".to_owned())?, LSTM_GLOB.iter().copied().map(String::from).collect(), - out_root.join("data/lstm"), + out_root.join("tests/data/lstm"), )?; Ok(()) diff --git a/tools/testdata-scripts/src/bin/make-testdata-legacy.rs b/tools/testdata-scripts/src/bin/make-testdata-legacy.rs index 1735cf91d97..eb97bfbf109 100644 --- a/tools/testdata-scripts/src/bin/make-testdata-legacy.rs +++ b/tools/testdata-scripts/src/bin/make-testdata-legacy.rs @@ -25,7 +25,7 @@ fn main() { std::fs::create_dir_all(data_root).unwrap(); - let source = SourceData::offline() + let source = 
SourceData::default() .with_cldr_latest(Default::default()) .unwrap() .with_icuexport_latest() @@ -73,7 +73,6 @@ fn main() { let locales = databake::Bake::bake(LOCALES, &Default::default()); let cldr_tag = SourceData::LATEST_TESTED_CLDR_TAG; let icu_tag = SourceData::LATEST_TESTED_ICUEXPORT_TAG; - let lstm_tag = SourceData::LATEST_TESTED_SEGMENTER_LSTM_TAG; metadata .write_all( @@ -81,7 +80,6 @@ fn main() { pub const LOCALES: &[icu_locid::LanguageIdentifier] = &#locales; pub const CLDR_TAG: &str = #cldr_tag; pub const ICUEXPORT_TAG: &str = #icu_tag; - pub const SEGMENTER_LSTM_TAG: &str = #lstm_tag; } .to_string() .as_bytes(),