Skip to content

Commit

Permalink
ICU-22481 Add toml support to gendict
Browse files Browse the repository at this point in the history
(and use it in CI)
  • Loading branch information
Manishearth authored and sffc committed Dec 28, 2023
1 parent 7eb56fe commit 74abcfe
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 34 deletions.
4 changes: 1 addition & 3 deletions .ci-builds/.azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -744,9 +744,7 @@ jobs:
mkdir -p icuexportdata/segmenter/dictionary
for FILE in `ls data/brkitr/dictionaries`
do
./bin/gendict --uchars data/brkitr/dictionaries/$FILE /dev/stdout | \
python3 -c 'import sys; data = sys.stdin.buffer.read(); print(f"trie_data = {[data[i + 1] << 8 | data[i] for i in range(64, len(data) - 1, 2)]}")' \
> icuexportdata/segmenter/dictionary/`basename $FILE .txt`.toml
./bin/gendict --uchars --toml data/brkitr/dictionaries/$FILE icuexportdata/segmenter/dictionary/`basename $FILE .txt`.toml
done
displayName: 'Build segmenter dictionary files'
env:
Expand Down
89 changes: 58 additions & 31 deletions icu4c/source/tools/gendict/gendict.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "ucbuf.h"
#include "toolutil.h"
#include "cstring.h"
#include "writesrc.h"

#include <stdio.h>
#include <stdlib.h>
Expand All @@ -58,7 +59,8 @@ static UOption options[]={
{ "uchars", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0}, /* 6 */
{ "bytes", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0}, /* 7 */
{ "transform", nullptr, nullptr, nullptr, '\1', UOPT_REQUIRES_ARG, 0}, /* 8 */
UOPTION_QUIET, /* 9 */
{ "toml", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0}, /* 9 */
UOPTION_QUIET, /* 10 */
};

enum arguments {
Expand All @@ -70,6 +72,7 @@ enum arguments {
ARG_UCHARS,
ARG_BYTES,
ARG_TRANSFORM,
ARG_TOML,
ARG_QUIET
};

Expand All @@ -90,7 +93,8 @@ static void usageAndDie(UErrorCode retCode) {
"\t--uchars output a UCharsTrie (mutually exclusive with -b!)\n"
"\t--bytes output a BytesTrie (mutually exclusive with -u!)\n"
"\t--transform the kind of transform to use (eg --transform offset-40A3,\n"
"\t which specifies an offset transform with constant 0x40A3)\n",
"\t which specifies an offset transform with constant 0x40A3)\n"
"\t--toml output the trie in toml format (default is binary),\n",
u_getDataDirectory());
exit(retCode);
}
Expand Down Expand Up @@ -293,6 +297,8 @@ int main(int argc, char **argv) {

IcuToolErrorCode status("gendict/main()");

UBool isToml = options[ARG_TOML].doesOccur;

#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
const char* outDir=nullptr;

Expand Down Expand Up @@ -417,38 +423,59 @@ int main(int argc, char **argv) {
exit(status.reset());
}
if (verbose) { puts("Opening output file..."); }
UNewDataMemory *pData = udata_create(nullptr, nullptr, outFileName, &dataInfo, copyright, status);
if (status.isFailure()) {
fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName());
exit(status.reset());
}

if (verbose) { puts("Writing to output file..."); }
int32_t indexes[DictionaryData::IX_COUNT] = {
DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0
};
int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
indexes[DictionaryData::IX_RESERVED1_OFFSET] = size;
indexes[DictionaryData::IX_RESERVED2_OFFSET] = size;
indexes[DictionaryData::IX_TOTAL_SIZE] = size;

indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS;
if (hasValues) {
indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES;
}
if (isToml) {
FILE* f = fopen(outFileName, "w");
if (f == nullptr) {
fprintf(stderr, "gendict: could not open output file \"%s\"\n", outFileName);
exit(status.reset());
}
fprintf(f, "trie_type = \"%s\"\n", isBytesTrie ? "bytes" : "uchars");
fprintf(f, "has_values = %s\n", hasValues ? "true" : "false");
int32_t transform = dict.getTransform();
bool isOffset = (transform & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET;
int32_t offset = transform & DictionaryData::TRANSFORM_OFFSET_MASK;
fprintf(f, "transform_type = \"%s\"\n", isOffset ? "offset" : "none");
fprintf(f, "transform_offset = %d\n", offset);

usrc_writeArray(f, "trie_data = [\n ", outData, isBytesTrie ? 8 : 16, outDataSize, " ", "\n]\n");

indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform();
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, outData, outDataSize);
size_t bytesWritten = udata_finish(pData, status);
if (status.isFailure()) {
fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName());
exit(status.reset());
}

if (bytesWritten != (size_t)size) {
fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
exit(U_INTERNAL_PROGRAM_ERROR);
fclose(f);
} else {
UNewDataMemory *pData = udata_create(nullptr, nullptr, outFileName, &dataInfo, copyright, status);
if (status.isFailure()) {
fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName());
exit(status.reset());
}

if (verbose) { puts("Writing to output file..."); }
int32_t indexes[DictionaryData::IX_COUNT] = {
DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0
};
int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
indexes[DictionaryData::IX_RESERVED1_OFFSET] = size;
indexes[DictionaryData::IX_RESERVED2_OFFSET] = size;
indexes[DictionaryData::IX_TOTAL_SIZE] = size;

indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS;
if (hasValues) {
indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES;
}

indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform();
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, outData, outDataSize);
size_t bytesWritten = udata_finish(pData, status);
if (status.isFailure()) {
fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName());
exit(status.reset());
}

if (bytesWritten != (size_t)size) {
fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
exit(U_INTERNAL_PROGRAM_ERROR);
}
}

if (!quiet) { printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime()); }
Expand Down

0 comments on commit 74abcfe

Please sign in to comment.