Skip to content

Commit

Permalink
NLP-ENGINE-413 String parsing problem for dictionary files
Browse files (browse the repository at this point in the history)
Signed-off-by: David de Hilster <david.dehilster@lexisnexisrisk.com>
  • Loading branch information
dehilsterlexis committed Dec 8, 2023
1 parent 16da29d commit d1d90d9
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 55 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.0.0)
cmake_minimum_required(VERSION 3.5.0)
project(nlp VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 17)

Expand Down
110 changes: 56 additions & 54 deletions cs/libconsh/cg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3567,42 +3567,7 @@ bool CG::readDict(std::string file) {
// get tokens
while (c) {
doNext = true;
if (c == '#') {
comment = true;
break;
}
else if (c == '\\') {
backSlash = true;
}
else if (unicu::isWhiteSpace(c)) {
if (inWord) {
lens[tokint] = e - begins[tokint] - 1;
tokint++;
inWord = false;
}
start = e - 1;
lastWhite = true;
}
else if (unicu::isSingle(c)) {
if (unicu::isEmoji(c)) {
int32_t eSave = e;
U8_NEXT(line, e, length, c);
if (unicu::isEmojiVariation(c) || unicu::isEmojiJoiner(c)) {
bool joining = false;
while (c && (unicu::isEmojiVariation(c) || unicu::isEmojiJoiner(c) || joining)) {
joining = unicu::isEmojiJoiner(c);
eSave = e;
U8_NEXT(line, e, length, c);
}
}
e = eSave;
doNext = false;
}
begins[tokint] = firstTime ? 0 : eLast;
lens[tokint] = e - eLast;
tokint++;
}
else if (c == '"') {
if (c == '"') {
if (backSlash && !inString) {
begins[tokint] = e - 2;
lens[tokint++] = 2;
Expand All @@ -3620,27 +3585,64 @@ bool CG::readDict(std::string file) {
}
backSlash = false;
}
else if (c != '_' && (unicu::isPunct(c) || c == '=')) {
if (inWord) {
lens[tokint] = e - begins[tokint] - 1;
else if (!inString) {
if (c == '#') {
comment = true;
break;
}
else if (c == '\\') {
backSlash = true;
}
else if (unicu::isWhiteSpace(c)) {
if (inWord) {
lens[tokint] = e - begins[tokint] - 1;
tokint++;
inWord = false;
}
start = e - 1;
lastWhite = true;
}
else if (unicu::isSingle(c)) {
if (unicu::isEmoji(c)) {
int32_t eSave = e;
U8_NEXT(line, e, length, c);
if (unicu::isEmojiVariation(c) || unicu::isEmojiJoiner(c)) {
bool joining = false;
while (c && (unicu::isEmojiVariation(c) || unicu::isEmojiJoiner(c) || joining)) {
joining = unicu::isEmojiJoiner(c);
eSave = e;
U8_NEXT(line, e, length, c);
}
}
e = eSave;
doNext = false;
}
begins[tokint] = firstTime ? 0 : eLast;
lens[tokint] = e - eLast;
tokint++;
}
if (c == '=' && eqSign == -1 && !backSlash) {
eqSign = tokint;
else if (c != '_' && (unicu::isPunct(c) || c == '=')) {
if (inWord) {
lens[tokint] = e - begins[tokint] - 1;
tokint++;
}
if (c == '=' && eqSign == -1 && !backSlash) {
eqSign = tokint;
}
begins[tokint] = backSlash ? e - 2 : e - 1;
lens[tokint++] = backSlash ? 2 : 1;
inWord = false;
lastWhite = false;
backSlash = false;
}
begins[tokint] = backSlash ? e - 2 : e - 1;
lens[tokint++] = backSlash ? 2 : 1;
inWord = false;
lastWhite = false;
backSlash = false;
}
else if (!inString) {
if (!inWord) {
inWord = true;
begins[tokint] = firstTime ? 0 : e - 1;
else {
if (!inWord) {
inWord = true;
begins[tokint] = firstTime ? 0 : e - 1;
}
lastWhite = false;
backSlash = false;
}
lastWhite = false;
backSlash = false;
}
eLast = e;
if (doNext)
Expand Down Expand Up @@ -4160,4 +4162,4 @@ if (!con)
return (CONCEPT *) kbm_->dict_next((CON*)con);
}

/********************** END OF FILE ***************************/
/********************** END OF FILE ***************************/

0 comments on commit d1d90d9

Please sign in to comment.