-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Convert hyphen and space variants to standard ASCII.
- Loading branch information
1 parent
b9ce261
commit 38a044d
Showing
3 changed files
with
34 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
// Generated by Claude 3.5 Sonnet | ||
/**
 * Rewrites unusual Unicode spacing and punctuation into plain ASCII.
 *
 * The substitutions run in a fixed order, each on the output of the previous:
 *  1. space-like code points become a regular space,
 *  2. dash/hyphen variants become `-`,
 *  3. curly single quotes and single primes become `'`,
 *  4. curly double quotes and double primes become `"`,
 *  5. zero-width joiner / non-joiner are deleted,
 *  6. every run of whitespace (tabs and newlines included) collapses to one space,
 * and finally leading/trailing whitespace is trimmed.
 *
 * @param text - The string to normalize.
 * @returns The normalized, trimmed string.
 */
export const normalizeText = (text: string): string => {
  // Ordered [pattern, replacement] pairs; order matters because the
  // whitespace-collapsing rule must see the spaces produced by earlier rules.
  const substitutions: ReadonlyArray<readonly [RegExp, string]> = [
    // NBSP, U+2000–U+200B (en/em/thin/hair spaces through zero-width space),
    // narrow NBSP, medium mathematical space and BOM -> regular space.
    [/[\u00A0\u2000-\u200B\u202F\u205F\uFEFF]/g, ' '],

    // U+2010–U+2015 (hyphen, en dash, em dash, horizontal bar, ...),
    // minus sign, two-em and three-em dashes -> ASCII hyphen.
    [/[\u2010-\u2015\u2212\u2E3A\u2E3B]/g, '-'],

    // Curly/reversed single quotes, prime and reversed prime -> straight apostrophe.
    [/[\u2018\u2019\u201B\u2032\u2035]/g, "'"],

    // Curly/reversed double quotes, double prime and reversed double prime -> straight double quote.
    [/[\u201C\u201D\u201F\u2033\u2036]/g, '"'],

    // Zero-width non-joiner and joiner are dropped entirely.
    [/[\u200C\u200D]/g, ''],

    // Any whitespace run shrinks to a single space.
    [/\s+/g, ' '],
  ];

  let normalized = text;
  for (const [pattern, replacement] of substitutions) {
    normalized = normalized.replace(pattern, replacement);
  }

  // Strip whatever whitespace remains at either end.
  return normalized.trim();
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters