Skip to content

Commit

Permalink
Normalise Unicode characters
Browse files Browse the repository at this point in the history
Convert hyphen and space variants to standard ASCII.
  • Loading branch information
mrbrianevans committed Oct 29, 2024
1 parent b9ce261 commit 38a044d
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 4 deletions.
7 changes: 5 additions & 2 deletions build.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import {parse} from 'yaml'
import {pascalCase} from 'change-case'
import {mkdir, rm, writeFile} from 'node:fs/promises'
import {basename} from 'node:path'
import {normalizeText} from "./normaliseText.ts";

// string transform to apply to the name of each constant mapping
const transform = pascalCase
Expand All @@ -15,7 +16,7 @@ const subdirs = []

for (const file of files) {
const exportName = transform(basename(file, '.yml'))
const fileUrl = `https://raw.githubusercontent.com/companieshouse/api-enumerations/master/${file}`
const fileUrl = `https://raw.githubusercontent.com/companieshouse/api-enumerations/refs/heads/master/${file}`

const contents = await fetch(fileUrl).then(res => res.text())
const constants: Record<string, Record<string, string>> = parse(contents)
Expand All @@ -25,7 +26,9 @@ for (const file of files) {
for (const [snake_name, mapping] of Object.entries(constants)) {
const name = transform(snake_name)
const filePath = `build/${exportName}/${name}`
const fileContent = `export const ${name} = ${JSON.stringify(mapping, null, 2)}`
const normalisedMapping = Object.fromEntries(Object.entries(mapping)
.map(([key, value]) => [key, normalizeText(value)]))
const fileContent = `export const ${name} = ${JSON.stringify(normalisedMapping, null, 2)}`
await writeFile(filePath + '.js', fileContent)

const tsContent = Object.keys(mapping).length < 50 ? `export declare const ${name}: {
Expand Down
26 changes: 26 additions & 0 deletions normaliseText.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Generated by Claude 3.5 Sonnet
// Replaces unusual Unicode characters with the standard ASCII equivalent
/**
 * Rewrites a string so that common Unicode spacing, dash and quote variants
 * become their plain ASCII counterparts.
 *
 * The substitutions run in a fixed order, after which whitespace runs are
 * collapsed and the result is trimmed.
 *
 * @param text - The raw string to clean up.
 * @returns The cleaned string with no leading or trailing whitespace.
 */
export const normalizeText = (text: string): string => {
  // Ordered rule table: each pattern is applied to the output of the previous one.
  const rules: ReadonlyArray<readonly [RegExp, string]> = [
    // Space variants (NBSP, U+2000..U+200B incl. zero-width space, narrow NBSP,
    // medium mathematical space, BOM) become a regular space.
    [/[\u00A0\u2000-\u200B\u202F\u205F\uFEFF]/g, ' '],
    // Hyphen/dash variants (U+2010..U+2015, minus sign, two-em and three-em dash)
    // become an ASCII hyphen.
    [/[\u2010-\u2015\u2212\u2E3A\u2E3B]/g, '-'],
    // Curly single quotes and single prime marks become a straight apostrophe.
    [/[\u2018\u2019\u201B\u2032\u2035]/g, "'"],
    // Curly double quotes and double prime marks become a straight double quote.
    [/[\u201C\u201D\u201F\u2033\u2036]/g, '"'],
    // Zero-width non-joiner and joiner are dropped entirely.
    [/[\u200C\u200D]/g, ''],
    // Any run of whitespace (including tabs and newlines) collapses to one space.
    [/\s+/g, ' '],
  ];

  let cleaned = text;
  for (const [pattern, replacement] of rules) {
    cleaned = cleaned.replace(pattern, replacement);
  }

  // Strip whatever whitespace is left at either end.
  return cleaned.trim();
};
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"dependencies": {},
"repository": {
"type": "git",
"url": "https://github.com/OptimalCompliance/ch-constants.git"
}
"url": "git+https://github.com/OptimalCompliance/ch-constants.git"
},
"keywords": ["companies-house", "sic-codes"]
}

0 comments on commit 38a044d

Please sign in to comment.