From bdc74c22c6d1f81dc21cd159e4cce9b086187d09 Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Wed, 6 Oct 2021 15:09:57 +0200 Subject: [PATCH] dev: Support Legacy dictionary type Each line in the file will be split based upon simple code splitting rules. --- cspell.schema.json | 24 +++++++++++++++++ packages/cspell-lib/samples/words.txt | 1 + .../DictionaryLoader.test.ts | 8 ++++++ .../SpellingDictionary/DictionaryLoader.ts | 26 ++++++++++++++----- packages/cspell-types/cspell.schema.json | 24 +++++++++++++++++ .../cspell-types/src/CSpellSettingsDef.ts | 10 +++++++ 6 files changed, 87 insertions(+), 6 deletions(-) diff --git a/cspell.schema.json b/cspell.schema.json index 2c3c8e9eaf9..ba3d94be923 100644 --- a/cspell.schema.json +++ b/cspell.schema.json @@ -55,6 +55,11 @@ "$ref": "#/definitions/ReplaceMap", "description": "Replacement pairs" }, + "type": { + "$ref": "#/definitions/DictionaryFileTypes", + "default": "S", + "description": "Type of file: S - single word per line, W - each line can contain one or more words separated by space, C - each line is treated like code (Camel Case is allowed) Default is S C is the slowest to load due to the need to split each line based upon code splitting rules." + }, "useCompounds": { "description": "Use Compounds", "type": "boolean" @@ -108,6 +113,11 @@ ], "description": "Defines the scope for when words will be added to the dictionary. Scope values: `user`, `workspace`, `folder`" }, + "type": { + "$ref": "#/definitions/DictionaryFileTypes", + "default": "S", + "description": "Type of file: S - single word per line, W - each line can contain one or more words separated by space, C - each line is treated like code (Camel Case is allowed) Default is S C is the slowest to load due to the need to split each line based upon code splitting rules." + }, "useCompounds": { "description": "Use Compounds", "type": "boolean" @@ -143,6 +153,11 @@ "$ref": "#/definitions/ReplaceMap", "description": "Replacement pairs" }, + "type": { + "$ref": "#/definitions/DictionaryFileTypes", + "default": "S", + "description": "Type of file: S - single word per line, W - each line can contain one or more words separated by space, C - each line is treated like code (Camel Case is allowed) Default is S C is the slowest to load due to the need to split each line based upon code splitting rules." + }, "useCompounds": { "description": "Use Compounds", "type": "boolean" @@ -154,6 +169,15 @@ ], "type": "object" }, + "DictionaryFileTypes": { + "enum": [ + "S", + "W", + "C", + "T" + ], + "type": "string" + }, "DictionaryId": { "description": "This is the name of a dictionary.\n\nName Format:\n- Must contain at least 1 number or letter.\n- spaces are allowed.\n- Leading and trailing space will be removed.\n- Names ARE case-sensitive\n- Must not contain `*`, `!`, `;`, `,`, `{`, `}`, `[`, `]`, `~`", "pattern": "^(?=[^!*,;{}[\\]~\\n]+$)(?=(.*\\w)).+$", diff --git a/packages/cspell-lib/samples/words.txt b/packages/cspell-lib/samples/words.txt index f8e2dfb8428..6ea7aa5b9c9 100644 --- a/packages/cspell-lib/samples/words.txt +++ b/packages/cspell-lib/samples/words.txt @@ -7,3 +7,4 @@ cherry left-right Geschäft aujourd’hui +class:name diff --git a/packages/cspell-lib/src/SpellingDictionary/DictionaryLoader.test.ts b/packages/cspell-lib/src/SpellingDictionary/DictionaryLoader.test.ts index a9e8ceb6745..ddb7d70ccd7 100644 --- a/packages/cspell-lib/src/SpellingDictionary/DictionaryLoader.test.ts +++ b/packages/cspell-lib/src/SpellingDictionary/DictionaryLoader.test.ts @@ -70,13 +70,21 @@ describe('Validate DictionaryLoader', () => { function nfc(s: string): string { return s.normalize('NFC'); } + // cspell:ignore aujourd’hui const csharp = require.resolve('@cspell/dict-csharp/csharp.txt.gz'); test.each` testCase | file | options | word | maxAge | hasWord | hasErrors ${'sample words'} | ${sample('words.txt')} | ${{}} | ${'apple'} | ${1} | ${true} | ${false} + ${'sample words'} | ${sample('words.txt')} | ${{}} | ${'class:name'} | ${1} | ${true} | ${false} + ${'sample words'} | ${sample('words.txt')} | ${{}} | ${'left-right'} | ${1} | ${true} | ${false} ${'sample words'} | ${sample('words.txt')} | ${{ type: 5 }} | ${'apple'} | ${1} | ${true} | ${false} ${'sample words'} | ${sample('words.txt')} | ${{ type: 'S' }} | ${'pear'} | ${undefined} | ${true} | ${false} ${'sample words'} | ${sample('words.txt')} | ${{ type: 'C' }} | ${'strawberry'} | ${1} | ${true} | ${false} + ${'sample words'} | ${sample('words.txt')} | ${{ type: 'C' }} | ${'left-right'} | ${1} | ${false} | ${false} + ${'sample words'} | ${sample('words.txt')} | ${{ type: 'C' }} | ${'left'} | ${1} | ${true} | ${false} + ${'sample words'} | ${sample('words.txt')} | ${{ type: 'C' }} | ${'class:name'} | ${1} | ${false} | ${false} + ${'sample words'} | ${sample('words.txt')} | ${{ type: 'C' }} | ${'name'} | ${1} | ${true} | ${false} + ${'sample words'} | ${sample('words.txt')} | ${{ type: 'C' }} | ${'aujourd’hui'} | ${1} | ${true} | ${false} ${'sample words'} | ${sample('words.txt')} | ${{}} | ${'tree'} | ${1} | ${false} | ${false} ${'unknown loader'} | ${sample('words.txt')} | ${{ type: 5 }} | ${'apple'} | ${1} | ${true} | ${false} ${'sample words'} | ${sample('words.txt')} | ${{}} | ${'left-right'} | ${1} | ${true} | ${false} diff --git a/packages/cspell-lib/src/SpellingDictionary/DictionaryLoader.ts b/packages/cspell-lib/src/SpellingDictionary/DictionaryLoader.ts index 4b0b9c79b7c..98c93dec333 100644 --- a/packages/cspell-lib/src/SpellingDictionary/DictionaryLoader.ts +++ b/packages/cspell-lib/src/SpellingDictionary/DictionaryLoader.ts @@ -1,4 +1,4 @@ -import type { DictionaryDefinitionPreferred } from '@cspell/cspell-types'; +import type { DictionaryDefinitionPreferred, DictionaryFileTypes } from '@cspell/cspell-types'; import { stat } from 'fs-extra'; import * as path from 'path'; import { readLines } from '../util/fileReader'; @@ -6,12 +6,13 @@ import { createFailedToLoadDictionary, createSpellingDictionary } from './create import { SpellingDictionary } from './SpellingDictionary'; import { SpellingDictionaryLoadError } from './SpellingDictionaryError'; import { createSpellingDictionaryTrie } from './SpellingDictionaryFromTrie'; +import { genSequence } from 'gensequence'; const MAX_AGE = 10000; const loaders: Loaders = { S: loadSimpleWordList, - C: loadSimpleWordList, + C: legacyWordList, T: loadTrie, default: loadSimpleWordList, }; @@ -52,7 +53,7 @@ export function loadDictionary(uri: string, options: DictionaryDefinitionPreferr const importantOptionKeys: (keyof DictionaryDefinitionPreferred)[] = ['noSuggest', 'useCompounds']; function calcKey(uri: string, options: DictionaryDefinitionPreferred) { - const loaderType = determineType(uri); + const loaderType = determineType(uri, options); const optValues = importantOptionKeys.map((k) => options[k]?.toString() || ''); const parts = [uri, loaderType].concat(optValues); @@ -107,18 +108,31 @@ function loadEntry(uri: string, options: LoadOptions, now = Date.now()): CacheEn }; } -function determineType(uri: string): LoaderType { - const defType = uri.endsWith('.trie.gz') ? 'T' : uri.endsWith('.txt.gz') ? 'S' : 'S'; +function determineType(uri: string, opts: Pick): LoaderType { + const t: DictionaryFileTypes = (opts.type && opts.type in loaders && opts.type) || 'S'; + const defLoaderType = t as LoaderType; + const defType = uri.endsWith('.trie.gz') ? 'T' : uri.endsWith('.txt.gz') ? defLoaderType : defLoaderType; const regTrieTest = /\.trie\b/i; return regTrieTest.test(uri) ? 'T' : defType; } function load(uri: string, options: LoadOptions): Promise { - const type = determineType(uri); + const type = determineType(uri, options); const loader = loaders[type] || loaders.default; return loader(uri, options); } +async function legacyWordList(filename: string, options: LoadOptions) { + const lines = await readLines(filename); + const words = genSequence(lines) + // Remove comments + .map((line) => line.replace(/#.*/g, '')) + // Split on everything else + .concatMap((line) => line.split(/[^\w\p{L}\p{M}'’]+/gu)) + .filter((word) => !!word); + return createSpellingDictionary(words, determineName(filename, options), filename, options); +} + async function loadSimpleWordList(filename: string, options: LoadOptions) { const lines = await readLines(filename); return createSpellingDictionary(lines, determineName(filename, options), filename, options); diff --git a/packages/cspell-types/cspell.schema.json b/packages/cspell-types/cspell.schema.json index 2c3c8e9eaf9..ba3d94be923 100644 --- a/packages/cspell-types/cspell.schema.json +++ b/packages/cspell-types/cspell.schema.json @@ -55,6 +55,11 @@ "$ref": "#/definitions/ReplaceMap", "description": "Replacement pairs" }, + "type": { + "$ref": "#/definitions/DictionaryFileTypes", + "default": "S", + "description": "Type of file: S - single word per line, W - each line can contain one or more words separated by space, C - each line is treated like code (Camel Case is allowed) Default is S C is the slowest to load due to the need to split each line based upon code splitting rules." + }, "useCompounds": { "description": "Use Compounds", "type": "boolean" @@ -108,6 +113,11 @@ ], "description": "Defines the scope for when words will be added to the dictionary. Scope values: `user`, `workspace`, `folder`" }, + "type": { + "$ref": "#/definitions/DictionaryFileTypes", + "default": "S", + "description": "Type of file: S - single word per line, W - each line can contain one or more words separated by space, C - each line is treated like code (Camel Case is allowed) Default is S C is the slowest to load due to the need to split each line based upon code splitting rules." + }, "useCompounds": { "description": "Use Compounds", "type": "boolean" @@ -143,6 +153,11 @@ "$ref": "#/definitions/ReplaceMap", "description": "Replacement pairs" }, + "type": { + "$ref": "#/definitions/DictionaryFileTypes", + "default": "S", + "description": "Type of file: S - single word per line, W - each line can contain one or more words separated by space, C - each line is treated like code (Camel Case is allowed) Default is S C is the slowest to load due to the need to split each line based upon code splitting rules." + }, "useCompounds": { "description": "Use Compounds", "type": "boolean" @@ -154,6 +169,15 @@ ], "type": "object" }, + "DictionaryFileTypes": { + "enum": [ + "S", + "W", + "C", + "T" + ], + "type": "string" + }, "DictionaryId": { "description": "This is the name of a dictionary.\n\nName Format:\n- Must contain at least 1 number or letter.\n- spaces are allowed.\n- Leading and trailing space will be removed.\n- Names ARE case-sensitive\n- Must not contain `*`, `!`, `;`, `,`, `{`, `}`, `[`, `]`, `~`", "pattern": "^(?=[^!*,;{}[\\]~\\n]+$)(?=(.*\\w)).+$", diff --git a/packages/cspell-types/src/CSpellSettingsDef.ts b/packages/cspell-types/src/CSpellSettingsDef.ts index a78888a2282..dbe8753d42d 100644 --- a/packages/cspell-types/src/CSpellSettingsDef.ts +++ b/packages/cspell-types/src/CSpellSettingsDef.ts @@ -380,6 +380,16 @@ export interface DictionaryDefinitionBase { * possible suggestions. */ noSuggest?: boolean; + /** + * Type of file: + * S - single word per line, + * W - each line can contain one or more words separated by space, + * C - each line is treated like code (Camel Case is allowed) + * Default is S + * C is the slowest to load due to the need to split each line based upon code splitting rules. + * @default "S" + */ + type?: DictionaryFileTypes; } export interface DictionaryDefinitionPreferred extends DictionaryDefinitionBase {