Skip to content

Commit

Permalink
fix: Fix issue with missing suggestions (#1339)
Browse files Browse the repository at this point in the history
* fix: Fix an issue with suggestions getting lost
  Some suggestions would get lost especially when the first character did not match.
* fix: make sure accents are preserved when adding words to dictionary
* fix: preserve legacy behavior on non-case sensitive dictionaries
* dev: update django snapshot
  • Loading branch information
Jason3S authored Jun 13, 2021
1 parent a948808 commit cfecde8
Show file tree
Hide file tree
Showing 19 changed files with 481 additions and 221 deletions.
38 changes: 38 additions & 0 deletions cspell.code-workspace
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,44 @@
"path": "./packages/cspell"
}
],
"launch": {
"configurations": [
{
"type": "node",
"request": "launch",
"name": "Test: Jest current-file",
"program": "${fileWorkspaceFolder}/node_modules/.bin/jest",
"cwd": "${fileDirname}",
"args": [
"--runInBand",
"${fileBasename}"
],
"console": "integratedTerminal",
"internalConsoleOptions": "neverOpen",
"disableOptimisticBPs": true,
"windows": {
"program": "${fileWorkspaceFolder}/node_modules/jest/bin/jest",
}
},
{
"type": "node",
"request": "launch",
"name": "Test: Jest Entire Folder",
"program": "${fileWorkspaceFolder}/node_modules/.bin/jest",
"cwd": "${fileWorkspaceFolder}",
"args": [
"--runInBand"
],
"console": "integratedTerminal",
"internalConsoleOptions": "neverOpen",
"disableOptimisticBPs": true,
"windows": {
"program": "${fileWorkspaceFolder}/node_modules/jest/bin/jest",
}
}
],
"compounds": []
},
"settings": {
"typescript.tsdk": "cspell-monorepo/node_modules/typescript/lib",
"cSpell.customWorkspaceDictionaries": [
Expand Down
2 changes: 1 addition & 1 deletion integration-tests/config/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@
{
"path": "django/django",
"url": "https://github.com/django/django.git",
"commit": "8b4983cfd429e17c8092f4ff775915327effa6fa",
"commit": "316cc34d046ad86e100227772294f906fae1c2e5",
"args": [
"**/*.{md,py}"
]
Expand Down
15 changes: 8 additions & 7 deletions integration-tests/snapshots/django/django/snapshot.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Repository: django/django
Url: "https://github.com/django/django.git"
Args: ["**/*.{md,py}"]
Lines:
CSpell: Files checked: 2128, Issues found: 7885 in 906 files
CSpell: Files checked: 2128, Issues found: 7886 in 907 files
exit code: 1
./django/apps/config.py:128:76 - Unknown word (isclass) -- getmembers(mod, inspect.isclass)
./django/apps/registry.py:132:40 - Unknown word (unconfigured) -- not ready" is due to unconfigured settings, accessing
Expand Down Expand Up @@ -440,7 +440,7 @@ Lines:
./django/core/management/commands/shell.py:23:97 - Unknown word (pythonrc) -- environment variable and ~/.pythonrc.py script.',
./django/core/management/commands/shell.py:48:32 - Unknown word (rlcompleter) -- try: # Try activating rlcompleter, because it's handy
./django/core/management/commands/shell.py:75:45 - Unknown word (cpython) -- the behavior of the cpython shell where an error
./django/core/management/commands/showmigrations.py:130:19 - Unknown word (deps) -- def print_deps(node):
./django/core/management/commands/showmigrations.py:137:19 - Unknown word (deps) -- def print_deps(node):
./django/core/management/commands/squashmigrations.py:118:13 - Unknown word (smigration) -- for smigration in migrations_to_squash
./django/core/management/templates.py:258:28 - Unknown word (mkdtemp) -- tempdir = tempfile.mkdtemp(prefix=prefix, suffix
./django/core/management/templates.py:339:38 - Unknown word (IMODE) -- permissions = stat.S_IMODE(st.st_mode) | stat.S
Expand Down Expand Up @@ -489,7 +489,7 @@ Lines:
./django/db/backends/base/operations.py:459:18 - Unknown word (autopk) -- def validate_autopk_value(self, value):
./django/db/backends/base/operations.py:522:15 - Unknown word (ipaddressfield) -- def adapt_ipaddressfield_value(self, value):
./django/db/backends/base/schema.py:158:16 - Unknown word (sqls) -- column_sqls = []
./django/db/backends/base/schema.py:807:9 - Unknown word (rels) -- rels_to_update = []
./django/db/backends/base/schema.py:814:9 - Unknown word (rels) -- rels_to_update = []
./django/db/backends/mysql/base.py:196:14 - Unknown word (conv) -- 'conv': django_conversions
./django/db/backends/mysql/base.py:4:10 - Unknown word (mysqlclient) -- Requires mysqlclient: https://pypi.org/project
./django/db/backends/mysql/base.py:55:29 - Unknown word (reraises) -- exception instances and reraises them with the correct
Expand Down Expand Up @@ -1466,6 +1466,7 @@ Lines:
./tests/humanize_tests/tests.py:346:24 - Unknown word (lety) -- 'před 2\xa0lety, 4\xa0měsíci',
./tests/humanize_tests/tests.py:352:22 - Unknown word (roky) -- 'za 2\xa0roky, 4\xa0měsíce',
./tests/i18n/commands/__init__.py:12:49 - Unknown word (Foos) -- s Foo", "%(number)s Foos", number) % {'number
./tests/i18n/contenttypes/tests.py:25:57 - Unknown word (Société) -- company_type), 'i18n | Société')
./tests/i18n/patterns/tests.py:134:65 - Unknown word (vertaald) -- prefix-translated'), '/vertaald/')
./tests/i18n/patterns/tests.py:146:53 - Unknown word (gebruikers) -- reverse('users'), '/nl/gebruikers/')
./tests/i18n/patterns/tests.py:157:81 - Unknown word (profiel) -- register/', 'nl'), '/nl/profiel/registreren/')
Expand Down Expand Up @@ -1823,16 +1824,16 @@ Lines:
./tests/responses/tests.py:140:19 - Unknown word (textiowrapper) -- def test_wrap_textiowrapper(self):
./tests/responses/tests.py:28:38 - Unknown word (qwer) -- writelines(['asdf\n', 'qwer\n'])
./tests/responses/tests.py:76:34 - Unknown word (J'attendrai) -- HttpResponse(status="J'attendrai")
./tests/runtests.py:129:9 - Unknown word (subdirs) -- subdirs_to_skip = SUBDIRS_TO
./tests/runtests.py:301:40 - Unknown word (finalizer) -- multiprocessing.util finalizer that tries to remove
./tests/runtests.py:130:9 - Unknown word (subdirs) -- subdirs_to_skip = SUBDIRS_TO
./tests/runtests.py:319:40 - Unknown word (finalizer) -- multiprocessing.util finalizer that tries to remove
./tests/runtests.py:3:8 - Unknown word (atexit) -- import atexit
./tests/runtests.py:80:1 - Unknown word (SUBDIRS) -- SUBDIRS_TO_SKIP = {
./tests/runtests.py:81:1 - Unknown word (SUBDIRS) -- SUBDIRS_TO_SKIP = {
./tests/schema/models.py:156:21 - Unknown word (INTEGERPK) -- db_table = "INTEGERPK" # uppercase to ensure
./tests/schema/tests.py:1596:35 - Unknown word (tagm) -- assertEqual(columns['tagm2mtest_id'][0], connection
./tests/schema/tests.py:1596:40 - Unknown word (mtest) -- assertEqual(columns['tagm2mtest_id'][0], connection
./tests/schema/tests.py:246:48 - Unknown word (repointing) -- out of FK order, then repointing, works"
./tests/schema/tests.py:3176:81 - Unknown word (upcase) -- and, for Oracle, un-upcase
./tests/schema/tests.py:3721:27 - Unknown word (dtob) -- self.assertNotIn("dtob_auto_now", columns)
./tests/schema/tests.py:3748:27 - Unknown word (dtob) -- self.assertNotIn("dtob_auto_now", columns)
./tests/select_for_update/tests.py:142:44 - Unknown word (eucountry) -- ['select_for_update_eucountry"."country_ptr_id']
./tests/select_for_update/tests.py:170:36 - Unknown word (eucity) -- 'select_for_update_eucity"."id',
./tests/select_for_update/tests.py:323:14 - Unknown word (unsuported) -- def test_unsuported_no_key_raises_error
Expand Down
37 changes: 37 additions & 0 deletions packages/cspell-lib/docs/accents.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Case sensitivity and Accents

## Problem

Many programming languages have their own casing conventions. Things like `camelCase` or Title case for ClassNames might run counter to the proper casing of words.
For example, `english` in the variable name `list_of_english_words` would be considered incorrect because it is a proper name.

The other challenge is accents. Many programming languages do not allow accented character as variable or class names.

## Approach

Beginning with cspell version 5, the spell checker allows for case sensitive checking. This is achieved by having two effective dictionaries with every word list.

Let take a look at how the German word for business, `Geschäft`, is stored.

There will be three forms: `Geschäft`, `~geschäft`, and `~geschaft`. Words prefixed with `~` that is is a case insensitive version of the word.

Result when spell checking various forms of `Geschäft`:

| word | cs<sup>1</sup> | non<sup>2</sup> |
| ---------- | -------------- | --------------- |
| `Geschäft` |||
| `GESCHÄFT` |||
| `Geschaft` |||
| `geschäft` |||
| `geschaft` |||
| `GESCHAFT` |||
| `gescháft` |||
| `Gescháft` |||

<sup>1</sup>cs - case sensitive

<sup>2</sup>non - not case sensitive

Note: If accents are present in a word, they must match the accents in the original word.

<!--- cspell:words Geschäft gescháft --->
171 changes: 80 additions & 91 deletions packages/cspell-lib/src/SpellingDictionary/SpellingDictionary.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import { __testMethods } from './SpellingDictionaryMethods';
import { createSpellingDictionary } from './createSpellingDictionary';
import { SpellingDictionaryFromTrie } from './SpellingDictionaryFromTrie';
import { Trie } from 'cspell-trie-lib';
import { FunctionArgs } from '../util/types';

// cSpell:ignore aple

Expand Down Expand Up @@ -92,8 +91,9 @@ describe('Verify building Dictionary', () => {
const trie = Trie.create(words);
const dict = new SpellingDictionaryFromTrie(trie, 'trie');
// cspell:ignore cattles
const suggestions = dict.suggest('Cattles').map(({ word }) => word);
expect(suggestions[0]).toBe('cattle');
const results = dict.suggest('Cattles');
const suggestions = results.map(({ word }) => word);
expect(suggestions).toEqual(['cattle', 'battles', 'rattles', 'tattles', 'battle', 'rattle']);
expect(suggestions).toEqual(expect.not.arrayContaining(['banana']));
});

Expand All @@ -111,107 +111,96 @@ describe('Verify building Dictionary', () => {
expect(suggestions).toEqual(expect.not.arrayContaining(['banana']));
});

test('wordDictionaryFormsCollector', () => {
function testCase(word: string, isCaseSensitive: boolean, expected: string[]) {
const collector = __testMethods.wordDictionaryFormsCollector(isCaseSensitive);
type Test = [string, boolean, string[]];
// cspell:ignore café
const tests: Test[] = [
['house', false, ['house']],
['House', false, ['House', 'house']],
['café', false, ['cafe', 'café']],
['Café', false, ['Cafe', 'Café', 'cafe', 'café']],
['House', true, ['House', '~house']],
['HOUSE', true, ['HOUSE', '~house']],
['Café', true, ['Café', '~Cafe', '~cafe', '~café']],
// Make sure all accent forms work.
['café'.normalize(), false, ['cafe', 'café']],
['café'.normalize('NFD'), false, ['cafe', 'café']],
['café'.normalize('NFKC'), false, ['cafe', 'café']],
['café'.normalize('NFKD'), false, ['cafe', 'café']],
];
test.each(tests)(
'wordDictionaryFormsCollector %s %o %o',
(word: string, isCaseSensitive: boolean, expected: string[]) => {
const collector = __testMethods.wordDictionaryFormsCollector(isCaseSensitive ? '~' : '');
expect([...collector(word)].sort()).toEqual(expected.sort());
expect([...collector(word)]).toEqual([]);
}
type Test = [string, boolean, string[]];
// cspell:ignore café
const tests: Test[] = [
['house', false, ['house']],
['House', false, ['House', 'house']],
['café', false, ['cafe', 'café']],
['Café', false, ['Cafe', 'Café', 'cafe', 'café']],
['House', true, ['House', '>house']],
['HOUSE', true, ['HOUSE', '>house']],
['Café', true, ['Café', '>Cafe', '>cafe', '>café']],
// Make sure all accent forms work.
['café'.normalize(), false, ['cafe', 'café']],
['café'.normalize('NFD'), false, ['cafe', 'café']],
['café'.normalize('NFKC'), false, ['cafe', 'café']],
['café'.normalize('NFKD'), false, ['cafe', 'café']],
];
tests.forEach((t) => testCase(...t));
});
);
});

describe('Validate wordSearchForms', () => {
function testCase(word: string, isCaseSensitive: boolean, ignoreCase: boolean, expected: string[]) {
test(`${word} ${isCaseSensitive} ${ignoreCase} ${expected}`, () => {
const words = __testMethods.wordSearchFormsArray(word, isCaseSensitive, ignoreCase);
expect(words.sort()).toEqual(expected.sort());
});
}
type TestCase = FunctionArgs<typeof testCase>;
// cspell:ignore café
const tests: TestCase[] = [
// word, dic is case sensitive, ignoreCase on lookup, expected
['house', false, false, ['house']],
['House', false, false, ['house']],
['House', false, false, ['house']],
['House', true, false, ['House', 'house']],
['HOUSE', false, false, ['house']],
['HOUSE', true, false, ['HOUSE', 'House', 'house']],
['café', false, false, ['café']],
['café', true, false, ['café']],
['café', true, true, ['cafe']],
['Café', false, false, ['café']],
['Café', false, true, ['cafe', 'café']],
['Café', true, false, ['Café', 'café']],
['Café', true, true, ['cafe']],
['CAFÉ', false, false, ['café']],
['CAFÉ', false, true, ['cafe', 'café']],
['CAFÉ', true, false, ['CAFÉ', 'Café', 'café']],
['CAFÉ', true, true, ['cafe']],
// Make sure all accent forms work.
['café'.normalize(), false, false, ['café']],
['café'.normalize('NFD'), false, false, ['café']],
['café'.normalize('NFKC'), false, false, ['café']],
['café'.normalize('NFKD'), false, false, ['café']],
];
tests.forEach((t) => testCase(...t));
test.each`
word | isCaseSensitive | ignoreCase | expected
${'house'} | ${false} | ${false} | ${['house']}
${'House'} | ${false} | ${false} | ${['house']}
${'House'} | ${false} | ${false} | ${['house']}
${'House'} | ${true} | ${false} | ${['House', 'house']}
${'HOUSE'} | ${false} | ${false} | ${['house']}
${'HOUSE'} | ${true} | ${false} | ${['HOUSE', 'House', 'house']}
${'café'} | ${false} | ${false} | ${['cafe', 'café']}
${'café'} | ${true} | ${false} | ${['café']}
${'café'} | ${true} | ${true} | ${['café']}
${'Café'} | ${false} | ${false} | ${['cafe', 'café']}
${'Café'} | ${false} | ${true} | ${['cafe', 'café']}
${'Café'} | ${true} | ${false} | ${['Café', 'café']}
${'Café'} | ${true} | ${true} | ${['café']}
${'CAFÉ'} | ${false} | ${false} | ${['cafe', 'café']}
${'CAFÉ'} | ${false} | ${true} | ${['cafe', 'café']}
${'CAFÉ'} | ${true} | ${false} | ${['CAFÉ', 'Café', 'café']}
${'CAFÉ'} | ${true} | ${true} | ${['café']}
${'café'.normalize()} | ${false} | ${false} | ${['cafe', 'café']}
${'café'.normalize('NFD')} | ${false} | ${false} | ${['cafe', 'café']}
${'café'.normalize('NFKC')} | ${false} | ${false} | ${['cafe', 'café']}
${'café'.normalize('NFKD')} | ${false} | ${false} | ${['cafe', 'café']}
`('$word $isCaseSensitive $ignoreCase $expected', ({ word, isCaseSensitive, ignoreCase, expected }) => {
const words = __testMethods.wordSearchFormsArray(word, isCaseSensitive, ignoreCase);
expect(words.sort()).toEqual(expected.sort());
});
});

describe('Verify Case Sensitive Dictionaries', () => {
function testHas(word: string, ignoreCase: boolean | undefined, expected: boolean) {
test(`Has ${word} Case: ${ignoreCase} Exp: ${expected}`, async () => {
const dict = await sampleDict();
expect(dict.has(word, { ignoreCase })).toBe(expected);
});
}

const tests: FunctionArgs<typeof testHas>[] = [
['Paris', undefined, true],
['PARIS', undefined, true],
['paris', undefined, true],
['Paris', true, true],
['PARIS', true, true],
['paris', true, true],
['Paris', false, true],
['PARIS', false, true],
['paris', false, false],
['Köln', false, true],
['köln', false, false],
['KÖLN', false, true],
];
tests.forEach((t) => testHas(...t));

test('Suggestions 1', async () => {
// cspell:ignore koln
const dict = await sampleDict();
const sugs = dict.suggest('kuln'); // cspell:disable-line
const sugWords = sugs.map((s) => s.word);
expect(sugWords).toEqual(['Köln']);
test.each`
word | ignoreCase | expected
${'Paris'} | ${undefined} | ${true}
${'PARIS'} | ${undefined} | ${true}
${'paris'} | ${undefined} | ${true}
${'Paris'} | ${true} | ${true}
${'PARIS'} | ${true} | ${true}
${'paris'} | ${true} | ${true}
${'Paris'} | ${false} | ${true}
${'PARIS'} | ${false} | ${true}
${'paris'} | ${false} | ${false}
${'Köln'} | ${false} | ${true}
${'köln'} | ${false} | ${false}
${'KÖLN'} | ${false} | ${true}
`(`Has $word Case: $ignoreCase Exp: $expected`, ({ word, ignoreCase, expected }) => {
const dict = sampleDict();
expect(dict.has(word, { ignoreCase })).toBe(expected);
});

test('Suggestions 2', async () => {
// cspell:ignore kuln
test.each`
word | ignoreCase | expected
${'köln'} | ${false} | ${['Köln']}
${'köln'} | ${true} | ${['köln', 'koln', 'Köln']}
${'koln'} | ${true} | ${['koln', 'köln', 'Köln']}
${'kuln'} | ${false} | ${['Köln']}
${'kuln'} | ${true} | ${['koln', 'köln', 'Köln']}
`('Suggestions for $word $ignoreCase $expected', ({ word, ignoreCase, expected }) => {
// cspell:ignore koln
const dict = await sampleDict();
const sugs = dict.suggest('kuln', { ignoreCase: false }); // cspell:disable-line
const dict = sampleDict();
const sugs = dict.suggest(word, { ignoreCase });
const sugWords = sugs.map((s) => s.word);
expect(sugWords).toEqual(['Köln']);
expect(sugWords).toEqual(expected);
});
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ export interface SpellingDictionary {
word: string,
numSuggestions?: number,
compoundMethod?: CompoundWordsMethod,
numChanges?: number
numChanges?: number,
ignoreCase?: boolean
): SuggestionResult[];
suggest(word: string, suggestOptions: SuggestOptions): SuggestionResult[];
genSuggestions(collector: SuggestionCollector, suggestOptions: SuggestOptions): void;
Expand Down
Loading

0 comments on commit cfecde8

Please sign in to comment.