Add spellchecking #51

Draft: wants to merge 16 commits into main
2 changes: 1 addition & 1 deletion bench/__init__.py
@@ -34,7 +34,6 @@ def __init__(self, lang_code: str, ids: Set[str]):
lt_code, remote_server="http://localhost:8081/"
)
self.tool.disabled_rules = {
"MORFOLOGIK_RULE_EN_US",
"GERMAN_SPELLER_RULE",
"COMMA_PARENTHESIS_WHITESPACE",
"DOUBLE_PUNCTUATION",
@@ -117,6 +116,7 @@ class NLPRule:
def __init__(self, lang_code: str):
self.tokenizer = nlprule.Tokenizer(f"storage/{lang_code}_tokenizer.bin")
self.rules = nlprule.Rules(f"storage/{lang_code}_rules.bin", self.tokenizer)
self.rules.spell.options.variant = "en_US"

def suggest(self, sentence: str) -> Set[Suggestion]:
suggestions = {
1 change: 1 addition & 0 deletions build/README.md
@@ -79,6 +79,7 @@ python build/make_build_dir.py \
--chunker_token_model=$HOME/Downloads/nlprule/en-token.bin \
--chunker_pos_model=$HOME/Downloads/nlprule/en-pos-maxent.bin \
--chunker_chunk_model=$HOME/Downloads/nlprule/en-chunker.bin \
--spell_map_path=$LT_PATH/org/languagetool/rules/en/contractions.txt \
--out_dir=data/en
```

112 changes: 109 additions & 3 deletions build/make_build_dir.py
@@ -6,6 +6,7 @@
from zipfile import ZipFile
import lxml.etree as ET
import wordfreq
from glob import glob
from chardet.universaldetector import UniversalDetector

from chunker import write_chunker # type: ignore
@@ -59,7 +60,7 @@ def copy_lt_files(out_dir, lt_dir, lang_code):
canonicalize(out_dir / xmlfile)


def dump_dictionary(out_path, lt_dir, tag_dict_path, tag_info_path):
def dump_dict(out_path, lt_dir, tag_dict_path, tag_info_path):
# dump dictionary, see https://dev.languagetool.org/developing-a-tagger-dictionary
os.system(
f"java -cp {lt_dir / 'languagetool.jar'} org.languagetool.tools.DictionaryExporter "
@@ -83,7 +84,46 @@ def dump_dictionary(out_path, lt_dir, tag_dict_path, tag_info_path):
dump_bytes = open(out_path, "rb").read()

with open(out_path, "w") as f:
f.write(dump_bytes.decode(result["encoding"]))
f.write(dump_bytes.decode(result["encoding"] or "utf-8"))


def proc_spelling_text(in_paths, out_path, lang_code):
with open(out_path, "w") as f:
for in_path in in_paths:
if in_path.exists():
for line in open(in_path):
# strip comments
comment_index = line.find("#")
if comment_index != -1:
line = line[:comment_index]

line = line.strip()
if len(line) == 0:
continue

try:
word, suffix = line.split("/")

assert lang_code == "de", "Flags are only supported for German!"

for flag in suffix:
assert flag != "Ä"
if flag == "A" and word.endswith("e"):
flag = "Ä"

f.write(word + "\n")

for ending in {
"S": ["s"],
"N": ["n"],
"E": ["e"],
"F": ["in"],
"A": ["e", "er", "es", "en", "em"],
"Ä": ["r", "s", "n", "m"],
Comment on lines +116 to +122
@drahnr (Contributor), Mar 18, 2021:

Use the language-specific affix file instead of hardcoding it; it saves you and a few other people a lot of pain :)

}[flag]:
f.write(word + ending + "\n")
except ValueError:
f.write(line + "\n")


if __name__ == "__main__":
@@ -138,6 +178,12 @@ def dump_dictionary(out_path, lt_dir, tag_dict_path, tag_info_path):
default=None,
help="Path to the OpenNLP chunker binary. See token model message for details.",
)
parser.add_argument(
"--spell_map_path",
default=None,
action="append",
help="Paths to files containing a mapping from incorrect words to correct ones e.g. contractions.txt for English.",
)
parser.add_argument(
"--out_dir",
type=lambda p: Path(p).absolute(),
@@ -149,12 +195,72 @@ def dump_dictionary(out_path, lt_dir, tag_dict_path, tag_info_path):

write_freqlist(open(args.out_dir / "common.txt", "w"), args.lang_code)
copy_lt_files(args.out_dir, args.lt_dir, args.lang_code)
dump_dictionary(

# tagger dictionary
dump_dict(
args.out_dir / "tags" / "output.dump",
args.lt_dir,
args.tag_dict_path,
args.tag_info_path,
)

# spell dictionaries
(args.out_dir / "spell").mkdir()
for dic in glob(
str(
args.lt_dir
/ "org"
/ "languagetool"
/ "resource"
/ args.lang_code
/ "hunspell"
/ "*.dict"
)
):
dic = Path(dic)
info = Path(dic).with_suffix(".info")

variant_name = dic.stem

dump_dict(
args.out_dir / "spell" / f"{variant_name}.dump", args.lt_dir, dic, info,
)
proc_spelling_text(
[
(
dic / ".." / ("spelling_" + variant_name.replace("_", "-") + ".txt")
).resolve(),
(
dic / ".." / ("spelling-" + variant_name.replace("_", "-") + ".txt")
).resolve(),
],
args.out_dir / "spell" / f"{variant_name}.txt",
args.lang_code,
)

proc_spelling_text(
[
args.lt_dir
/ "org"
/ "languagetool"
/ "resource"
/ args.lang_code
/ "hunspell"
/ "spelling.txt"
],
args.out_dir / "spell" / "spelling.txt",
args.lang_code,
)

with open(args.out_dir / "spell" / "map.txt", "w") as f:
for path in args.spell_map_path or []:
for line in open(path):
if line.startswith("#"):
continue

assert "#" not in line
f.write(line)

if (
args.chunker_token_model is not None
and args.chunker_pos_model is not None
9 changes: 5 additions & 4 deletions build/src/lib.rs
@@ -5,7 +5,7 @@ use flate2::bufread::GzDecoder;
use fs::File;
use fs_err as fs;
use nlprule::{compile, rules_filename, tokenizer_filename};
use std::fs::Permissions;
use std::{fs::Permissions, sync::Arc};
use std::{
io::{self, BufReader, BufWriter, Cursor, Read},
path::{Path, PathBuf},
@@ -469,10 +469,11 @@ impl BinaryBuilder {
let tokenizer_out = self.out_dir.join(tokenizer_filename(lang_code));
let rules_out = self.out_dir.join(rules_filename(lang_code));

nlprule::Rules::new(rules_out)
.map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Rules, e))?;
nlprule::Tokenizer::new(tokenizer_out)
let tokenizer = nlprule::Tokenizer::new(tokenizer_out)
.map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Tokenizer, e))?;

nlprule::Rules::new(rules_out, Arc::new(tokenizer))
.map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Rules, e))?;
}

Ok(())
2 changes: 2 additions & 0 deletions nlprule/Cargo.toml
@@ -30,6 +30,8 @@ half = { version = "1.7", features = ["serde"] }
srx = { version = "^0.1.2", features = ["serde"] }
lazycell = "1"
cfg-if = "1"
fnv = "1"
unicode_categories = "0.1"

rayon-cond = "0.1"
rayon = "1.5"
1 change: 1 addition & 0 deletions nlprule/build.rs
@@ -20,6 +20,7 @@ fn main() {
("tokenizer.json", "tokenizer_configs.json"),
("rules.json", "rules_configs.json"),
("tagger.json", "tagger_configs.json"),
("spellchecker.json", "spellchecker_configs.json"),
] {
let mut config_map: HashMap<String, serde_json::Value> = HashMap::new();

8 changes: 8 additions & 0 deletions nlprule/configs/de/spellchecker.json
@@ -0,0 +1,8 @@
{
"variants": [
"de_AT",
"de_DE",
"de_CH"
],
"split_hyphens": true
}
3 changes: 2 additions & 1 deletion nlprule/configs/en/rules.json
@@ -3,5 +3,6 @@
"ignore_ids": [
"GRAMMAR/PRP_MD_NN/2",
"TYPOS/VERB_APOSTROPHE_S/3"
]
],
"split_hyphens": true
}
8 changes: 8 additions & 0 deletions nlprule/configs/en/spellchecker.json
@@ -0,0 +1,8 @@
{
"variants": [
"en_GB",
"en_US",
"en_AU"
],
"split_hyphens": true
}
4 changes: 4 additions & 0 deletions nlprule/configs/es/spellchecker.json
@@ -0,0 +1,4 @@
{
"variants": [],
"split_hyphens": true
}
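
For orientation, a rough sketch of how these per-language spellchecker configs could be deserialized on the Rust side; the struct name and serde wiring are assumptions for illustration, not code from this PR:

```rust
// Hypothetical mirror of the spellchecker.json files above; the actual
// nlprule config types may be named and structured differently.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct SpellcheckerConfig {
    /// Language variants for which a spell dictionary is built, e.g. "en_GB".
    variants: Vec<String>,
    /// Whether hyphenated compounds are split and checked part by part.
    split_hyphens: bool,
}

fn main() -> Result<(), serde_json::Error> {
    let raw = r#"{ "variants": ["en_GB", "en_US", "en_AU"], "split_hyphens": true }"#;
    let config: SpellcheckerConfig = serde_json::from_str(raw)?;
    println!("{:?}", config);
    Ok(())
}
```
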
9 changes: 6 additions & 3 deletions nlprule/src/bin/run.rs
@@ -1,3 +1,5 @@
use std::sync::Arc;

use clap::Clap;
use nlprule::{rules::Rules, tokenizer::Tokenizer};

@@ -18,11 +20,12 @@ fn main() {
env_logger::init();
let opts = Opts::parse();

let tokenizer = Tokenizer::new(opts.tokenizer).unwrap();
let rules = Rules::new(opts.rules).unwrap();
let tokenizer = Arc::new(Tokenizer::new(opts.tokenizer).unwrap());
let mut rules = Rules::new(opts.rules, tokenizer.clone()).unwrap();
rules.spell_mut().options_mut().variant = Some(rules.spell().variant("en_GB").unwrap());
Contributor (@drahnr):

I find this very very un-ergonomic API.

A few particular points to avoid:

  • mutable options in public API
  • structs in public lib API
  • circular borrow fun (need rules.spell_mut() and rules.spell() at the same time)
  • combine artifacts that don't seem necessarily connected (spellcheck rules vs grammar / tokenizer rules)

A more rustic API imho would be:

// explicit type annotations for clarity
let spellcheck: Spellcheck = Spellcheck::new(path_to_spellcheck_rules)?;
// alt:
let spellcheck: Spellcheck = Spellcheck::from_reader(fs::File::open(path_to_spellcheck_rules)?)?;
spellcheck.add_hunspell_dict

let previous: Option<Spellcheck> = rules.set_spell(spellcheck);
// and
rules.modify_spellcheck(move |spellcheck| Ok(spellcheck.extend_whitelist(words_iter)))?

Was there a particular reason not to split the spelling rules from the tokenization rules? A third file would not hurt and would avoid (cargo-spellcheck 🎩) issues with deployment size - 900kB extra is a lot when the upper limit is 10MB.

Owner Author (@bminixhofer):

I'll think about this. In my opinion a mutable SpellOptions struct where all mutations are guaranteed to be valid and the spellchecker is updated when it is dropped is more ergonomic than something like rules.modify_spellcheck(move |spellcheck| Ok(spellcheck.extend_whitelist(words_iter)))?. E.g. it seamlessly allows all HashSet operations on the whitelist, otherwise that would have to be reimplemented as *_whitelist methods.
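
(For illustration only: a minimal, self-contained sketch of the drop-guard idea described above; all names and fields are stand-ins, not the actual nlprule types.)

```rust
use std::collections::HashSet;
use std::ops::{Deref, DerefMut};

#[derive(Default)]
struct SpellOptions {
    variant: Option<String>,
    whitelist: HashSet<String>,
}

#[derive(Default)]
struct Spell {
    options: SpellOptions,
    // ... internal lookup structures derived from `options` would live here ...
}

// Guard handed out by `options_mut()`; it dereferences to the plain options
// struct and refreshes the spellchecker's derived state exactly once on drop.
struct SpellOptionsGuard<'a> {
    spell: &'a mut Spell,
}

impl Spell {
    fn options(&self) -> &SpellOptions {
        &self.options
    }

    fn options_mut(&mut self) -> SpellOptionsGuard<'_> {
        SpellOptionsGuard { spell: self }
    }

    fn refresh(&mut self) {
        // rebuild whatever caches depend on the options
    }
}

impl Deref for SpellOptionsGuard<'_> {
    type Target = SpellOptions;
    fn deref(&self) -> &SpellOptions {
        &self.spell.options
    }
}

impl DerefMut for SpellOptionsGuard<'_> {
    fn deref_mut(&mut self) -> &mut SpellOptions {
        &mut self.spell.options
    }
}

impl Drop for SpellOptionsGuard<'_> {
    fn drop(&mut self) {
        self.spell.refresh();
    }
}

fn main() {
    let mut spell = Spell::default();
    // Plain HashSet operations on the whitelist; consistency is restored
    // when the guard drops at the end of each statement.
    spell.options_mut().whitelist.insert("nlprule".into());
    spell.options_mut().variant = Some("en_GB".into());
    assert!(spell.options().whitelist.contains("nlprule"));
}
```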

> mutable options in public API

Do you have some more information / source on this? I wouldn't have thought that this is bad.

> circular borrow fun (need rules.spell_mut() and rules.spell() at the same time)

This wouldn't compile; you need the speller to get a Variant (the Variant is owned, it doesn't need the spellchecker), then you can use the Variant to set the options. But I'm also not perfectly happy with that part.

> combine artifacts that don't seem necessarily connected (spellcheck rules vs grammar / tokenizer rules)

This is a very valid point. The reason is that .correct() and .suggest() on the Rules should easily be configurable to do both, grammar and spelling correction. The best fix would probably be some meta-struct to combine any amount of suggestion providers (where Rules and Spellcheck are each one suggestion provider) which is strongly related to #50. Defining and implementing that would take significantly more work than I want to do before nlprule can do spellchecking 🙂

> A third file would not hurt

A third binary and a function Rules.set_spell really do seem like a good idea. But I would argue that it decreases ergonomics, at least as long as the Spell structure is still part of the Rules.

> when the upper limit is 10MB.

I'd just ask the crates.io team to increase the size limit so you don't have to worry about this (especially since (I think) you distribute some hunspell data as well). They do it on a case-by-case basis (rust-lang/crates.io#195). I understand the concern, but I'd argue that it is already quite impressive to store several hundred thousand valid words + lemmas + part-of-speech tags, rules, a noun chunker etc. in < 10MB. An extra 900kB for spellchecking should very rarely be an issue.

@bminixhofer (Owner Author), Mar 20, 2021:

There's one alternative API that moves some of the changes anticipated for #50 into this PR without going completely out of scope:

A new trait Suggest:

trait Suggest {
    fn suggest(&self, text: &str, tokenizer: &Tokenizer) -> Vec<Suggestion>; // or iterator
    fn correct(&self, text: &str, tokenizer: &Tokenizer) -> String {
        // default impl to just apply all the suggestions
    }
}

then Rules and a new separate struct Spell implement Suggest. The spell struct implements from_reader, new, etc. like the Rules but with an additional argument to denote the variant: Spell::new("path/to/bin", "en_GB").

There is one new struct Correcter with the fields:

struct Correcter {
   suggesters: Vec<Box<dyn Suggest>>
}

that can be constructed with

let correcter = Correcter::new(vec![
    Rules::from_reader(...).into(),
    Spell::from_reader(..., "en_GB").into()
])?; // implements `FromIterator` too

and implements Suggest as well. Getting the inner suggesters would then have to be done with downcast-rs and look like:

let rules = correcter.get::<Rules>(0)?;
// and mutably:
let spell = correcter.get_mut::<Spell>(1)?;
// I'm not 100% sure if this works with downcast-rs but I think it should

Extending the whitelist etc. would stay the same as before:

spell.options_mut().whitelist.push("nlprule".into());

But it would not need the guard anymore (so no Drop impl - I agree that mutable options with some guard are problematic because they hide cost from the user; this way I really don't see an issue). Setting the variant is fallible and would thus not be part of the options:

spell.set_variant("en_US")?;

The Python API would of course adjust accordingly keeping it in sync when possible.

The nice thing about all of this is that nothing is a breaking change since the Rules keeps the same methods; there are just additional Correcter and Spell structs. There are multiple problems though:

  • .correct and .suggest need the Tokenizer as second argument. These will by far be the most commonly used methods, so I'd like them to be as simple as possible, e.g. fn correct(text: &str) -> String and fn suggest(text: &str) -> Vec<Suggestion> like in the current PR, without having to explicitly keep the tokenizer around if you don't need it.
  • The downcast magic happening in the Correcter is not ideal but I believe it can not be avoided.
  • A third binary hurts ergonomics, especially because including a binary is quite verbose at the moment.
  • Spellchecking suggestions should have lower priority than suggestions from the grammar rules. This is not trivial to enforce with the Correcter and might require some notion of priority on the Suggestion.

What do you think?
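
(Also purely illustrative: one possible shape for the default correct body mentioned above, applying the first replacement of each suggestion; the Suggestion fields here are hypothetical, not from this PR.)

```rust
// Hypothetical Suggestion shape for this sketch; the real struct may differ.
struct Suggestion {
    start: usize,              // byte offset of the span to replace
    end: usize,                // exclusive end of the span
    replacements: Vec<String>, // ordered candidate replacements
}

// Possible default body for `Suggest::correct`: take the first replacement of
// each suggestion and apply them back to front so earlier offsets stay valid.
fn apply_suggestions(text: &str, mut suggestions: Vec<Suggestion>) -> String {
    suggestions.sort_by_key(|s| s.start);
    let mut corrected = text.to_string();
    for suggestion in suggestions.iter().rev() {
        if let Some(replacement) = suggestion.replacements.first() {
            corrected.replace_range(suggestion.start..suggestion.end, replacement);
        }
    }
    corrected
}

fn main() {
    let suggestions = vec![Suggestion {
        start: 0,
        end: 3,
        replacements: vec!["The".into()],
    }];
    assert_eq!(apply_suggestions("teh cat", suggestions), "The cat");
}
```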


let tokens = tokenizer.pipe(&opts.text);

println!("Tokens: {:#?}", tokens);
println!("Suggestions: {:#?}", rules.suggest(&opts.text, &tokenizer));
println!("Suggestions: {:#?}", rules.suggest(&opts.text));
}
6 changes: 4 additions & 2 deletions nlprule/src/bin/test.rs
@@ -1,3 +1,5 @@
use std::sync::Arc;

use clap::Clap;
use nlprule::{rules::Rules, tokenizer::Tokenizer};

@@ -19,8 +21,8 @@ fn main() {
env_logger::init();
let opts = Opts::parse();

let tokenizer = Tokenizer::new(opts.tokenizer).unwrap();
let rules_container = Rules::new(opts.rules).unwrap();
let tokenizer = Arc::new(Tokenizer::new(opts.tokenizer).unwrap());
Contributor (@drahnr):

Iff the above API changes are implemented, the Arc should be able to be avoided. Imho Spellcheck should parse the file and maybe use a Tokenizer reference only when extracting items.
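
(Sketch of the Arc-free shape this comment points at: the spellchecker stores no tokenizer and only borrows one per call; types and signatures here are illustrative, not from this PR.)

```rust
struct Tokenizer;
struct Suggestion;

struct Spellcheck {
    // data parsed up front from the spellchecker file; no tokenizer stored
}

impl Spellcheck {
    fn new(_path: &str) -> Self {
        Spellcheck {}
    }

    // The tokenizer is borrowed only while extracting the items to check.
    fn suggest(&self, _text: &str, _tokenizer: &Tokenizer) -> Vec<Suggestion> {
        Vec::new()
    }
}

fn main() {
    let tokenizer = Tokenizer;
    let spell = Spellcheck::new("path/to/spell.bin");
    let _suggestions = spell.suggest("some text", &tokenizer); // no Arc needed
}
```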

let rules_container = Rules::new(opts.rules, tokenizer.clone()).unwrap();
let rules = rules_container.rules();

println!("Runnable rules: {}", rules.len());