Skip to content
This repository has been archived by the owner on May 10, 2023. It is now read-only.

feat: add sentence validator for catalan #606

Merged
merged 4 commits into from
Feb 19, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions server/lib/validation/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ const th = require('./languages/th');
const ur = require('./languages/ur');
const uz = require('./languages/uz');
const yue = require('./languages/yue');
const ca = require('./languages/ca');
jmigual marked this conversation as resolved.
Show resolved Hide resolved

const VALIDATORS = {
bas,
Expand All @@ -29,6 +30,7 @@ const VALIDATORS = {
ur,
uz,
yue,
ca
jmigual marked this conversation as resolved.
Show resolved Hide resolved
};

module.exports = {
Expand Down
38 changes: 38 additions & 0 deletions server/lib/validation/languages/ca.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
const tokenizeWords = require('talisman/tokenizers/words/gersam');

// Minimum number of words that qualify as a sentence.
const MIN_WORDS = 1;

// Maximum number of words allowed per sentence, to keep recordings at a manageable duration.
const MAX_WORDS = 14;

/**
 * Rules that mark a Catalan sentence as invalid.
 *
 * Each rule carries an `error` message plus exactly one detector:
 *  - `fn(sentence)`: predicate that returns `true` when the sentence is invalid, or
 *  - `regex`: pattern whose match means the sentence is invalid.
 *
 * NOTE(review): how the rules are applied (order, first failure vs. all failures)
 * is decided by the shared validation module, which is not visible here.
 */
const INVALIDATIONS = [{
  // Word-count rule: the sentence is invalid when it falls outside
  // [MIN_WORDS, MAX_WORDS] after tokenization.
  fn: (sentence) => {
    const words = tokenizeWords('ca', sentence);
    return words.length < MIN_WORDS || words.length > MAX_WORDS;
  },
  error: `Number of words must be between ${MIN_WORDS} and ${MAX_WORDS} (inclusive)`,
}, {
  regex: /[0-9]+/,
  error: 'Sentence should not contain numbers',
}, {
  // Sentence punctuation followed by at least one more character:
  // this could mean multiple sentences per line.
  regex: /[?!.].+/,
  error: 'Sentence should not contain sentence punctuation inside a sentence',
}, {
  // Symbols not allowed; when extending this list, add them to the regex below as well:
  // < > + * \ # @ ^ “ ” ‘ ’ ( ) [ ] / { }
  regex: /[<>+*\\#@^“”‘’(){}[\]/]|\s{2,}|!{2,}/,
  error: 'Sentence should not contain symbols or multiple spaces/exclamation marks',
}, {
  // Any words consisting of uppercase letters, or uppercase letters with periods
  // in between, are considered abbreviations or acronyms.
  // The character class covers the Catalan uppercase letters (A-Z plus
  // À È É Í Ï Ò Ó Ú Ü Ç) so that all-caps tokens such as "ÉS" are caught too;
  // with plain A-Z they slipped through because only one ASCII capital is present.
  // This also matches fooBAR, but we most probably don't want that either,
  // as users wouldn't know how to pronounce the uppercase letters.
  // Assumes precomposed (NFC) input; decomposed accents are not recognised.
  regex: /[A-ZÀÈÉÍÏÒÓÚÜÇ]{2,}|[A-ZÀÈÉÍÏÒÓÚÜÇ]+\.*[A-ZÀÈÉÍÏÒÓÚÜÇ]+/,
  error: 'Sentence should not contain abbreviations',
}];

// Public surface of the Catalan validator: only the list of invalidation rules is exported.
module.exports = { INVALIDATIONS };