Skip to content
This repository has been archived by the owner on May 10, 2023. It is now read-only.

feat: add sentence validator for catalan #606

Merged
merged 4 commits into from
Feb 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions server/lib/validation/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const defaultValidator = require('./languages/default');
const bas = require('./languages/bas');
const ca = require('./languages/ca');
const ckb = require('./languages/ckb');
const en = require('./languages/en');
const eo = require('./languages/eo');
Expand All @@ -16,6 +17,7 @@ const yue = require('./languages/yue');

const VALIDATORS = {
bas,
ca,
ckb,
en,
eo,
Expand Down
38 changes: 38 additions & 0 deletions server/lib/validation/languages/ca.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
const tokenizeWords = require('talisman/tokenizers/words/gersam');

// Minimum number of words that qualify as a sentence.
const MIN_WORDS = 1;

// Maximum number of words allowed per sentence, to keep recordings at a manageable duration.
const MAX_WORDS = 14;

/**
 * Validation rules for Catalan sentences.
 *
 * Each entry describes one reason a sentence is rejected and carries:
 *   - `fn`:    a predicate that returns `true` when the sentence violates the rule, or
 *   - `regex`: a pattern describing the violation (presumably a match means the
 *              sentence is invalid - the consumer of this list is not part of this file),
 *   - `error`: the Catalan message describing the violation.
 */
const INVALIDATIONS = [{
  // Word-count rule: the sentence is tokenized into words and must contain
  // between MIN_WORDS and MAX_WORDS words, both bounds included.
  fn: (sentence) => {
    const words = tokenizeWords('ca', sentence);
    return words.length < MIN_WORDS || words.length > MAX_WORDS;
  },
  error: `El nombre de paraules ha de ser entre ${MIN_WORDS} i ${MAX_WORDS} (inclòs)`,
}, {
  // Digits are not allowed anywhere in the sentence.
  regex: /[0-9]+/,
  error: 'La frase no pot contenir nombres',
}, {
  // Sentence-ending punctuation followed by more text could mean multiple
  // sentences per line.
  regex: /[?!.].+/,
  error: 'La frase no pot contenir signes de puntuació al mig',
}, {
  // Symbols not allowed; when extending this list, add the symbol to the regex as well:
  // < > + * \ # @ ^ “ ” ‘ ’ ( ) [ ] / { }
  // Runs of two or more whitespace characters or exclamation marks are rejected too.
  regex: /[<>+*\\#@^“”‘’(){}[\]/]|\s{2,}|!{2,}/,
  error: 'La frase no pot contenir símbols o múltiples espais o exclamacions',
}, {
  // Any words consisting of uppercase letters, or uppercase letters with periods
  // in between, are considered abbreviations or acronyms.
  // The character class lists the accented capitals of the Catalan alphabet
  // (À È É Í Ï Ò Ó Ú Ü Ç) next to A-Z; a plain A-Z range would let all-caps
  // tokens such as "ÉS" or "ÚS" through.
  // This also matches fooBAR, which we most probably don't want either,
  // as users wouldn't know how to pronounce the uppercase letters.
  regex: /[A-ZÀÈÉÍÏÒÓÚÜÇ]{2,}|[A-ZÀÈÉÍÏÒÓÚÜÇ]+\.*[A-ZÀÈÉÍÏÒÓÚÜÇ]+/,
  error: 'La frase no pot contenir abreviacions o acrònims',
}];

// Public interface of this module: the list of invalidation rules for Catalan sentences.
module.exports = {
INVALIDATIONS,
};