add validation option to PO parsing (closes #75)

* rc0 * rc1 * rc2 * rc3 * review comments -- use full error messages instead of regex; revert change in .eslintrc.js * handle optional parameters shifting * rc4 * review comments -- adjusted argument parsing logic * review comments -- simplifying public api functions signatures * review comments -- simplifying further public api and updated Readme
smhg · Mar 29, 2023 · 456b3c5 · 456b3c5
1 parent 2026f4a
commit 456b3c5
Show file tree

Hide file tree

Showing 15 changed files with 427 additions and 47 deletions.
diff --git a/.eslintrc.js b/.eslintrc.js
@@ -13,6 +13,6 @@ module.exports = {
     'ecmaVersion': 2018
   },
   'rules': {
-    'semi': ['error', 'always']
+    'semi': ['error', 'always'],
   }
 }
diff --git a/README.md b/README.md
@@ -11,17 +11,23 @@ Include the library:
 
     var gettextParser = require("gettext-parser");
 
-
 ### Parse PO files
 
 Parse a PO file with
 
-    gettextParser.po.parse(input[, defaultCharset]) → Object
+    gettextParser.po.parse(input[, options]) → Object
 
 Where
 
   * **input** is a *po* file as a Buffer or an unicode string. Charset is converted to unicode from other encodings only if the input is a Buffer, otherwise the charset information is discarded
-  * **defaultCharset** is the charset to use if charset is not defined or is the default `"CHARSET"` (applies only if *input* is a Buffer)
+  * **options** is an optional object with the following optional properties:
+    * **defaultCharset** is the charset to use if charset is not defined or is the default `"CHARSET"` (applies only if *input* is a Buffer)
+    * **validation** is a flag to turn on PO source file validation. The validation makes sure that:
+
+      * there is exactly zero or one `msgid_plural` definition per translation entry; a `Multiple msgid_plural error` error gets thrown otherwise.
+      * there are no duplicate entries with identical `msgid` values within the same context; a `Duplicate msgid error` error gets thrown otherwise.
+      * the number of plural forms matches exactly the number from `nplurals` defined in `Plural-Forms` header for entries that have plural forms; a `Plural forms range error` error gets thrown otherwise.
+      * the number of `msgstr` entries is exactly one (if `msgid_plural` is not defined) or exactly the number from `nplurals` (if `msgid_plural` is defined); a `Translation string range error` error gets thrown otherwise.
 
 Method returns gettext-parser specific translation object (see below)
 
@@ -37,12 +43,12 @@ console.log(po.translations['']); // output translations for the default context
 
 PO files can also be parsed from a stream source. After all input is processed the parser emits a single 'data' event which contains the parsed translation object.
 
-    gettextParser.po.createParseStream([defaultCharset][, streamOptions]) → Transform Stream
+    gettextParser.po.createParseStream([options][, transformOptions]) → Transform Stream
 
 Where
 
-  * **defaultCharset** is the charset to use if charset is not defined or is the default `"CHARSET"`
-  * **streamOptions** are the standard stream options
+  * **options** is an optional object, same as in `parse`. See [Parse PO files](#parse-po-files) section for details.
+  * **transformOptions** are the standard stream options.
 
 **Example**
 
@@ -80,6 +86,8 @@ var output = gettextParser.po.compile(data);
 require('fs').writeFileSync('filename.po', output);
 ```
 
+### 
+
 ### Parse MO files
 
 Parse a MO file with

diff --git a/lib/poparser.js b/lib/poparser.js
@@ -3,39 +3,44 @@ const sharedFuncs = require('./shared');
 const Transform = require('readable-stream').Transform;
 const util = require('util');
 
+module.exports.parse = parse;
+module.exports.stream = stream;
+
 /**
  * Parses a PO object into translation table
  *
- * @param {Buffer|String} buffer PO object
- * @param {String} [defaultCharset] Default charset to use
- * @return {Object} Translation object
+ * @typedef {{ defaultCharset?: string, validation?: boolean }} Options
+ * @param {string | Buffer} input PO object
+ * @param {Options} [options] Optional options with defaultCharset and validation
  */
-module.exports.parse = function (buffer, defaultCharset) {
-  const parser = new Parser(buffer, defaultCharset);
+function parse (input, options = {}) {
+  const parser = new Parser(input, options);
 
   return parser.parse();
 };
 
 /**
  * Parses a PO stream, emits translation table in object mode
  *
- * @param {String} [defaultCharset] Default charset to use
- * @param {String} [options] Stream options
- * @return {Stream} Transform stream
+ * @typedef {{ defaultCharset?: string, validation?: boolean }} Options
+ * @param {Options} [options] Optional options with defaultCharset and validation
+ * @param {import('readable-stream').TransformOptions} [transformOptions] Optional stream options
  */
-module.exports.stream = function (defaultCharset, options) {
-  return new PoParserTransform(defaultCharset, options);
+function stream (options = {}, transformOptions = {}) {
+  return new PoParserTransform(options, transformOptions);
 };
 
 /**
  * Creates a PO parser object. If PO object is a string,
  * UTF-8 will be used as the charset
  *
+ * @typedef {{ defaultCharset?: string, validation?: boolean }} Options
  * @constructor
- * @param {Buffer|String} fileContents PO object
- * @param {String} [defaultCharset] Default charset to use
+ * @param {string | Buffer} fileContents PO object
+ * @param {Options} options Options with defaultCharset and validation
  */
-function Parser (fileContents, defaultCharset = 'iso-8859-1') {
+function Parser (fileContents, { defaultCharset = 'iso-8859-1', validation = false }) {
+  this._validation = validation;
   this._charset = defaultCharset;
 
   this._lex = [];
@@ -378,6 +383,10 @@ Parser.prototype._handleValues = function (tokens) {
       response.push(lastNode);
     } else if (tokens[i].key.toLowerCase() === 'msgid_plural') {
       if (lastNode) {
+        if (this._validation && 'msgid_plural' in lastNode) {
+          throw new SyntaxError(`Multiple msgid_plural error: entry "${lastNode.msgid}" in "${lastNode.msgctxt || ''}" context has multiple msgid_plural declarations.`);
+        }
+
         lastNode.msgid_plural = tokens[i].value;
       }
 
@@ -404,6 +413,41 @@ Parser.prototype._handleValues = function (tokens) {
   return response;
 };
 
+/**
+ * Validate token
+ *
+ * @param {Object} token Parsed token
+ * @param {Object} translations Translation table
+ * @param {string} msgctxt Message entry context
+ * @param {number} nplurals Number of epected plural forms
+ * @throws Will throw an error if token validation fails
+ */
+Parser.prototype._validateToken = function (
+  {
+    msgid = '',
+    msgid_plural = '', // eslint-disable-line camelcase
+    msgstr = []
+  },
+  translations,
+  msgctxt,
+  nplurals
+) {
+  if (!this._validation) {
+    return;
+  }
+
+  if (msgid in translations[msgctxt]) {
+    throw new SyntaxError(`Duplicate msgid error: entry "${msgid}" in "${msgctxt}" context has already been declared.`);
+    // eslint-disable-next-line camelcase
+  } else if (msgid_plural && msgstr.length !== nplurals) {
+    // eslint-disable-next-line camelcase
+    throw new RangeError(`Plural forms range error: Expected to find ${nplurals} forms but got ${msgstr.length} for entry "${msgid_plural}" in "${msgctxt}" context.`);
+    // eslint-disable-next-line camelcase
+  } else if (!msgid_plural && msgstr.length !== 1) {
+    throw new RangeError(`Translation string range error: Extected 1 msgstr definitions associated with "${msgid}" in "${msgctxt}" context, found ${msgstr.length}.`);
+  }
+};
+
 /**
  * Compose a translation table from tokens object
  *
@@ -416,6 +460,7 @@ Parser.prototype._normalize = function (tokens) {
     headers: undefined,
     translations: {}
   };
+  let nplurals = 1;
   let msgctxt;
 
   for (let i = 0, len = tokens.length; i < len; i++) {
@@ -443,8 +488,11 @@ Parser.prototype._normalize = function (tokens) {
 
     if (!table.headers && !msgctxt && !tokens[i].msgid) {
       table.headers = sharedFuncs.parseHeader(tokens[i].msgstr[0]);
+      nplurals = sharedFuncs.parseNPluralFromHeadersSafely(table.headers, nplurals);
     }
 
+    this._validateToken(tokens[i], table.translations, msgctxt, nplurals);
+
     table.translations[msgctxt][tokens[i].msgid] = tokens[i];
   }
 
@@ -471,26 +519,22 @@ Parser.prototype._finalize = function (tokens) {
 /**
  * Creates a transform stream for parsing PO input
  *
+ * @typedef {{ defaultCharset?: string, validation?: boolean }} Options
  * @constructor
- * @param {String} [defaultCharset] Default charset to use
- * @param {String} [options] Stream options
+ * @param {Options} options Optional options with defaultCharset and validation
+ * @param {import('readable-stream').TransformOptions} transformOptions Optional stream options
  */
-function PoParserTransform (defaultCharset, options) {
-  if (!options && defaultCharset && typeof defaultCharset === 'object') {
-    options = defaultCharset;
-    defaultCharset = undefined;
-  }
-
-  this.defaultCharset = defaultCharset;
+function PoParserTransform (options, transformOptions) {
+  this.options = options;
   this._parser = false;
   this._tokens = {};
 
   this._cache = [];
   this._cacheSize = 0;
 
-  this.initialTreshold = options.initialTreshold || 2 * 1024;
+  this.initialTreshold = transformOptions.initialTreshold || 2 * 1024;
 
-  Transform.call(this, options);
+  Transform.call(this, transformOptions);
   this._writableState.objectMode = false;
   this._readableState.objectMode = true;
 }
@@ -520,7 +564,7 @@ PoParserTransform.prototype._transform = function (chunk, encoding, done) {
       this._cache = [];
     }
 
-    this._parser = new Parser(chunk, this.defaultCharset);
+    this._parser = new Parser(chunk, this.options);
   } else if (this._cacheSize) {
     // this only happens if we had an uncompleted 8bit sequence from the last iteration
     this._cache.push(chunk);
@@ -573,7 +617,7 @@ PoParserTransform.prototype._flush = function (done) {
   }
 
   if (!this._parser && chunk) {
-    this._parser = new Parser(chunk, this.defaultCharset);
+    this._parser = new Parser(chunk, this.options);
   }
 
   if (chunk) {

diff --git a/lib/shared.js b/lib/shared.js
@@ -1,10 +1,12 @@
 module.exports.parseHeader = parseHeader;
+module.exports.parseNPluralFromHeadersSafely = parseNPluralFromHeadersSafely;
 module.exports.generateHeader = generateHeader;
 module.exports.formatCharset = formatCharset;
 module.exports.foldLine = foldLine;
 module.exports.compareMsgid = compareMsgid;
 
 // see https://www.gnu.org/software/gettext/manual/html_node/Header-Entry.html
+const PLURAL_FORMS = 'Plural-Forms';
 const HEADERS = new Map([
   ['project-id-version', 'Project-Id-Version'],
   ['report-msgid-bugs-to', 'Report-Msgid-Bugs-To'],
@@ -15,11 +17,13 @@ const HEADERS = new Map([
   ['language', 'Language'],
   ['content-type', 'Content-Type'],
   ['content-transfer-encoding', 'Content-Transfer-Encoding'],
-  ['plural-forms', 'Plural-Forms']
+  ['plural-forms', PLURAL_FORMS]
 ]);
 
 module.exports.HEADERS = HEADERS;
 
+const PLURAL_FORM_HEADER_NPLURALS_REGEX = /nplurals\s*=\s*(?<nplurals>\d+)/;
+
 /**
  * Parses a header string into an object of key-value pairs
  *
@@ -44,6 +48,26 @@ function parseHeader (str = '') {
     }, {});
 }
 
+/**
+ * Attempts to safely parse 'nplurals" value from "Plural-Forms" header
+ *
+ * @param {Object} [headers = {}] An object with parsed headers
+ * @returns {number} Parsed result
+ */
+function parseNPluralFromHeadersSafely (headers = {}, fallback = 1) {
+  const pluralForms = headers[PLURAL_FORMS];
+
+  if (!pluralForms) {
+    return fallback;
+  }
+
+  const {
+    groups: { nplurals } = { nplurals: '' + fallback }
+  } = pluralForms.match(PLURAL_FORM_HEADER_NPLURALS_REGEX) || {};
+
+  return parseInt(nplurals, 10) || fallback;
+}
+
 /**
  * Joins a header object of key value pairs into a header string
  *
@@ -128,6 +152,7 @@ function foldLine (str, maxLen = 76) {
 
 /**
  * Comparator function for comparing msgid
+ *
  * @param {Object} object with msgid prev
  * @param {Object} object with msgid next
  * @returns {number} comparator index

diff --git a/test/fixtures/headers-known.json b/test/fixtures/headers-known.json
@@ -0,0 +1,21 @@
+{
+    "charset": "utf-8",
+    "headers": {
+        "Project-Id-Version": "project 1.0.2",
+        "Content-Type": "text/plain; charset=utf-8",
+        "Content-Transfer-Encoding": "8bit",
+        "Plural-Forms": "nplurals=2; plural=(n!=1);",
+        "Mime-Version": "1.0",
+        "X-Poedit-SourceCharset": "UTF-8"
+    },
+    "translations": {
+        "": {
+            "": {
+                "msgid": "",
+                "msgstr": [
+                    "Project-Id-Version: project 1.0.2\nContent-Type: text/plain; charset=utf-8\nContent-Transfer-Encoding: 8bit\nPlural-Forms: nplurals=2; plural=(n!=1);\nMime-Version: 1.0\nX-Poedit-SourceCharset: UTF-8\n"
+                ]
+            }
+        }
+    }
+}
diff --git a/test/fixtures/headers-known.po b/test/fixtures/headers-known.po
@@ -0,0 +1,8 @@
+msgid ""
+msgstr ""
+"Project-Id-Version: project 1.0.2\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Plural-Forms: nplurals=2; plural=(n!=1);\n"
+"Mime-Version: 1.0\n"
+"X-Poedit-SourceCharset: UTF-8\n"
diff --git a/test/fixtures/validate-context-duplicate-entries.po b/test/fixtures/validate-context-duplicate-entries.po
@@ -0,0 +1,29 @@
+msgid ""
+msgstr ""
+"Project-Id-Version: project 1.0.2\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Plural-Forms: nplurals=2; plural=(n!=1);\n"
+"Mime-Version: 1.0\n"
+"X-Poedit-SourceCharset: UTF-8\n"
+
+# Plural string in "c1" context
+msgctxt "c1"
+msgid "o1-1"
+msgid_plural "o1-2"
+msgstr[0] "t1-1"
+msgstr[1] "t1-2"
+
+# Plural string in "c2" context
+msgctxt "c2"
+msgid "o1-1"
+msgid_plural "o1-2"
+msgstr[0] "t1-1"
+msgstr[1] "t1-2"
+
+# Plural string duplicate in "c2" context
+msgctxt "c2"
+msgid "o1-1"
+msgid_plural "o1-2"
+msgstr[0] "t1-1"
+msgstr[1] "t1-2"
diff --git a/test/fixtures/validate-duplicate-msgid.po b/test/fixtures/validate-duplicate-msgid.po
@@ -0,0 +1,16 @@
+msgid ""
+msgstr ""
+"Project-Id-Version: project 1.0.2\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Plural-Forms: nplurals=2; plural=(n!=1);\n"
+"Mime-Version: 1.0\n"
+"X-Poedit-SourceCharset: UTF-8\n"
+
+# Normal string
+msgid "o1"
+msgstr "t1"
+
+# Normal string duplicate
+msgid "o1"
+msgstr "t2"