CHANGE: Retain emphasis by default (#154)

closes #154
digitallinguistics · Nov 18, 2023 · f4113d1 · f4113d1
1 parent 74ff10a
commit f4113d1
Show file tree

Hide file tree

Showing 21 changed files with 201 additions and 100 deletions.
diff --git a/.eslintrc.yml b/.eslintrc.yml
@@ -1,2 +1,4 @@
 extends:
-  - '@digitallinguistics/eslint-config'
+  - '@digitallinguistics/eslint-config'
+env:
+  mocha: true
diff --git a/README.md b/README.md
@@ -95,13 +95,14 @@ A JavaScript library that converts linguistic texts in [scription format][script
 
 ## Options
 
-Option              | Default     | Description
---------------------|-------------|----------------------------------------------------------------------------------------------------------------------
-`codes`             | `{}`        | This option allows you to use custom backslash codes in your interlinear glosses. It should be a hash containing the scription code as a key (without a leading backslash), and the custom code as the value; ex: `"txn": "t"` will allow you to write `\t` instead of `\txn` for transcription lines.
-`errors`            | `"warn"`    | This option allows you to specify how to handle errors. If set to `"warn""` (the default), an utterance which throws an error is skipped and a warning is logged to the console. If set to `"object"`, an error object with information is returned in the results array. If set to `false`, utterances with errors will be skipped silently. If set to `true`, utterances with errors will throw and stop further processing.
-`orthography`       | `"default"` | An abbreviation for the default orthography to use for transcriptions when one is not specified.
-`parser`            | `undefined` | A YAML parser to use in parsing the header of a scription document. If none is present, the header will be provided as a string in the `header` property of the returned object.
-`utteranceMetadata` | `true`      | Whether to parse the utterance metadata line (the first line when it begins with `#`). If set to `true`, a `metadata` property will be added to each utterance that has it.
+| Option              | Default     | Description                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| ------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `codes`             | `{}`        | This option allows you to use custom backslash codes in your interlinear glosses. It should be a hash containing the scription code as a key (without a leading backslash), and the custom code as the value; ex: `"txn": "t"` will allow you to write `\t` instead of `\txn` for transcription lines.                                                                                                                         |
+| `emphasis`          | `true`      | This option specifies whether emphasis should be passed through as-is (`true`, default), or stripped from the data (`false`). |
+| `errors`            | `"warn"`    | This option allows you to specify how to handle errors. If set to `"warn"` (the default), an utterance which throws an error is skipped and a warning is logged to the console. If set to `"object"`, an error object with information is returned in the results array. If set to `false`, utterances with errors will be skipped silently. If set to `true`, utterances with errors will throw and stop further processing.  |
+| `orthography`       | `"default"` | An abbreviation for the default orthography to use for transcriptions when one is not specified.                                                                                                                                                                                                                                                                                                                               |
+| `parser`            | `undefined` | A YAML parser to use in parsing the header of a scription document. If none is present, the header will be provided as a string in the `header` property of the returned object.                                                                                                                                                                                                                                               |
+| `utteranceMetadata` | `true`      | Whether to parse the utterance metadata line (the first line when it begins with `#`). If set to `true`, a `metadata` property will be added to each utterance that has it.                                                                                                                                                                                                                                                    |
 
 [actions]:   https://github.com/digitallinguistics/scription2dlx/actions/
 [AJV]:       https://www.npmjs.com/package/ajv

diff --git a/src/index.js b/src/index.js
@@ -11,6 +11,7 @@ import parseUtterances               from './parseUtterances.js'
  */
 export default function scription2dlx(scription = ``, {
   codes = {},
+  emphasis = true,
   errors = `warn`,
   orthography = `default`,
   parser,
@@ -26,6 +27,7 @@ export default function scription2dlx(scription = ``, {
   }
 
   const options = {
+    emphasis,
     errors,
     orthography,
     utteranceMetadata,

diff --git a/src/parseUtterance/index.js b/src/parseUtterance/index.js
@@ -26,12 +26,6 @@ export default function parseUtterance(rawLines, schema, codesHash, options) {
   const utterance = {}
   let lines       = [...rawLines]
 
-  const {
-    errors,
-    orthography,
-    utteranceMetadata,
-  } = options
-
   try {
 
     // metadata
@@ -42,7 +36,7 @@ export default function parseUtterance(rawLines, schema, codesHash, options) {
 
       const rawMetadata = lines.shift()
 
-      if (utteranceMetadata === true) {
+      if (options.utteranceMetadata === true) {
         const metadata = parseMetadata(rawMetadata)
         if (metadata) utterance.metadata = metadata
       }
@@ -86,24 +80,24 @@ export default function parseUtterance(rawLines, schema, codesHash, options) {
 
     // Transcript
     if (types.includes(`trs`)) {
-      utterance.transcript  = parseTranscript(codesHash.trs, lines, orthography)
+      utterance.transcript  = parseTranscript(codesHash.trs, lines, options)
     }
 
     // Transcription
-    utterance.transcription = parseTranscription(codesHash.txn, lines, orthography)
+    utterance.transcription = parseTranscription(codesHash.txn, lines, options)
 
     // Phonetic
     if (types.includes(`phon`) && schema.includes(`phon`)) {
-      utterance.phonetic = parsePhonetic(lines[codesHash.phon])
+      utterance.phonetic = parsePhonetic(lines[codesHash.phon], options)
     }
 
     // Literal Translation
     if (types.includes(`lit`)) {
-      utterance.literal = parseLiteral(codesHash.lit, lines)
+      utterance.literal = parseLiteral(codesHash.lit, lines, options)
     }
 
     // Free Translation
-    utterance.translation = parseTranslation(codesHash.tln, lines) || ``
+    utterance.translation = parseTranslation(codesHash.tln, lines, options) || ``
 
     // Source
     if (types.includes(`s`)) {
@@ -118,7 +112,7 @@ export default function parseUtterance(rawLines, schema, codesHash, options) {
     }
 
     // Words
-    const words = parseWords(codesHash, lines, orthography)
+    const words = parseWords(codesHash, lines, options)
     if (words.length) utterance.words = words
 
     // Notes
@@ -137,23 +131,23 @@ export default function parseUtterance(rawLines, schema, codesHash, options) {
     }
 
     if (typeof utterance.transcription === `string`) {
-      utterance.transcription = { [orthography]: utterance.transcription }
+      utterance.transcription = { [options.orthography]: utterance.transcription }
     }
 
     return utterance
 
   } catch (e) {
 
-    if (!errors) return
+    if (!options.errors) return
 
     e.text = rawLines.join(`\n`)
 
-    if (errors === `warn`) {
+    if (options.errors === `warn`) {
       console.warn(e)
       return
     }
 
-    if (errors === `object`) {
+    if (options.errors === `object`) {
       return e
     }
 

diff --git a/src/parseUtterance/parseLiteral.js b/src/parseUtterance/parseLiteral.js
@@ -4,14 +4,16 @@ import removeEmphasis from '../utilities/removeEmphasis.js'
 
 /**
  * Parses, validates, and cleans the literal translation lines
- * @param  {String}        lineCode The line code to use for literal lines
- * @param  {Object}        lines    The lines hash
+ * @param  {String}        lineCode         The line code to use for literal lines
+ * @param  {Object}        lines            The lines hash
+ * @param  {Object}        options          The options hash
+ * @param  {Boolean}       options.emphasis Whether to retain emphasis in the output
  * @return {String|Object}
  */
-export default function parseLiteral(lineCode, lines) {
+export default function parseLiteral(lineCode, lines, options) {
   let data = groupLines(lineCode, lines)
   if (!data) return null
-  data = removeEmphasis(data)
+  if (options.emphasis === false) data = removeEmphasis(data)
   // NB: Do not use the lineCode variable as the first argument to removeBrackets
   // removeBrackets accept an abstract type, not a line code, as its first argument.
   // Use both `lit` and `tln` to remove square brackets and multiple types of single quotes

diff --git a/src/parseUtterance/parseMorphemes/index.js b/src/parseUtterance/parseMorphemes/index.js
@@ -49,12 +49,13 @@ function tokenizeWord(string) {
 
 /**
  * Accepts an lines hash for a word (morpheme and gloss lines) and returns an array of DLx Morpheme objects
- * @param  {Object} codes       The hash of line codes
- * @param  {Object} wordHash    The lines hash for the word
- * @param  {String} orthogrpahy The abbreviation to use for the default orthography, if none is provided.
- * @return {Array}              Returns an array of DLx Morpheme objeccts
+ * @param  {Object} codes               The hash of line codes
+ * @param  {Object} wordHash            The lines hash for the word
+ * @param  {Object} options             The options hash
+ * @param  {String} options.orthography The abbreviation to use for the default orthography, if none is provided.
+ * @return {Array}                      Returns an array of DLx Morpheme objects
  */
-export default function parseMorphemes(codes, wordHash, orthography) {
+export default function parseMorphemes(codes, wordHash, options) {
 
   const morphemeLines = getLines([codes.gl, codes.m], wordHash)
 
@@ -72,7 +73,7 @@ export default function parseMorphemes(codes, wordHash, orthography) {
     let transcription = groupLines(codes.m, data) || ``
     transcription     = removeBrackets(`infix`, transcription)
 
-    if (typeof transcription === `string`) transcription = { [orthography]: transcription }
+    if (typeof transcription === `string`) transcription = { [options.orthography]: transcription }
 
     return {
       transcription,

diff --git a/src/parseUtterance/parsePhonetic.js b/src/parseUtterance/parsePhonetic.js
@@ -3,12 +3,15 @@ import removeEmphasis from '../utilities/removeEmphasis.js'
 
 /**
  * Cleans and validates the phonetic line
- * @param  {String} line The phonetic line
+ * @param  {String}  line             The phonetic line
+ * @param  {Object}  options          The options hash
+ * @param  {Boolean} options.emphasis Whether to retain emphasis in the output
  * @return {String}
  */
-export default function parsePhonetic(line) {
+export default function parsePhonetic(line, options) {
   if (!line) return null
   // NB: Do not use the lineCode variable as the first argument to removeBrackets
   // removeBrackets accept an abstract type, not a line code, as its first argument
-  return removeBrackets(`phon`, removeEmphasis(line))
+  if (options.emphasis === false) line = removeEmphasis(line)
+  return removeBrackets(`phon`, line)
 }
diff --git a/src/parseUtterance/parseTranscript.js b/src/parseUtterance/parseTranscript.js
@@ -2,16 +2,17 @@ import groupLines from '../utilities/groupLines.js'
 
 /**
  * Extracts, validates, and cleans the transcript lines from the lines hash
- * @param  {String}        lineCode    The line code for the transcript lines
- * @param  {Object}        lines       The lines hash
- * @param  {String}        orthography The abbreviation to use for the transcript orthography if none is present
+ * @param  {String}        lineCode            The line code for the transcript lines
+ * @param  {Object}        lines               The lines hash
+ * @param  {Object}        options             The options hash
+ * @param  {String}        options.orthography The abbreviation to use for the transcript orthography if none is present
  * @return {String|Object}
  */
-export default function parseTranscript(lineCode, lines, orthography) {
+export default function parseTranscript(lineCode, lines, options) {
 
   const data = groupLines(lineCode, lines)
 
-  if (typeof data === `string`) return { [orthography]: data }
+  if (typeof data === `string`) return { [options.orthography]: data }
 
   return data || null
 

diff --git a/src/parseUtterance/parseTranscription.js b/src/parseUtterance/parseTranscription.js
@@ -4,23 +4,24 @@ import removeEmphasis from '../utilities/removeEmphasis.js'
 
 /**
  * Extracts, validates, and cleans the transcription lines from the lines hash
- * @param  {String}        lineCode    The code to use for transcription lines
- * @param  {Object}        lines       The lines hash
- * @param  {String}        orthography An abbreviation to use for the orthography if one is not specified
+ * @param  {String}        lineCode            The code to use for transcription lines
+ * @param  {Object}        lines               The lines hash
+ * @param  {Object}        options             The options hash
+ * @param  {String}        options.orthography An abbreviation to use for the orthography if one is not specified
  * @return {String|Object}
  */
-export default function parseTranscription(lineCode, lines, orthography) {
+export default function parseTranscription(lineCode, lines, options) {
 
   let data = groupLines(lineCode, lines)
 
   if (!data) return null
 
   // NB: Do not use lineCode for removeBrackets here
   // removeBrackets takes an abstract type, not a line code, as its first argument
-  data = removeEmphasis(data)
+  if (options.emphasis === false) data = removeEmphasis(data)
   data = removeBrackets(`txn`, data)
 
-  if (typeof data === `string`) return { [orthography]: data }
+  if (typeof data === `string`) return { [options.orthography]: data }
 
   return data
 

diff --git a/src/parseUtterance/parseTranslation.js b/src/parseUtterance/parseTranslation.js
@@ -4,14 +4,16 @@ import removeEmphasis from '../utilities/removeEmphasis.js'
 
 /**
  * Extracts, validates, and cleans the translation lines from the lines hash
- * @param  {String}        lineCode The line code to use for translations lines
- * @param  {Object}        lines    The lines hash
+ * @param  {String}        lineCode         The line code to use for translations lines
+ * @param  {Object}        lines            The lines hash
+ * @param  {Object}        options          The options hash
+ * @param  {Boolean}       options.emphasis Whether to retain emphasis in the output
  * @return {String|Object}
  */
-export default function parseTranslation(lineCode, lines) {
+export default function parseTranslation(lineCode, lines, options) {
   let data = groupLines(lineCode, lines)
   if (!data) return null
-  data = removeEmphasis(data)
+  if (options.emphasis === false) data = removeEmphasis(data)
   // NB: removeBrackets accepts an abstract line type, not a specific line code
   // as its first argument
   data = removeBrackets(`tln`, data)

diff --git a/src/parseUtterance/parseWords.js b/src/parseUtterance/parseWords.js
@@ -8,23 +8,25 @@ import zip              from '../utilities/js/zip.js'
 
 /**
  * Parses the word hash into a DLx Word object
- * @param  {Object} codes       The hash of line codes
- * @param  {Object} data        The word hash
- * @param  {String} orthography The orthography abbreviation to use if one is not specified
- * @return {Object}             Returns a DLx Word object
+ * @param  {Object}  codes               The hash of line codes
+ * @param  {Object}  data                The word hash
+ * @param  {Object}  options             The options hash
+ * @param  {Boolean} options.emphasis    Whether to retain emphasis in the output
+ * @param  {String}  options.orthography The orthography abbreviation to use if one is not specified
+ * @return {Object}                      Returns a DLx Word object
  */
-function parseWord(codes, data, orthography) {
+function parseWord(codes, data, options) {
 
-  data = removeEmphasis(data) // eslint-disable-line no-param-reassign
+  if (options.emphasis === false) data = removeEmphasis(data) // eslint-disable-line no-param-reassign
 
   let transcription = groupLines(codes.w, data) || ``
   let analysis      = groupLines(codes.m, data)
   const gloss       = groupLines(codes.gl, data)
   const literal     = groupLines(codes.wlt, data)
-  const morphemes   = parseMorphemes(codes, data, orthography)
+  const morphemes   = parseMorphemes(codes, data, options)
 
-  if (typeof transcription === `string`) transcription = { [orthography]: transcription }
-  if (typeof analysis === `string`) analysis = { [orthography]: analysis }
+  if (typeof transcription === `string`) transcription = { [options.orthography]: transcription }
+  if (typeof analysis === `string`) analysis = { [options.orthography]: analysis }
 
   return {
     transcription,
@@ -52,12 +54,13 @@ function tokenizeLine(string) {
 
 /**
  * Extracts word-specific lines from the lines hash and converts them into an array of DLx Word objects
- * @param  {Object} codes       The line codes hash
- * @param  {Object} lines       The lines hash
- * @param  {String} orthography The abbreviation to use for the orthography if one is not specified
- * @return {Array}              Returns an array of DLx Word objects
+ * @param  {Object} codesHash           The line codes hash
+ * @param  {Object} lines               The lines hash
+ * @param  {Object} options             The options hash
+ * @param  {String} options.orthography The abbreviation to use for the orthography if one is not specified
+ * @return {Array}                      Returns an array of DLx Word objects
  */
-export default function parseWords(codesHash, lines, orthography) {
+export default function parseWords(codesHash, lines, options) {
 
   const wordLineCodes = wordTypes.map(type => codesHash[type])
   const wordLines     = getLines(wordLineCodes, lines)
@@ -74,6 +77,6 @@ export default function parseWords(codesHash, lines, orthography) {
   validateNumItems(wordsHash)
 
   return zip(wordsHash)
-  .map(wordData => parseWord(codesHash, wordData, orthography))
+  .map(wordData => parseWord(codesHash, wordData, options))
 
 }
diff --git a/test/lines/glosses.test.js b/test/lines/glosses.test.js
@@ -216,7 +216,7 @@ describe(`glosses`, () => {
 
   })
 
-   
+
   it(`tokenizes words and morphemes correctly`, () => {
 
     const text = `
@@ -270,7 +270,7 @@ describe(`glosses`, () => {
     expect(w4m.gloss).to.equal(`null`)
 
   })
-   
+
 
   it(`populates both word and morpheme glosses`, () => {
 
@@ -289,18 +289,35 @@ describe(`glosses`, () => {
 
   })
 
-  it(`is stripped of emphasis`, () => {
+  it(`retains emphasis`, function() {
 
-    const wordGloss = `waxt-qungu`
+    const withEmphasis    = `*waxt*-qungu`
+    const withoutEmphasis = `waxt-qungu`
 
     const text = `
-    \\m ${ wordGloss } qasi
-    \\gl *waxt*-qungu qasi
+    \\m  ${ withoutEmphasis } qasi
+    \\gl ${ withEmphasis } qasi
     `
 
-    const { utterances: [{ words: [word] }] } = convert(text)
+    const { utterances: [{ words: [word] }] } = convert(text, { emphasis: true })
+
+    expect(word.gloss).to.equal(withEmphasis)
+
+  })
+
+  it(`strips emphasis`, () => {
+
+    const withEmphasis    = `*waxt*-qungu`
+    const withoutEmphasis = `waxt-qungu`
+
+    const text = `
+    \\m  ${ withoutEmphasis } qasi
+    \\gl ${ withEmphasis } qasi
+    `
+
+    const { utterances: [{ words: [word] }] } = convert(text, { emphasis: false })
 
-    expect(word.gloss).to.equal(wordGloss)
+    expect(word.gloss).to.equal(withoutEmphasis)
 
   })