Skip to content

Commit

Permalink
CHANGE: Retain emphasis by default (#154)
Browse files Browse the repository at this point in the history
closes #154
  • Loading branch information
dwhieb committed Nov 18, 2023
1 parent 74ff10a commit f4113d1
Show file tree
Hide file tree
Showing 21 changed files with 201 additions and 100 deletions.
4 changes: 3 additions & 1 deletion .eslintrc.yml
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
extends:
- '@digitallinguistics/eslint-config'
- '@digitallinguistics/eslint-config'
env:
mocha: true
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,14 @@ A JavaScript library that converts linguistic texts in [scription format][script

## Options

Option | Default | Description
--------------------|-------------|----------------------------------------------------------------------------------------------------------------------
`codes` | `{}` | This option allows you to use custom backslash codes in your interlinear glosses. It should be a hash containing the scription code as a key (without a leading backslash), and the custom code as the value; ex: `"txn": "t"` will allow you to write `\t` instead of `\txn` for transcription lines.
`errors` | `"warn"` | This option allows you to specify how to handle errors. If set to `"warn"` (the default), an utterance which throws an error is skipped and a warning is logged to the console. If set to `"object"`, an error object with information is returned in the results array. If set to `false`, utterances with errors will be skipped silently. If set to `true`, utterances with errors will throw and stop further processing.
`orthography` | `"default"` | An abbreviation for the default orthography to use for transcriptions when one is not specified.
`parser` | `undefined` | A YAML parser to use in parsing the header of a scription document. If none is present, the header will be provided as a string in the `header` property of the returned object.
`utteranceMetadata` | `true` | Whether to parse the utterance metadata line (the first line when it begins with `#`). If set to `true`, a `metadata` property will be added to each utterance that has it.
| Option | Default | Description |
| ------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `codes` | `{}` | This option allows you to use custom backslash codes in your interlinear glosses. It should be a hash containing the scription code as a key (without a leading backslash), and the custom code as the value; ex: `"txn": "t"` will allow you to write `\t` instead of `\txn` for transcription lines. |
| `emphasis` | `true` | This option specifies whether emphasis should be passed through as-is (`true`, default), or stripped from the data (`false`). |
| `errors` | `"warn"` | This option allows you to specify how to handle errors. If set to `"warn"` (the default), an utterance which throws an error is skipped and a warning is logged to the console. If set to `"object"`, an error object with information is returned in the results array. If set to `false`, utterances with errors will be skipped silently. If set to `true`, utterances with errors will throw and stop further processing. |
| `orthography` | `"default"` | An abbreviation for the default orthography to use for transcriptions when one is not specified. |
| `parser` | `undefined` | A YAML parser to use in parsing the header of a scription document. If none is present, the header will be provided as a string in the `header` property of the returned object. |
| `utteranceMetadata` | `true` | Whether to parse the utterance metadata line (the first line when it begins with `#`). If set to `true`, a `metadata` property will be added to each utterance that has it. |

[actions]: https://github.com/digitallinguistics/scription2dlx/actions/
[AJV]: https://www.npmjs.com/package/ajv
Expand Down
2 changes: 2 additions & 0 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import parseUtterances from './parseUtterances.js'
*/
export default function scription2dlx(scription = ``, {
codes = {},
emphasis = true,
errors = `warn`,
orthography = `default`,
parser,
Expand All @@ -26,6 +27,7 @@ export default function scription2dlx(scription = ``, {
}

const options = {
emphasis,
errors,
orthography,
utteranceMetadata,
Expand Down
28 changes: 11 additions & 17 deletions src/parseUtterance/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,6 @@ export default function parseUtterance(rawLines, schema, codesHash, options) {
const utterance = {}
let lines = [...rawLines]

const {
errors,
orthography,
utteranceMetadata,
} = options

try {

// metadata
Expand All @@ -42,7 +36,7 @@ export default function parseUtterance(rawLines, schema, codesHash, options) {

const rawMetadata = lines.shift()

if (utteranceMetadata === true) {
if (options.utteranceMetadata === true) {
const metadata = parseMetadata(rawMetadata)
if (metadata) utterance.metadata = metadata
}
Expand Down Expand Up @@ -86,24 +80,24 @@ export default function parseUtterance(rawLines, schema, codesHash, options) {

// Transcript
if (types.includes(`trs`)) {
utterance.transcript = parseTranscript(codesHash.trs, lines, orthography)
utterance.transcript = parseTranscript(codesHash.trs, lines, options)
}

// Transcription
utterance.transcription = parseTranscription(codesHash.txn, lines, orthography)
utterance.transcription = parseTranscription(codesHash.txn, lines, options)

// Phonetic
if (types.includes(`phon`) && schema.includes(`phon`)) {
utterance.phonetic = parsePhonetic(lines[codesHash.phon])
utterance.phonetic = parsePhonetic(lines[codesHash.phon], options)
}

// Literal Translation
if (types.includes(`lit`)) {
utterance.literal = parseLiteral(codesHash.lit, lines)
utterance.literal = parseLiteral(codesHash.lit, lines, options)
}

// Free Translation
utterance.translation = parseTranslation(codesHash.tln, lines) || ``
utterance.translation = parseTranslation(codesHash.tln, lines, options) || ``

// Source
if (types.includes(`s`)) {
Expand All @@ -118,7 +112,7 @@ export default function parseUtterance(rawLines, schema, codesHash, options) {
}

// Words
const words = parseWords(codesHash, lines, orthography)
const words = parseWords(codesHash, lines, options)
if (words.length) utterance.words = words

// Notes
Expand All @@ -137,23 +131,23 @@ export default function parseUtterance(rawLines, schema, codesHash, options) {
}

if (typeof utterance.transcription === `string`) {
utterance.transcription = { [orthography]: utterance.transcription }
utterance.transcription = { [options.orthography]: utterance.transcription }
}

return utterance

} catch (e) {

if (!errors) return
if (!options.errors) return

e.text = rawLines.join(`\n`)

if (errors === `warn`) {
if (options.errors === `warn`) {
console.warn(e)
return
}

if (errors === `object`) {
if (options.errors === `object`) {
return e
}

Expand Down
10 changes: 6 additions & 4 deletions src/parseUtterance/parseLiteral.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@ import removeEmphasis from '../utilities/removeEmphasis.js'

/**
* Parses, validates, and cleans the literal translation lines
* @param {String} lineCode The line code to use for literal lines
* @param {Object} lines The lines hash
* @param {String} lineCode The line code to use for literal lines
* @param {Object} lines The lines hash
* @param {Object} options The options hash
* @param {Boolean} options.emphasis Whether to retain emphasis in the output
* @return {String|Object}
*/
export default function parseLiteral(lineCode, lines) {
export default function parseLiteral(lineCode, lines, options) {
let data = groupLines(lineCode, lines)
if (!data) return null
data = removeEmphasis(data)
if (options.emphasis === false) data = removeEmphasis(data)
// NB: Do not use the lineCode variable as the first argument to removeBrackets
// removeBrackets accepts an abstract type, not a line code, as its first argument.
// Use both `lit` and `tln` to remove square brackets and multiple types of single quotes
Expand Down
13 changes: 7 additions & 6 deletions src/parseUtterance/parseMorphemes/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,13 @@ function tokenizeWord(string) {

/**
* Accepts a lines hash for a word (morpheme and gloss lines) and returns an array of DLx Morpheme objects
* @param {Object} codes The hash of line codes
* @param {Object} wordHash The lines hash for the word
* @param {String} orthography The abbreviation to use for the default orthography, if none is provided.
* @return {Array} Returns an array of DLx Morpheme objects
* @param {Object} codes The hash of line codes
* @param {Object} wordHash The lines hash for the word
* @param {Object} options The options hash
* @param {String} options.orthography The abbreviation to use for the default orthography, if none is provided.
* @return {Array} Returns an array of DLx Morpheme objects
*/
export default function parseMorphemes(codes, wordHash, orthography) {
export default function parseMorphemes(codes, wordHash, options) {

const morphemeLines = getLines([codes.gl, codes.m], wordHash)

Expand All @@ -72,7 +73,7 @@ export default function parseMorphemes(codes, wordHash, orthography) {
let transcription = groupLines(codes.m, data) || ``
transcription = removeBrackets(`infix`, transcription)

if (typeof transcription === `string`) transcription = { [orthography]: transcription }
if (typeof transcription === `string`) transcription = { [options.orthography]: transcription }

return {
transcription,
Expand Down
9 changes: 6 additions & 3 deletions src/parseUtterance/parsePhonetic.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@ import removeEmphasis from '../utilities/removeEmphasis.js'

/**
* Cleans and validates the phonetic line
* @param {String} line The phonetic line
* @param {String} line The phonetic line
* @param {Object} options The options hash
* @param {Boolean} options.emphasis Whether to retain emphasis in the output
* @return {String}
*/
export default function parsePhonetic(line) {
export default function parsePhonetic(line, options) {
if (!line) return null
// NB: Do not use the lineCode variable as the first argument to removeBrackets
// removeBrackets accepts an abstract type, not a line code, as its first argument
return removeBrackets(`phon`, removeEmphasis(line))
if (options.emphasis === false) line = removeEmphasis(line)
return removeBrackets(`phon`, line)
}
11 changes: 6 additions & 5 deletions src/parseUtterance/parseTranscript.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@ import groupLines from '../utilities/groupLines.js'

/**
* Extracts, validates, and cleans the transcript lines from the lines hash
* @param {String} lineCode The line code for the transcript lines
* @param {Object} lines The lines hash
* @param {String} orthography The abbreviation to use for the transcript orthography if none is present
* @param {String} lineCode The line code for the transcript lines
* @param {Object} lines The lines hash
* @param {Object} options The options hash
* @param {String} options.orthography The abbreviation to use for the transcript orthography if none is present
* @return {String|Object}
*/
export default function parseTranscript(lineCode, lines, orthography) {
export default function parseTranscript(lineCode, lines, options) {

const data = groupLines(lineCode, lines)

if (typeof data === `string`) return { [orthography]: data }
if (typeof data === `string`) return { [options.orthography]: data }

return data || null

Expand Down
13 changes: 7 additions & 6 deletions src/parseUtterance/parseTranscription.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,24 @@ import removeEmphasis from '../utilities/removeEmphasis.js'

/**
* Extracts, validates, and cleans the transcription lines from the lines hash
* @param {String} lineCode The code to use for transcription lines
* @param {Object} lines The lines hash
* @param {String} orthography An abbreviation to use for the orthography if one is not specified
* @param {String} lineCode The code to use for transcription lines
* @param {Object} lines The lines hash
* @param {Object} options The options hash
* @param {String} options.orthography An abbreviation to use for the orthography if one is not specified
* @return {String|Object}
*/
export default function parseTranscription(lineCode, lines, orthography) {
export default function parseTranscription(lineCode, lines, options) {

let data = groupLines(lineCode, lines)

if (!data) return null

// NB: Do not use lineCode for removeBrackets here
// removeBrackets takes an abstract type, not a line code, as its first argument
data = removeEmphasis(data)
if (options.emphasis === false) data = removeEmphasis(data)
data = removeBrackets(`txn`, data)

if (typeof data === `string`) return { [orthography]: data }
if (typeof data === `string`) return { [options.orthography]: data }

return data

Expand Down
10 changes: 6 additions & 4 deletions src/parseUtterance/parseTranslation.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@ import removeEmphasis from '../utilities/removeEmphasis.js'

/**
* Extracts, validates, and cleans the translation lines from the lines hash
* @param {String} lineCode The line code to use for translations lines
* @param {Object} lines The lines hash
* @param {String} lineCode The line code to use for translations lines
* @param {Object} lines The lines hash
* @param {Object} options The options hash
* @param {Boolean} options.emphasis Whether to retain emphasis in the output
* @return {String|Object}
*/
export default function parseTranslation(lineCode, lines) {
export default function parseTranslation(lineCode, lines, options) {
let data = groupLines(lineCode, lines)
if (!data) return null
data = removeEmphasis(data)
if (options.emphasis === false) data = removeEmphasis(data)
// NB: removeBrackets accepts an abstract line type, not a specific line code
// as its first argument
data = removeBrackets(`tln`, data)
Expand Down
33 changes: 18 additions & 15 deletions src/parseUtterance/parseWords.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,25 @@ import zip from '../utilities/js/zip.js'

/**
* Parses the word hash into a DLx Word object
* @param {Object} codes The hash of line codes
* @param {Object} data The word hash
* @param {String} orthography The orthography abbreviation to use if one is not specified
* @return {Object} Returns a DLx Word object
* @param {Object} codes The hash of line codes
* @param {Object} data The word hash
* @param {Object} options The options hash
* @param {Boolean} options.emphasis Whether to retain emphasis in the output
* @param {String} options.orthography The orthography abbreviation to use if one is not specified
* @return {Object} Returns a DLx Word object
*/
function parseWord(codes, data, orthography) {
function parseWord(codes, data, options) {

data = removeEmphasis(data) // eslint-disable-line no-param-reassign
if (options.emphasis === false) data = removeEmphasis(data) // eslint-disable-line no-param-reassign

let transcription = groupLines(codes.w, data) || ``
let analysis = groupLines(codes.m, data)
const gloss = groupLines(codes.gl, data)
const literal = groupLines(codes.wlt, data)
const morphemes = parseMorphemes(codes, data, orthography)
const morphemes = parseMorphemes(codes, data, options)

if (typeof transcription === `string`) transcription = { [orthography]: transcription }
if (typeof analysis === `string`) analysis = { [orthography]: analysis }
if (typeof transcription === `string`) transcription = { [options.orthography]: transcription }
if (typeof analysis === `string`) analysis = { [options.orthography]: analysis }

return {
transcription,
Expand Down Expand Up @@ -52,12 +54,13 @@ function tokenizeLine(string) {

/**
* Extracts word-specific lines from the lines hash and converts them into an array of DLx Word objects
* @param {Object} codes The line codes hash
* @param {Object} lines The lines hash
* @param {String} orthography The abbreviation to use for the orthography if one is not specified
* @return {Array} Returns an array of DLx Word objects
* @param {Object} codes The line codes hash
* @param {Object} lines The lines hash
* @param {Object} options The options hash
* @param {String} options.orthography The abbreviation to use for the orthography if one is not specified
* @return {Array} Returns an array of DLx Word objects
*/
export default function parseWords(codesHash, lines, orthography) {
export default function parseWords(codesHash, lines, options) {

const wordLineCodes = wordTypes.map(type => codesHash[type])
const wordLines = getLines(wordLineCodes, lines)
Expand All @@ -74,6 +77,6 @@ export default function parseWords(codesHash, lines, orthography) {
validateNumItems(wordsHash)

return zip(wordsHash)
.map(wordData => parseWord(codesHash, wordData, orthography))
.map(wordData => parseWord(codesHash, wordData, options))

}
33 changes: 25 additions & 8 deletions test/lines/glosses.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ describe(`glosses`, () => {

})


it(`tokenizes words and morphemes correctly`, () => {

const text = `
Expand Down Expand Up @@ -270,7 +270,7 @@ describe(`glosses`, () => {
expect(w4m.gloss).to.equal(`null`)

})


it(`populates both word and morpheme glosses`, () => {

Expand All @@ -289,18 +289,35 @@ describe(`glosses`, () => {

})

it(`is stripped of emphasis`, () => {
it(`retains emphasis`, function() {

const wordGloss = `waxt-qungu`
const withEmphasis = `*waxt*-qungu`
const withoutEmphasis = `waxt-qungu`

const text = `
\\m ${ wordGloss } qasi
\\gl *waxt*-qungu qasi
\\m ${ withoutEmphasis } qasi
\\gl ${ withEmphasis } qasi
`

const { utterances: [{ words: [word] }] } = convert(text)
const { utterances: [{ words: [word] }] } = convert(text, { emphasis: true })

expect(word.gloss).to.equal(withEmphasis)

})

it(`strips emphasis`, () => {

const withEmphasis = `*waxt*-qungu`
const withoutEmphasis = `waxt-qungu`

const text = `
\\m ${ withoutEmphasis } qasi
\\gl ${ withEmphasis } qasi
`

const { utterances: [{ words: [word] }] } = convert(text, { emphasis: false })

expect(word.gloss).to.equal(wordGloss)
expect(word.gloss).to.equal(withoutEmphasis)

})

Expand Down
Loading

0 comments on commit f4113d1

Please sign in to comment.