Skip to content

Commit

Permalink
Add support to remove extra spaces when combining segment body
Browse files Browse the repository at this point in the history
Resolves #22
  • Loading branch information
stevencrader committed Jun 9, 2023
1 parent 0b8d878 commit a8ce604
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 5 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "transcriptator",
"version": "1.1.0",
"version": "1.1.2+beta.0",
"packageManager": "yarn@3.4.1",
"description": "Library for converting the various transcript file formats to a common format.",
"main": "index.ts",
Expand Down
18 changes: 16 additions & 2 deletions src/segments.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,20 @@ import { DEFAULT_COMBINE_SEGMENTS_LENGTH, Segment } from "./types"
* Regular Expression for detecting punctuation that should not be prefixed with a space
*/
const PATTERN_PUNCTUATIONS = /^ *[.,?!}\]>) *$]/
/**
 * Regular Expression for detecting a run of space characters at the end of a string
 *
 * Only the plain space character (U+0020) is matched; tabs and line breaks are left untouched.
 * The pattern is anchored at the end only, so it matches trailing spaces after other text
 * as well as strings that consist of nothing but spaces.
 */
const PATTERN_TRAILING_SPACE = / +$/

/**
 * Remove any trailing space characters from data
 *
 * Leading and interior spaces are preserved. A string made up entirely of spaces
 * becomes the empty string.
 *
 * @param data text to trim
 * @returns text with any trailing space character removed
 */
const trimEndSpace = (data: string): string => {
    return data.replace(PATTERN_TRAILING_SPACE, "")
}

/**
* Append `addition` to `body` with the character(s) specified.
Expand All @@ -23,9 +37,9 @@ const joinBody = (body: string, addition: string, separator: string = undefined)
if (PATTERN_PUNCTUATIONS.exec(addition)) {
separatorToUse = ""
}
return `${body}${separatorToUse}${addition}`
return `${trimEndSpace(body)}${separatorToUse}${trimEndSpace(addition)}`
}
return addition
return trimEndSpace(addition)
}

/**
Expand Down
13 changes: 11 additions & 2 deletions test/json.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { describe, expect, test } from "@jest/globals"
import { IOptions, Options, Segment } from "../src"
import { parseJSON } from "../src/formats/json"

import { readFile, TestFiles } from "./test_utils"
import { readFile, saveSegmentsToFile, TestFiles } from "./test_utils"

describe("JSON formats test", () => {
test.each<{
Expand Down Expand Up @@ -190,11 +190,20 @@ describe("Parse JSON file data", () => {
},
id: "Podnews Weekly Review 2023-05-05, combine speaker",
},
])("Parse JSON File ($id)", ({ filePath, expectedFilePath, options }) => {
{
filePath: TestFiles.TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_06_02,
expectedFilePath: TestFiles.TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_06_02_OUTPUT,
options: {
combineSpeaker: true,
},
id: "Podnews Weekly Review 2023-06-02, extra space",
},
])("Parse JSON File ($id)", ({ filePath, expectedFilePath, options, id }) => {
const data = readFile(filePath)
const expectedJSONData = JSON.parse(readFile(expectedFilePath))
Options.setOptions(options)
const segments = parseJSON(data)
saveSegmentsToFile(segments, `out_json_${id}.json`)
expect(segments).toEqual(expectedJSONData.segments)
})
})
1 change: 1 addition & 0 deletions test/test_files/podnews_weekly_review_2023-06-02.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"version":"1.0.0","segments":[{"speaker":"Speaker 1","startTime":0.221,"endTime":0.361,"body":"It's"},{"speaker":"Speaker 1","startTime":0.361,"endTime":0.361,"body":" "},{"speaker":"Speaker 1","startTime":0.462,"endTime":0.883,"body":"Friday"},{"speaker":"Speaker 1","startTime":0.883,"endTime":0.883,"body":","},{"speaker":"Speaker 1","startTime":0.883,"endTime":0.883,"body":" "},{"speaker":"Speaker 1","startTime":0.963,"endTime":1.104,"body":"the"},{"speaker":"Speaker 1","startTime":1.104,"endTime":1.104,"body":" "},{"speaker":"Speaker 1","startTime":1.264,"endTime":1.605,"body":"2nd"},{"speaker":"Speaker 1","startTime":1.605,"endTime":1.605,"body":" "},{"speaker":"Speaker 1","startTime":1.625,"endTime":1.665,"body":"of"},{"speaker":"Speaker 1","startTime":1.665,"endTime":1.665,"body":" "},{"speaker":"Speaker 1","startTime":1.726,"endTime":1.987,"body":"June"},{"speaker":"Speaker 1","startTime":1.987,"endTime":1.987,"body":" "},{"speaker":"Speaker 1","startTime":2.007,"endTime":3.09,"body":"2023"},{"speaker":"Speaker 1","startTime":3.09,"endTime":3.09,"body":"."},{"speaker":"Speaker 1","startTime":3.09,"endTime":3.09,"body":" "},{"speaker":"Speaker 2","startTime":4.34,"endTime":4.44,"body":"The"},{"speaker":"Speaker 2","startTime":4.44,"endTime":4.44,"body":" "},{"speaker":"Speaker 2","startTime":4.521,"endTime":4.881,"body":"last"},{"speaker":"Speaker 2","startTime":4.881,"endTime":4.881,"body":" "},{"speaker":"Speaker 2","startTime":4.961,"endTime":5.262,"body":"word"},{"speaker":"Speaker 2","startTime":5.262,"endTime":5.262,"body":" "},{"speaker":"Speaker 2","startTime":5.382,"endTime":5.462,"body":"in"},{"speaker":"Speaker 2","startTime":5.462,"endTime":5.462,"body":" "},{"speaker":"Speaker 2","startTime":5.542,"endTime":6.224,"body":"podcasting"},{"speaker":"Speaker 2","startTime":6.224,"endTime":6.224,"body":" "},{"speaker":"Speaker 2","startTime":6.264,"endTime":6.604,"body":"news"},{"speaker":"Speaker 
2","startTime":6.604,"endTime":6.604,"body":"."},{"speaker":"Speaker 2","startTime":6.604,"endTime":6.604,"body":" "},{"speaker":"Speaker 2","startTime":7.186,"endTime":7.506,"body":"This"},{"speaker":"Speaker 2","startTime":7.506,"endTime":7.506,"body":" "},{"speaker":"Speaker 2","startTime":7.887,"endTime":7.967,"body":"is"},{"speaker":"Speaker 2","startTime":7.967,"endTime":7.967,"body":" "},{"speaker":"Speaker 2","startTime":8.027,"endTime":8.087,"body":"the"},{"speaker":"Speaker 2","startTime":8.087,"endTime":8.087,"body":" "},{"speaker":"Speaker 2","startTime":8.147,"endTime":8.408,"body":"Pod"},{"speaker":"Speaker 2","startTime":8.408,"endTime":8.408,"body":" "},{"speaker":"Speaker 2","startTime":8.468,"endTime":8.729,"body":"News"},{"speaker":"Speaker 2","startTime":8.729,"endTime":8.729,"body":" "}]}
20 changes: 20 additions & 0 deletions test/test_files/podnews_weekly_review_2023-06-02_json_parsed.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"segments": [
{
"startTime": 0.221,
"startTimeFormatted": "00:00:00.221",
"endTime": 3.09,
"endTimeFormatted": "00:00:03.090",
"speaker": "Speaker 1",
"body": "It's Friday, the 2nd of June 2023."
},
{
"startTime": 4.34,
"startTimeFormatted": "00:00:04.340",
"endTime": 8.729,
"endTimeFormatted": "00:00:08.729",
"speaker": "Speaker 2",
"body": "The last word in podcasting news. This is the Pod News"
}
]
}
2 changes: 2 additions & 0 deletions test/test_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ export enum TestFiles {
TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_05_05 = "podnews_weekly_review_2023-05-05.json",
TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_05_05_OUTPUT = "podnews_weekly_review_2023-05-05_json_parsed.json",
TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_05_05_COMBINE_SPEAKER_OUTPUT = "podnews_weekly_review_2023-05-05_combine_speaker_json_parsed.json",
TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_06_02 = "podnews_weekly_review_2023-06-02.json",
TRANSCRIPT_JSON_PODNEWS_WEEKLY_REVIEW_2023_06_02_OUTPUT = "podnews_weekly_review_2023-06-02_json_parsed.json",
TRANSCRIPT_JSON_BUZZCAST = "buzzcast.json",
TRANSCRIPT_JSON_BUZZCAST_OUTPUT = "buzzcast_json_parsed.json",
TRANSCRIPT_JSON_BUZZCAST_COMBINE_EQUAL_TIME_OUTPUT = "buzzcast_json_combine_equal_time_parsed.json",
Expand Down

0 comments on commit a8ce604

Please sign in to comment.