-
Notifications
You must be signed in to change notification settings - Fork 0
/
descriptionParser.py
285 lines (254 loc) · 14.1 KB
/
descriptionParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# -*-coding:Latin-1 -*
import re
import numpy as np
from logPrint import logPrint
class descriptionParser():
    """Detect the chapter list (times and titles) written in a description.

    The description is given as a list of lines. The parser looks for the
    line holding the chapter at time 0:00, deduces from it how each chapter
    line is laid out (time before or after the title), then reads the
    following lines to collect every chapter.

    Typical use: build the object, call parse() and, if it returned True,
    read the result with getChaptersMatrix().
    """

    # Constants
    # Maximum number of chapters read by _parseChapterList
    MAX_SIZE_OF_MATRIX = 100
    # Number of consecutive non-matching lines after which the parsing stops
    MAX_ATTEMPTS_NB_TO_PARSE_TIME_AND_TITLE = 2
    # A time is a run of figures and colons containing at least one colon
    _TIME_REGEX = re.compile(r'[0-9:]+:[0-9:]+')
    # Same pattern with a capturing group, so that split() keeps the time
    _TIME_SPLIT_REGEX = re.compile(r'([0-9:]+:[0-9:]+)')

    def __init__(self, descriptionList):
        """Store the description to parse.

        descriptionList: list of strings, one per line of the description
        (list is the only accepted type).
        """
        self.descriptionLinesList = descriptionList
        # Private working matrix: column 0 is the time as a string, column 1
        # the title. Only the first MAX_SIZE_OF_MATRIX rows are ever written.
        self.chaptersMatrix = np.empty((500, 2), dtype=object)
        # Number of valid rows in chaptersMatrix
        self.chaptersMatrixSize = 0
        # Defined from the start so that getChaptersMatrix() returns an empty
        # matrix instead of raising when parse() was not called or failed
        self.publicChaptersMatrix = np.empty((0, 2), dtype=object)
        # Index of the line holding 0:00, set by _findLineZero
        self.lineNbZeroZero = None
        # Layout of the chapter lines, see _analyzeChaptersPattern
        self.patternId = -1

    def parse(self):
        """Parse the description to detect the time and title of each chapter.

        Return True if chapters have been found and their times could be
        converted and checked, False otherwise.
        """
        returnStatus = self._findLineZero()
        if returnStatus:
            # If 0:00 has been found, parse the rest of the chapters
            self._parseChapterList()
            returnStatus = self._buildPublicChaptersMatrix()
        return returnStatus

    def getChaptersMatrix(self):
        """Return the matrix of chapters built by parse().

        Column 0 holds the time in seconds (integer), column 1 the title.
        The matrix has no row if parse() has not succeeded yet.
        """
        return self.publicChaptersMatrix

    def _checkIfCharIsBetweenOneAndNine(self, char):
        """Return True if char is 1, 2, 3, 4, 5, 6, 7, 8 or 9."""
        return '1' <= char <= '9'

    def _checkIfDetectedZeroIsReallyZero(self, lineString, idxOfLine):
        """Double check that a detected "0:00" is really the time zero.

        Starting at idxOfLine and going towards the beginning of the line,
        the characters are read as long as they are '0' or ':'.
        Return False if a figure from 1 to 9 is met (the time is bigger than
        0:00, e.g. 10:00 or 20:00). Return True if another character or the
        beginning of the line is reached first (e.g. 00:00 or 0:00:00).
        """
        for idxChar in range(idxOfLine, -1, -1):
            char = lineString[idxChar]
            if self._checkIfCharIsBetweenOneAndNine(char):
                # No need to go before, the time is bigger than 0:00
                return False
            if char != '0' and char != ':':
                # All characters checked until here were 0 or :
                # The one read here is not part of the time
                return True
        # Beginning of the line reached with only 0 and : read
        return True

    def _analyzeChaptersPattern(self, splittedLineList, idxOfZeroInSplittedLine):
        """Deduce the layout of the chapter lines from the line holding 0:00.

        splittedLineList: the line split around the times.
        idxOfZeroInSplittedLine: index of the time in splittedLineList.

        Patterns description:
        -1: not initialized/incorrect
         0: "<something_before_or_not><time><title_with_delim_charac>"
         1: "<title_with_delim_charac><time>"
        """
        if idxOfZeroInSplittedLine == 0:
            # The line begins with "0:00"
            return 0
        if idxOfZeroInSplittedLine != 1:
            # Unknown pattern
            return -1
        # We check if the word before the one detected is part of the time
        wordBefore = splittedLineList[0]
        if all((char == ':') or (char == '0') for char in wordBefore):
            # The line begins with something like "0:00:00", or less complex
            return 0
        if len(wordBefore) < 3:
            # The word before is too short to contain the title
            return 0
        # The word before may be the title
        return 1

    def _findLineZero(self):
        """Look for the line containing the chapter at time 0:00.

        On success the index of the line is stored in self.lineNbZeroZero.
        Return True if the line has been found, False otherwise.
        """
        isLineAtTimeZero = False
        logPrint.printLog("Parsing the description field")
        for lineNb, line in enumerate(self.descriptionLinesList):
            idxReturnedFromFind = line.find("0:00")
            if idxReturnedFromFind == -1:
                continue
            if idxReturnedFromFind > 0:
                # Consolidate: check if it is really 00:00 and not for
                # example 10:00
                isLineAtTimeZero = self._checkIfDetectedZeroIsReallyZero(
                    line, idxReturnedFromFind)
            else:
                # 0:00 at index 0 is for sure a time zero
                isLineAtTimeZero = True
            if isLineAtTimeZero:
                self.lineNbZeroZero = lineNb
                # The first line at time zero wins, stop the search
                break
        if not isLineAtTimeZero:
            logPrint.printError("0:00 not found.")
        return isLineAtTimeZero

    def _parseChapterList(self):
        """Read the chapters from the line holding 0:00 and store them.

        The times (as strings) and titles are written in self.chaptersMatrix
        and the number of chapters in self.chaptersMatrixSize.
        Raise SystemExit(-4) if the layout of the chapter lines is unknown.
        """
        line = self.descriptionLinesList[self.lineNbZeroZero]
        splittedLineList = self._TIME_SPLIT_REGEX.split(line)
        # Index of the last time found in the line holding 0:00
        idxOfZeroInSplittedLine = -1
        for idxSplitted, splittedWord in enumerate(splittedLineList):
            if self._TIME_REGEX.search(splittedWord):
                idxOfZeroInSplittedLine = idxSplitted
        # Analyze pattern
        self.patternId = self._analyzeChaptersPattern(
            splittedLineList, idxOfZeroInSplittedLine)
        logPrint.printLog("Pattern detected: "+str(self.patternId))
        logPrint.printLog("While parsing the chapters, a maximum of "+str(
            self.MAX_ATTEMPTS_NB_TO_PARSE_TIME_AND_TITLE)+" not matching line(s) is accepted.")
        # The pattern is the same for every line and the line holding 0:00
        # always matches, so an unknown pattern can be rejected before the loop
        if self.patternId != 0 and self.patternId != 1:
            logPrint.printError(
                "Chapters pattern not found. Exiting.")
            raise SystemExit(-4)

        idxMatrix = 0
        # To avoid false end-of-chapters due to problems of formatting in the
        # description, another attempt is given to each non-match
        numberOfRemainingAttempts = self.MAX_ATTEMPTS_NB_TO_PARSE_TIME_AND_TITLE
        # Number of non-matching lines skipped so far
        offsetBecauseOfFalseErrors = 0
        numberOfLines = len(self.descriptionLinesList)
        # Max length of the matrix
        while idxMatrix < self.MAX_SIZE_OF_MATRIX:
            lineNb = self.lineNbZeroZero + idxMatrix + offsetBecauseOfFalseErrors
            if lineNb >= numberOfLines or numberOfRemainingAttempts <= 0:
                # End of the description reached, or no more time is detected
                # and the formatting error margin is consumed
                logPrint.printLog("No more matching line found, line " +
                                  str(lineNb)+", the chapter list parsing is over.")
                break
            line = self.descriptionLinesList[lineNb]
            if not self._TIME_REGEX.search(line):
                # Second chance
                logPrint.printLog("Not matching line found, line " +
                                  str(lineNb)+", another chance is given.")
                numberOfRemainingAttempts = numberOfRemainingAttempts - 1
                offsetBecauseOfFalseErrors = offsetBecauseOfFalseErrors + 1
                continue
            # Reset for the next formatting error found
            numberOfRemainingAttempts = self.MAX_ATTEMPTS_NB_TO_PARSE_TIME_AND_TITLE
            # FIXME (original note) Does not work for "[2:20] the past inside the present"
            splittedLine = self._TIME_SPLIT_REGEX.split(line)
            if self.patternId == 0:
                # The time may not be the first word: we take the hypothesis
                # that the shift found on the 0:00 line is the same for
                # every line
                time = splittedLine[0 + idxOfZeroInSplittedLine]
                titleAndDelim = splittedLine[1 + idxOfZeroInSplittedLine]
            else:
                # Pattern 1: the title comes first, then the time
                time = splittedLine[1]
                titleAndDelim = splittedLine[0]
            logPrint.printLog("Matching line found, line " +
                              str(lineNb)+", time "+time)
            # The title is stripped to not store useless characters in the
            # beginning and in the end
            title = titleAndDelim.strip(" -_[]#:")
            # Strip remaining parenthesis around timestamp e.g.
            # "(25:32) title" without cutting the included parenthesis of one
            # title e.g. "(25:32) title (featuring Pitbull)"
            title = title.lstrip(") ")
            title = title.rstrip("( ")
            self.chaptersMatrix[idxMatrix, 0] = time
            self.chaptersMatrix[idxMatrix, 1] = title
            # Shift the index only if we have found something to avoid holes
            # because of formatting errors
            idxMatrix = idxMatrix+1
        # Set on every exit path, including when the matrix is full
        self.chaptersMatrixSize = idxMatrix

        logPrint.printDebug("chaptersMatrix, " +
                            str(self.chaptersMatrixSize)+" lines: ")
        for idxRow in range(self.chaptersMatrixSize):
            logPrint.printDebug("|" +
                                self.chaptersMatrix[idxRow, 0]+"|"+self.chaptersMatrix[idxRow, 1]+"|")

    def _stringTimeToIntegerTimeInSeconds(self, stringTime):
        """Convert a string time ("1:12") into a number of seconds (72).

        Raise SystemExit(-6) if stringTime holds a character which is
        neither a figure nor a colon.
        """
        integerTime = 0
        # Multiplier if we are in seconds, minutes or hours
        unitMultiplier = 1
        # Multiplier according to the position of the figures in the number
        decimalMultiplier = 1
        # Browse the string in reverse order, from the seconds
        for char in reversed(stringTime):
            if "0" <= char <= "9":
                integerTime = integerTime + \
                    (unitMultiplier * decimalMultiplier * int(char))
                # For the next figure of the same unit, add a power of 10
                decimalMultiplier = 10*decimalMultiplier
            elif char == ":":
                # Reset decimalMultiplier for next figure
                decimalMultiplier = 1
                # Seconds to minutes and minutes to hours have the same
                # multiplier
                unitMultiplier = 60*unitMultiplier
            else:
                logPrint.printError(
                    char + " is not a figure or a colon, it is not possible to convert the time "+stringTime+" in seconds. Exiting.")
                raise SystemExit(-6)
        return integerTime

    def _buildPublicChaptersMatrix(self):
        """Build the public matrix from the private chaptersMatrix.

        The public matrix has one row per chapter found; the times of its
        first column are integers (seconds) instead of strings.
        Return False if a time is smaller than the time of the previous
        chapter, True otherwise.
        """
        returnStatus = True
        self.publicChaptersMatrix = np.empty(
            (self.chaptersMatrixSize, 2), dtype=object)
        previousTimeInSeconds = -1
        for idxMatrix in range(self.chaptersMatrixSize):
            stringTime = self.chaptersMatrix[idxMatrix, 0]
            timeInSeconds = self._stringTimeToIntegerTimeInSeconds(stringTime)
            if timeInSeconds < previousTimeInSeconds:
                # Something went wrong and the matrix is not in crescent order
                logPrint.printError(
                    stringTime + " is smaller than the previous time, "+self.chaptersMatrix[idxMatrix-1, 0]+", the matrix is not ordered in a crescent order.")
                returnStatus = False
            # Remember this time so that the next chapter is compared to it
            previousTimeInSeconds = timeInSeconds
            self.publicChaptersMatrix[idxMatrix, 0] = timeInSeconds
            self.publicChaptersMatrix[idxMatrix, 1] = \
                self.chaptersMatrix[idxMatrix, 1]
        return returnStatus