-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathblock_parser.dart
436 lines (358 loc) · 12.7 KB
/
block_parser.dart
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
// Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
/// The line contains only whitespace or is empty.
final _RE_EMPTY = const RegExp(@'^([ \t]*)$');

/// A series of `=` or `-` (on the next line) define setext-style headers.
final _RE_SETEXT = const RegExp(@'^((=+)|(-+))$');

/// Leading (and trailing) `#` define atx-style headers. Group 1 is the run of
/// one to six leading `#` characters and group 2 is the header text.
final _RE_HEADER = const RegExp(@'^(#{1,6})(.*?)#*$');

/// The line starts with `>` (after up to three leading spaces) with one
/// optional space after. Group 1 is the quoted text.
final _RE_BLOCKQUOTE = const RegExp(@'^[ ]{0,3}>[ ]?(.*)$');

/// A line indented four spaces or one tab. Used for code blocks and lists.
/// Group 1 is the line with that indentation stripped.
///
/// The four spaces are spelled `[ ]{4}` rather than as a literal run so the
/// required width stays explicit and cannot be silently lost if whitespace in
/// this file is ever collapsed.
final _RE_INDENT = const RegExp(@'^(?:[ ]{4}|\t)(.*)$');

/// Three or more hyphens, asterisks or underscores by themselves. Note that
/// a line like `----` is valid as both HR and SETEXT. In case of a tie,
/// SETEXT should win.
final _RE_HR = const RegExp(@'^[ ]{0,3}((-+[ ]{0,2}){3,}|' +
    @'(_+[ ]{0,2}){3,}|' +
    @'(\*+[ ]{0,2}){3,})$');

/// Really hacky way to detect block-level embedded HTML. Just looks for
/// "<somename".
final _RE_HTML = const RegExp(@'^<[ ]*\w+[ >]');

/// A line starting with one of these markers: `-`, `*`, `+`. May have up to
/// three leading spaces before the marker and any number of spaces or tabs
/// after. Group 1 is the item text.
final _RE_UL = const RegExp(@'^[ ]{0,3}[*+-][ \t]+(.*)$');

/// A line starting with a number like `123.`. May have up to three leading
/// spaces before the marker and any number of spaces or tabs after. Group 1 is
/// the item text.
final _RE_OL = const RegExp(@'^[ ]{0,3}\d+\.[ \t]+(.*)$');
/// Holds the cursor state used while walking a list of source lines and
/// grouping them into markdown blocks that are later parsed inline.
class BlockParser {
  /// The lines being parsed.
  final List<String> lines;

  /// The markdown document this parser is parsing.
  final Document document;

  /// Index of the current line.
  int pos;

  BlockParser(this.lines, this.document)
    : pos = 0;

  /// Gets the current line.
  String get current() {
    return lines[pos];
  }

  /// Gets the line after the current one or `null` if there is none.
  String get next() {
    final following = pos + 1;
    // Don't read past the end.
    return (following < lines.length) ? lines[following] : null;
  }

  /// Moves the cursor forward by one line.
  void advance() {
    pos += 1;
  }

  /// Whether the cursor has moved past the last line.
  bool get isDone() {
    return pos >= lines.length;
  }

  /// Gets whether or not the current line matches the given pattern. Always
  /// `false` once the parser is done.
  bool matches(RegExp regex) {
    return !isDone && (regex.firstMatch(current) != null);
  }

  /// Gets whether or not the line after the current one matches the given
  /// pattern. Always `false` when there is no such line.
  bool matchesNext(RegExp regex) {
    final upcoming = next;
    if (upcoming == null) return false;
    return regex.firstMatch(upcoming) != null;
  }
}
/// Base class for the block-level syntaxes. A syntax decides whether it can
/// handle the parser's current line and, if so, turns one or more lines into
/// a [Node].
class BlockSyntax {
  /// Cache backing [syntaxes]; filled in on first access.
  static List<BlockSyntax> _syntaxes;

  /// Gets the collection of built-in block parsers. To turn a series of lines
  /// into blocks, each of these will be tried in turn. Order matters here.
  static List<BlockSyntax> get syntaxes() {
    // Already built on an earlier call.
    if (_syntaxes != null) return _syntaxes;

    _syntaxes = [
      new EmptyBlockSyntax(),
      new BlockHtmlSyntax(),
      new SetextHeaderSyntax(),
      new HeaderSyntax(),
      new CodeBlockSyntax(),
      new BlockquoteSyntax(),
      new HorizontalRuleSyntax(),
      new UnorderedListSyntax(),
      new OrderedListSyntax(),
      new ParagraphSyntax()
    ];
    return _syntaxes;
  }

  /// Gets the regex used to identify the beginning of this block, if any.
  RegExp get pattern() {
    return null;
  }

  /// Whether a line this syntax can parse terminates the block before it.
  /// Consulted by [isAtBlockEnd].
  bool get canEndBlock() {
    return true;
  }

  /// Whether this syntax applies to [parser]'s current line. By default that
  /// means the line matches [pattern].
  bool canParse(BlockParser parser) {
    final match = pattern.firstMatch(parser.current);
    return match != null;
  }

  /// Consumes the lines of this block from [parser] and returns the resulting
  /// node.
  abstract Node parse(BlockParser parser);

  /// Consumes consecutive lines that match [pattern] and returns the first
  /// capture group of each one (for a blockquote, the line with its leading
  /// ">" stripped). Stops at the first non-matching line or at the end of the
  /// input.
  List<String> parseChildLines(BlockParser parser) {
    final collected = <String>[];
    var match;
    while (!parser.isDone &&
        (match = pattern.firstMatch(parser.current)) != null) {
      collected.add(match[1]);
      parser.advance();
    }
    return collected;
  }

  /// Gets whether or not [parser]'s current line should end the previous block.
  /// True at the end of the input, or when some built-in syntax both accepts
  /// the line and is allowed to end a block.
  static bool isAtBlockEnd(BlockParser parser) {
    return parser.isDone ||
        syntaxes.some((syntax) =>
            syntax.canParse(parser) && syntax.canEndBlock);
  }
}
/// Recognizes blank lines and consumes them without producing any output.
class EmptyBlockSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_EMPTY;
  }

  /// Skips the blank line. Returns `null` because nothing is emitted for it.
  Node parse(BlockParser parser) {
    parser.advance();
    return null;
  }
}
/// Parses setext-style headers: a line of text underlined by a line made up
/// entirely of `=` (giving `h1`) or entirely of `-` (giving `h2`).
class SetextHeaderSyntax extends BlockSyntax {
  /// Looks at the *next* line rather than the current one, since the
  /// underline that marks the header follows the header text.
  bool canParse(BlockParser parser) => parser.matchesNext(_RE_SETEXT);

  Node parse(BlockParser parser) {
    final underline = _RE_SETEXT.firstMatch(parser.next);
    // The first character of the underline selects the header level.
    final isTopLevel = underline[1][0] == '=';
    final contents = parser.document.parseInline(parser.current);

    // Consume both the text line and its underline.
    parser.advance();
    parser.advance();

    return new Element(isTopLevel ? 'h1' : 'h2', contents);
  }
}
/// Parses atx-style headers: `## Header ##`. The number of leading `#`
/// characters gives the header level.
class HeaderSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_HEADER;
  }

  Node parse(BlockParser parser) {
    final match = pattern.firstMatch(parser.current);
    parser.advance();

    final hashes = match[1];
    final text = match[2].trim();
    final contents = parser.document.parseInline(text);
    return new Element('h${hashes.length}', contents);
  }
}
/// Parses email-style blockquotes: `> quote`.
class BlockquoteSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_BLOCKQUOTE;
  }

  /// Strips the quote markers from the run of quoted lines, then parses what
  /// remains as a nested sequence of blocks.
  Node parse(BlockParser parser) {
    return new Element('blockquote',
        parser.document.parseLines(parseChildLines(parser)));
  }
}
/// Parses preformatted code blocks: runs of indented lines, as recognized by
/// `_RE_INDENT`.
class CodeBlockSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_INDENT;
  }

  /// Collects the indented lines (with the indent removed), escapes them, and
  /// wraps the result in `<pre><code>`.
  Node parse(BlockParser parser) {
    final codeLines = parseChildLines(parser);

    // The Markdown tests expect a trailing newline.
    codeLines.add('');

    final source = Strings.join(codeLines, '\n');
    final code = new Element.text('code', escapeHtml(source));
    return new Element('pre', [code]);
  }
}
/// Parses horizontal rules like `---`, `_ _ _`, `* * *`, etc.
class HorizontalRuleSyntax extends BlockSyntax {
  RegExp get pattern() => _RE_HR;

  /// Consumes the rule line and returns an empty `hr` element.
  ///
  /// The line's text is not needed to build the element, so it is not matched
  /// against [pattern] again here.
  Node parse(BlockParser parser) {
    parser.advance();
    return new Element.empty('hr');
  }
}
/// Parses inline HTML at the block level. This differs from other markdown
/// implementations in several ways:
///
/// 1.  This one is way way WAY simpler.
/// 2.  All HTML tags at the block level will be treated as blocks. If you
///     start a paragraph with `<em>`, it will not wrap it in a `<p>` for you.
///     As soon as it sees something like HTML, it stops mucking with it until
///     it hits the next block.
/// 3.  Absolutely no HTML parsing or validation is done. We're a markdown
///     parser not an HTML parser!
class BlockHtmlSyntax extends BlockSyntax {
  RegExp get pattern() {
    return _RE_HTML;
  }

  /// A line that looks like HTML does not terminate the block before it.
  bool get canEndBlock() {
    return false;
  }

  /// Passes lines through untouched, joined by newlines, up to (but not
  /// including) the next blank line or the end of the input.
  Node parse(BlockParser parser) {
    final htmlLines = [];

    // Eat until we hit a blank line.
    for (; !parser.isDone && !parser.matches(_RE_EMPTY); parser.advance()) {
      htmlLines.add(parser.current);
    }

    return new Text(Strings.join(htmlLines, '\n'));
  }
}
/// One entry of a list while it is being parsed.
class ListItem {
  /// The raw lines that make up this item, list marker already removed.
  final List<String> lines;

  /// Whether this item has to be parsed as block content regardless of how
  /// many lines it has. Starts out `false`.
  bool forceBlock;

  ListItem(this.lines)
    : forceBlock = false;
}
/// Base class for both ordered and unordered lists.
///
/// Subclasses supply the [pattern] that recognizes their list marker and the
/// [listTag] to wrap the items in.
class ListSyntax extends BlockSyntax {
  /// A list marker line does not terminate the block before it.
  bool get canEndBlock() => false;

  /// The tag name of the element that wraps the list items.
  abstract String get listTag();

  /// Consumes the lines of a list from [parser] and returns an element named
  /// [listTag] holding one `li` element per item.
  Node parse(BlockParser parser) {
    final items = <ListItem>[];
    var childLines = <String>[];

    // Closes the item being accumulated, if it has any lines.
    endItem() {
      if (childLines.length > 0) {
        items.add(new ListItem(childLines));
        childLines = <String>[];
      }
    }

    // Holds the result of the most recent successful tryMatch().
    var match;
    tryMatch(RegExp regex) {
      match = regex.firstMatch(parser.current);
      return match != null;
    }

    while (!parser.isDone) {
      if (tryMatch(_RE_EMPTY)) {
        // Add a blank line to the current list item.
        childLines.add('');
      } else if (tryMatch(_RE_UL) || tryMatch(_RE_OL)) {
        // End the current list item and start a new one.
        endItem();
        childLines.add(match[1]);
      } else if (tryMatch(_RE_INDENT)) {
        // Strip off indent and add to current item.
        childLines.add(match[1]);
      } else if (isAtBlockEnd(parser)) {
        // Done with the list.
        break;
      } else {
        // Anything else is paragraph text or other stuff that can be in a list
        // item. However, if the previous item is a blank line, this means we're
        // done with the list and are starting a new top-level paragraph.
        if ((childLines.length > 0) && (childLines.last() == '')) break;
        childLines.add(parser.current);
      }
      parser.advance();
    }

    endItem();

    // Markdown, because it hates us, specifies two kinds of list items. If you
    // have a list like:
    //
    //     * one
    //     * two
    //
    // Then it will insert the contents of the lines directly in the <li>, like:
    //
    //     <ul>
    //       <li>one</li>
    //       <li>two</li>
    //     </ul>
    //
    // If, however, there are blank lines between the items, each is wrapped in
    // paragraphs:
    //
    //     * one
    //
    //     * two
    //
    //     <ul>
    //       <li><p>one</p></li>
    //       <li><p>two</p></li>
    //     </ul>
    //
    // In other words, sometimes we parse the contents of a list item like a
    // block, and sometimes like an inline. The rules our parser implements are:
    //
    // - If it has more than one line, it's a block.
    // - If the line matches any block parser (BLOCKQUOTE, HEADER, HR, INDENT,
    //   UL, OL) it's a block. (This is for cases like "* > quote".)
    // - If there was a blank line between this item and the previous one, it's
    //   a block.
    // - If there was a blank line between this item and the next one, it's a
    //   block.
    // - Otherwise, parse it as an inline.

    // Remove any trailing empty lines and note which items are separated by
    // empty lines. Do this before seeing which items are single-line so that
    // trailing empty lines on the last item don't force it into being a block.
    for (int i = 0; i < items.length; i++) {
      // Stops at j == 1, so an item's first line is never removed.
      for (int j = items[i].lines.length - 1; j > 0; j--) {
        if (_RE_EMPTY.firstMatch(items[i].lines[j]) != null) {
          // Found an empty line. Item and one after it are blocks.
          if (i < items.length - 1) {
            items[i].forceBlock = true;
            items[i + 1].forceBlock = true;
          }
          items[i].lines.removeLast();
        } else {
          break;
        }
      }
    }

    // Patterns that make a single-line item count as block content. The set is
    // the same for every item, so it is built once, outside the loop.
    final blocksInList = const [
      _RE_BLOCKQUOTE,
      _RE_HEADER,
      _RE_HR,
      _RE_INDENT,
      _RE_UL,
      _RE_OL
    ];

    // Convert the list items to Nodes.
    final itemNodes = <Node>[];
    for (final item in items) {
      bool blockItem = item.forceBlock || (item.lines.length > 1);

      // See if it matches some block parser.
      if (!blockItem) {
        for (final blockPattern in blocksInList) {
          if (blockPattern.firstMatch(item.lines[0]) != null) {
            blockItem = true;
            break;
          }
        }
      }

      // Parse the item as a block or inline.
      if (blockItem) {
        // Block list item.
        final children = parser.document.parseLines(item.lines);
        itemNodes.add(new Element('li', children));
      } else {
        // Raw list item.
        final contents = parser.document.parseInline(item.lines[0]);
        itemNodes.add(new Element('li', contents));
      }
    }

    return new Element(listTag, itemNodes);
  }
}
/// Parses unordered lists, producing a `ul` element.
class UnorderedListSyntax extends ListSyntax {
  /// Items start with a bullet marker, as recognized by `_RE_UL`.
  RegExp get pattern() {
    return _RE_UL;
  }

  String get listTag() {
    return 'ul';
  }
}
/// Parses ordered lists, producing an `ol` element.
class OrderedListSyntax extends ListSyntax {
  /// Items start with a numeric marker, as recognized by `_RE_OL`.
  RegExp get pattern() {
    return _RE_OL;
  }

  String get listTag() {
    return 'ol';
  }
}
/// Parses paragraphs of regular text. This is the catch-all syntax: it accepts
/// any line.
class ParagraphSyntax extends BlockSyntax {
  /// Plain text does not terminate the block before it.
  bool get canEndBlock() {
    return false;
  }

  bool canParse(BlockParser parser) {
    return true;
  }

  /// Gathers lines until one of them ends the block, joins them with newlines,
  /// and parses the result inline into a `p` element.
  Node parse(BlockParser parser) {
    final textLines = [];

    // Eat until we hit something that ends a paragraph.
    for (; !isAtBlockEnd(parser); parser.advance()) {
      textLines.add(parser.current);
    }

    final text = Strings.join(textLines, '\n');
    return new Element('p', parser.document.parseInline(text));
  }
}