-
Notifications
You must be signed in to change notification settings - Fork 133
/
Copy pathN3Lexer.js
359 lines (322 loc) · 15 KB
/
N3Lexer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
// **N3Lexer** tokenizes N3 documents.
// Shorthand for building strings from UTF-16 code units.
var fromCharCode = String.fromCharCode;
// `immediately(task)` schedules `task` asynchronously: the host's `setImmediate` is used
// when it exists, and a zero-delay timer stands in for it otherwise.
var immediately;
if (typeof setImmediate === 'function')
  immediately = setImmediate;
else
  immediately = function setImmediate(task) { setTimeout(task, 0); };
// Pattern that finds every backslash escape in an N3 string. Group 1 captures the four hex
// digits of `\uXXXX`, group 2 the eight hex digits of `\UXXXXXXXX`, and group 3 the character
// of a single-character escape. A `\u` or `\U` without enough hex digits matches the third
// alternative and captures nothing, which lets the caller flag it as invalid.
var escapeSequence = /\\u([a-fA-F0-9]{4})|\\U([a-fA-F0-9]{8})|\\[uU]|\\(.)/g;
// Lookup table from the character that follows a backslash to the character it denotes.
var escapeReplacements = {
  // Backslash and quotes
  '\\': '\\',
  '\'': '\'',
  '"': '"',
  // Control characters
  n: '\n',
  r: '\r',
  t: '\t',
  f: '\f',
  b: '\b',
  // Punctuation that stands for itself
  '_': '_', '~': '~', '.': '.', '-': '-', '!': '!', '$': '$', '&': '&',
  '(': '(', ')': ')', '*': '*', '+': '+', ',': ',', ';': ';', '=': '=',
  '/': '/', '?': '?', '#': '#', '@': '@', '%': '%'
};
// Characters that make an IRI invalid; matching anywhere in the string is enough.
var illegalIriChars = /[\x00-\x20<>\\"\{\}\|\^\`]/;
// ## Constructor
// Creates a lexer; may be called with or without `new`.
// `options.lineMode` (optional) restricts the lexer to the token types allowed below,
// turning any other token into a syntax error.
function N3Lexer(options) {
  if (!(this instanceof N3Lexer))
    return new N3Lexer(options);
  // In line mode (N-Triples or N-Quads), only simple features may be parsed
  if (options && options.lineMode) {
    // Don't tokenize special literals: `/$0^/` can never match any input
    this._tripleQuotedString = this._number = this._boolean = /$0^/;
    // Swap the tokenize method for a restricted version
    var self = this;
    this._tokenize = this.tokenize;
    this.tokenize = function (input, callback) {
      // Support the `tokenize(callback)` signature: without this, the wrapped tokenizer
      // would talk to the filter below while the caller's callback was never reached.
      if (typeof input === 'function' && !callback) {
        callback = input;
        input = null;
      }
      this._tokenize(input, function (error, token) {
        if (!error && /^(?:IRI|prefixed|literal|langcode|type|\.|eof)$/.test(token.type)) {
          callback && callback(error, token);
        }
        else if (callback) {
          var report = callback;
          // An error from the wrapped tokenizer is passed on unchanged. A token that is
          // not allowed in line mode becomes a syntax error, after which nothing more
          // is reported to the caller.
          if (!error) {
            error = self._syntaxError(token.type);
            callback = null;
          }
          report(error);
        }
      });
    };
  }
}
N3Lexer.prototype = {
  // ## Regular expressions
  // It's slightly faster to have these as properties than as in-scope variables.
  // Every pattern is anchored at the start of the remaining input. Most end in a lookahead,
  // so a token is only recognized once the character that follows it is available.

  _iri: /^<((?:[^>\\]|\\[uU])+)>/, // IRI with escape sequences; needs sanity check after unescaping
  _unescapedIri: /^<([^\x00-\x20<>\\"\{\}\|\^\`]*)>/, // IRI without escape sequences; no unescaping
  _unescapedString: /^"[^"\\]+"(?=[^"\\])/, // non-empty string without escape sequences
  _singleQuotedString: /^"[^"\\]*(?:\\.[^"\\]*)*"(?=[^"\\])|^'[^'\\]*(?:\\.[^'\\]*)*'(?=[^'\\])/,
  // Groups 1 and 2 capture the contents together with ONE pair of surrounding quotes.
  _tripleQuotedString: /^""("[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*")""|^''('[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*')''/,
  // Language tag after a literal; group 1 is the tag without the `@`.
  _langcode: /^@([a-z]+(?:-[a-z0-9]+)*)(?=[^a-z0-9\-])/i,
  // Prefix declaration label: optional name (group 1) followed by a colon.
  _prefix: /^((?:[A-Za-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:(?=[#\s<])/,
  // Prefixed name: optional prefix (group 1), a colon, and a possibly empty local name (group 2).
  _prefixed: /^((?:[A-Za-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?=\.?[,;\s#()\[\]\{\}"'<])/,
  // Blank node label: `_:` followed by the label (group 1).
  _blank: /^_:((?:[0-9A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)(?=\.?[,;:\s#()\[\]\{\}"'<])/,
  // Numeric literal; group 1 is only set when an exponent is present.
  _number: /^[\-+]?(?:\d+\.?\d*([eE](?:[\-\+])?\d+)|\d*\.?\d+)(?=[.,;:\s#()\[\]\{\}"'<])/,
  _boolean: /^(?:true|false)(?=[.,;:\s#()\[\]\{\}"'<])/,
  // `@`-keyword such as `@prefix`; the whole match is used as the token type.
  _keyword: /^@[a-z]+(?=[\s#<:])/,
  // SPARQL-style keyword, matched case-insensitively.
  _sparqlKeyword: /^(?:PREFIX|BASE|GRAPH)(?=[\s#<:])/i,
  // The `a` abbreviation for `rdf:type`.
  _shortPredicates: /^a(?=\s+|<)/,
  // Rest of a line (optional whitespace and comment) up to and including the line break.
  _newline: /^[ \t]*(?:#[^\n\r]*)?(?:\r\n|\n|\r)[ \t]*/,
  _whitespace: /^[ \t]+/,
  // Nothing left to tokenize: empty input or only a trailing comment.
  _endOfFile: /^(?:#[^\n\r]*)?$/,

  // ## Private methods

  // ### `_tokenizeToEnd` tokenizes as far as possible, emitting tokens through the callback.
  // Reads from `this._input` and calls `callback(null, token)` for every token found, where
  // `token` is `{ line, type, value, prefix }`, or `callback(error)` on a syntax error.
  // `inputFinished` tells whether more input may still arrive: if not, the end of the input
  // is final, an `eof` token is emitted, and an unrecognizable remainder is an error.
  // Returns the input that is left unparsed (also stored in `this._input`); this is `null`
  // once `eof` has been emitted. Returns `undefined` after a syntax error.
  _tokenizeToEnd: function (callback, inputFinished) {
    // Continue parsing as far as possible; the loop will return eventually.
    var input = this._input;
    while (true) {
      // Count and skip whitespace lines.
      var whiteSpaceMatch;
      while (whiteSpaceMatch = this._newline.exec(input))
        input = input.slice(whiteSpaceMatch[0].length), this._line++;
      // Skip whitespace on current line.
      if (whiteSpaceMatch = this._whitespace.exec(input))
        input = input.slice(whiteSpaceMatch[0].length);

      // Stop for now if we're at the end.
      if (this._endOfFile.test(input)) {
        // If the input is finished, emit EOF.
        // (`input` is reset to `null` here, so `this._input` becomes `null` below.)
        if (inputFinished)
          callback(input = null, { line: this._line, type: 'eof', value: '', prefix: '' });
        return this._input = input;
      }

      // Look for specific token types based on the first character.
      // `matchLength` is only set for tokens found without a regular expression;
      // all other tokens are consumed by the length of `match[0]`.
      var line = this._line, type = '', value = '', prefix = '',
          firstChar = input[0], match = null, matchLength = 0, unescaped, inconclusive = false;
      switch (firstChar) {
      case '^':
        // Try to match a type.
        if (input.length === 1) break;
        else if (input[1] !== '^') return reportSyntaxError(this);
        this._prevTokenType = '^';
        // Move to type IRI or prefixed name.
        input = input.slice(2);
        if (input[0] !== '<') {
          inconclusive = true;
          break;
        }
      // Fall through in case the type is an IRI.

      case '<':
        // Try to find a full IRI without escape sequences.
        if (match = this._unescapedIri.exec(input))
          type = 'IRI', value = match[1];
        // Try to find a full IRI with escape sequences.
        else if (match = this._iri.exec(input)) {
          unescaped = this._unescape(match[1]);
          if (unescaped === null || illegalIriChars.test(unescaped))
            return reportSyntaxError(this);
          type = 'IRI', value = unescaped;
        }
        break;

      case '_':
        // Try to find a blank node. Since it can contain (but not end with) a dot,
        // we always need a non-dot character before deciding it is a prefixed name.
        // Therefore, try inserting a space if we're at the end of the input.
        if ((match = this._blank.exec(input)) ||
            inputFinished && (match = this._blank.exec(input + ' ')))
          type = 'prefixed', prefix = '_', value = match[1];
        break;

      case '"':
      case "'":
        // Try to find a non-empty double-quoted literal without escape sequences.
        if (match = this._unescapedString.exec(input))
          type = 'literal', value = match[0];
        // Try to find any other literal wrapped in a pair of single or double quotes.
        else if (match = this._singleQuotedString.exec(input)) {
          unescaped = this._unescape(match[0]);
          if (unescaped === null)
            return reportSyntaxError(this);
          // Literal values are always delimited by double quotes.
          type = 'literal', value = unescaped.replace(/^'|'$/g, '"');
        }
        // Try to find a literal wrapped in three pairs of single or double quotes.
        else if (match = this._tripleQuotedString.exec(input)) {
          unescaped = match[1] || match[2];
          // Count the newlines and advance line counter.
          this._line += unescaped.split(/\r\n|\r|\n/).length - 1;
          unescaped = this._unescape(unescaped);
          if (unescaped === null)
            return reportSyntaxError(this);
          type = 'literal', value = unescaped.replace(/^'|'$/g, '"');
        }
        break;

      case '@':
        // Try to find a language code.
        if (this._prevTokenType === 'literal' && (match = this._langcode.exec(input)))
          type = 'langcode', value = match[1];
        // Try to find a keyword.
        else if (match = this._keyword.exec(input))
          type = match[0];
        break;

      case '.':
        // Try to find a dot as punctuation.
        if (input.length === 1 ? inputFinished : (input[1] < '0' || input[1] > '9')) {
          type = '.';
          matchLength = 1;
          break;
        }
      // Fall through to numerical case (could be a decimal dot).

      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':
      case '+':
      case '-':
        // Try to find a number.
        if (match = this._number.exec(input)) {
          type = 'literal';
          // An exponent makes it a double; digits only, an integer; anything else, a decimal.
          value = '"' + match[0] + '"^^http://www.w3.org/2001/XMLSchema#' +
                  (match[1] ? 'double' : (/^[+\-]?\d+$/.test(match[0]) ? 'integer' : 'decimal'));
        }
        break;

      case 'B':
      case 'b':
      case 'p':
      case 'P':
      case 'G':
      case 'g':
        // Try to find a SPARQL-style keyword.
        if (match = this._sparqlKeyword.exec(input))
          type = match[0].toUpperCase();
        else
          inconclusive = true;
        break;

      case 'f':
      case 't':
        // Try to match a boolean.
        if (match = this._boolean.exec(input))
          type = 'literal', value = '"' + match[0] + '"^^http://www.w3.org/2001/XMLSchema#boolean';
        else
          inconclusive = true;
        break;

      case 'a':
        // Try to find an abbreviated predicate.
        if (match = this._shortPredicates.exec(input))
          type = 'abbreviation', value = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
        else
          inconclusive = true;
        break;

      case ',':
      case ';':
      case '[':
      case ']':
      case '(':
      case ')':
      case '{':
      case '}':
        // The next token is punctuation
        matchLength = 1;
        type = firstChar;
        break;

      default:
        inconclusive = true;
      }

      // Some first characters do not allow an immediate decision, so inspect more.
      if (inconclusive) {
        // Try to find a prefix.
        if ((this._prevTokenType === '@prefix' || this._prevTokenType === 'PREFIX') &&
            (match = this._prefix.exec(input)))
          type = 'prefix', value = match[1] || '';
        // Try to find a prefixed name. Since it can contain (but not end with) a dot,
        // we always need a non-dot character before deciding it is a prefixed name.
        // Therefore, try inserting a space if we're at the end of the input.
        else if ((match = this._prefixed.exec(input)) ||
                 inputFinished && (match = this._prefixed.exec(input + ' ')))
          type = 'prefixed', prefix = match[1] || '', value = this._unescape(match[2]);
      }

      // A type token is special: it can only be emitted after an IRI or prefixed name is read.
      if (this._prevTokenType === '^')
        type = (type === 'IRI' || type === 'prefixed') ? 'type' : '';

      // What if nothing of the above was found?
      if (!type) {
        // We could be in streaming mode, and then we just wait for more input to arrive.
        // Otherwise, a syntax error has occurred in the input.
        // One exception: error on an unaccounted linebreak (= not inside a triple-quoted literal).
        if (inputFinished || (!/^'''|^"""/.test(input) && /\n|\r/.test(input)))
          return reportSyntaxError(this);
        else
          return this._input = input;
      }

      // Emit the parsed token.
      callback(null, { line: line, type: type, value: value, prefix: prefix });
      this._prevTokenType = type;

      // Advance to next part to tokenize.
      input = input.slice(matchLength || match[0].length);
    }

    // Signals the syntax error through the callback,
    // naming the run of non-whitespace characters at which tokenizing stopped.
    function reportSyntaxError(self) { callback(self._syntaxError(/^\S*/.exec(input)[0])); }
  },

  // ### `_unescape` replaces N3 escape codes by their corresponding characters.
  // Returns the unescaped string, or `null` if `item` contains an invalid escape sequence:
  // an incomplete `\u`/`\U`, a `\U` value above U+10FFFF, or an unknown escaped character.
  _unescape: function (item) {
    try {
      return item.replace(escapeSequence, function (sequence, unicode4, unicode8, escapedChar) {
        var charCode;
        if (unicode4) {
          charCode = parseInt(unicode4, 16);
          if (isNaN(charCode)) throw new Error(); // can never happen (the regex only captures hex digits)
          return fromCharCode(charCode);
        }
        else if (unicode8) {
          charCode = parseInt(unicode8, 16);
          if (isNaN(charCode)) throw new Error(); // can never happen (the regex only captures hex digits)
          // Eight hex digits can encode values beyond the last Unicode code point;
          // those have no surrogate pair and would otherwise yield a malformed string.
          if (charCode > 0x10FFFF) throw new Error();
          if (charCode <= 0xFFFF) return fromCharCode(charCode);
          // Encode the supplementary code point as a UTF-16 surrogate pair.
          charCode -= 0x10000;
          return fromCharCode(0xD800 + (charCode >> 10), 0xDC00 + (charCode & 0x3FF));
        }
        else {
          // `escapedChar` is undefined for an incomplete `\u`/`\U`, which is rejected here as well.
          var replacement = escapeReplacements[escapedChar];
          if (!replacement)
            throw new Error();
          return replacement;
        }
      });
    }
    // Any failure above means the item cannot be unescaped.
    catch (error) { return null; }
  },

  // ### `_syntaxError` creates a syntax error for the given issue
  // Also resets `this._input` to `null`, which makes `addChunk` and `end` ignore further input.
  _syntaxError: function (issue) {
    this._input = null;
    return new Error('Syntax error: unexpected "' + issue + '" on line ' + this._line + '.');
  },

  // ## Public methods

  // ### `tokenize` starts the transformation of an N3 document into an array of tokens.
  // The input can be a string or a stream.
  // - `tokenize(string, callback)` tokenizes the whole string asynchronously.
  // - `tokenize(stream, callback)` tokenizes the stream's `data` events until `end`.
  // - `tokenize(callback)` or `tokenize(null, callback)` installs `addChunk(data)` and `end()`
  //   on the lexer, through which the caller supplies the input.
  // `callback(error, token)` is called for every token and for the first syntax error.
  tokenize: function (input, callback) {
    var self = this;
    this._line = 1;

    // If the input is a string, continuously emit tokens through the callback until the end.
    if (typeof input === 'string') {
      this._input = input;
      immediately(function () { self._tokenizeToEnd(callback, true); });
    }
    // Otherwise, the input will be streamed.
    else {
      this._input = '';

      // If no input was given, it will be streamed through `addChunk` and ended with `end`
      if (!input || typeof input === 'function') {
        this.addChunk = addChunk;
        this.end = end;
        if (!callback)
          callback = input;
      }
      // Otherwise, the input itself must be a stream
      else {
        if (typeof input.setEncoding === 'function')
          input.setEncoding('utf8');
        input.on('data', addChunk);
        input.on('end', end);
      }
    }

    // Adds the data chunk to the buffer and parses as far as possible
    // (`_input` is `null` after a syntax error or EOF; further chunks are then ignored)
    function addChunk(data) {
      if (self._input !== null) {
        self._input += data;
        self._tokenizeToEnd(callback, false);
      }
    }

    // Parses until the end
    function end() {
      if (self._input !== null)
        self._tokenizeToEnd(callback, true);
    }
  },
};
// ## Exports
// Export the `N3Lexer` constructor itself as the module's value (CommonJS),
// so that requiring this file yields the constructor directly.
module.exports = N3Lexer;