-
Notifications
You must be signed in to change notification settings - Fork 29.6k
/
jsregexp.cc
6879 lines (6124 loc) Β· 246 KB
/
jsregexp.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/regexp/jsregexp.h"
#include <memory>
#include "src/base/platform/platform.h"
#include "src/compilation-cache.h"
#include "src/elements.h"
#include "src/execution.h"
#include "src/factory.h"
#include "src/isolate-inl.h"
#include "src/messages.h"
#include "src/ostreams.h"
#include "src/regexp/interpreter-irregexp.h"
#include "src/regexp/jsregexp-inl.h"
#include "src/regexp/regexp-macro-assembler-irregexp.h"
#include "src/regexp/regexp-macro-assembler-tracer.h"
#include "src/regexp/regexp-macro-assembler.h"
#include "src/regexp/regexp-parser.h"
#include "src/regexp/regexp-stack.h"
#include "src/runtime/runtime.h"
#include "src/splay-tree-inl.h"
#include "src/string-search.h"
#include "src/unicode-decoder.h"
#ifdef V8_I18N_SUPPORT
#include "unicode/uset.h"
#include "unicode/utypes.h"
#endif // V8_I18N_SUPPORT
#ifndef V8_INTERPRETED_REGEXP
#if V8_TARGET_ARCH_IA32
#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
#elif V8_TARGET_ARCH_X64
#include "src/regexp/x64/regexp-macro-assembler-x64.h"
#elif V8_TARGET_ARCH_ARM64
#include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
#elif V8_TARGET_ARCH_ARM
#include "src/regexp/arm/regexp-macro-assembler-arm.h"
#elif V8_TARGET_ARCH_PPC
#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
#elif V8_TARGET_ARCH_S390
#include "src/regexp/s390/regexp-macro-assembler-s390.h"
#elif V8_TARGET_ARCH_MIPS
#include "src/regexp/mips/regexp-macro-assembler-mips.h"
#elif V8_TARGET_ARCH_MIPS64
#include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
#elif V8_TARGET_ARCH_X87
#include "src/regexp/x87/regexp-macro-assembler-x87.h"
#else
#error Unsupported target architecture.
#endif
#endif
namespace v8 {
namespace internal {
// Throws a SyntaxError built from MessageTemplate::kMalformedRegExp, using
// |pattern| and |error_text| as the message arguments. The isolate is taken
// from |re|. The result is marked MUST_USE_RESULT, so callers have to consume
// it (or explicitly discard it with USE()).
MUST_USE_RESULT
static inline MaybeHandle<Object> ThrowRegExpException(
    Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
  Isolate* isolate = re->GetIsolate();
  THROW_NEW_ERROR(
      isolate,
      NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text),
      Object);
}
// Convenience overload: throws the malformed-regexp error for |re| using the
// pattern stored on |re| itself. The MaybeHandle produced by the three-argument
// overload is deliberately discarded.
inline void ThrowRegExpException(Handle<JSRegExp> re,
                                 Handle<String> error_text) {
  Handle<String> source_pattern(re->Pattern());
  USE(ThrowRegExpException(re, source_pattern, error_text));
}
// Refines |containment| with the knowledge of whether |new_range| lies inside
// or outside the set described by |ranges|.
//
// |ranges| is a list of boundaries: the scan starts "outside" at 0 and toggles
// between outside and inside at every boundary. Its length must be odd and
// its last entry must be String::kMaxCodePoint + 1 (both are DCHECKed).
//
// Returns:
//   - |containment| unchanged if it is already kLatticeUnknown,
//   - Combine(containment, kLatticeIn / kLatticeOut) if |new_range| fits
//     entirely between two neighbouring boundaries,
//   - kLatticeUnknown if |new_range| straddles a boundary.
ContainedInLattice AddRange(ContainedInLattice containment,
                            const int* ranges,
                            int ranges_length,
                            Interval new_range) {
  DCHECK((ranges_length & 1) == 1);
  DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
  if (containment == kLatticeUnknown) return containment;

  bool inside = false;
  int lower = 0;
  for (int i = 0; i < ranges_length; i++) {
    const int upper = ranges[i];
    // Segments [lower, upper) that end at or before the start of the new
    // range are skipped; the first one that does not decides the outcome.
    if (upper > new_range.from()) {
      // new_range.to() is inclusive, the boundaries in |ranges| are not.
      const bool wholly_contained =
          lower <= new_range.from() && new_range.to() < upper;
      if (!wholly_contained) return kLatticeUnknown;
      return Combine(containment, inside ? kLatticeIn : kLatticeOut);
    }
    inside = !inside;
    lower = upper;
  }
  return containment;
}
// Upper bound on the lookahead considered for Boyer-Moore; in this part of the
// file it also caps how many leading pattern characters
// HasFewDifferentCharacters() inspects.
// More makes code generation slower, less makes V8 benchmark score lower.
const int kMaxLookaheadForBoyerMoore = 8;
// Patterns whose inspected length is at most this value are considered too
// short to benefit (HasFewDifferentCharacters() returns false for them).
// In a 3-character pattern you can maximally step forwards 3 characters
// at a time, which is not always enough to pay for the extra logic.
const int kPatternTooShortForBoyerMoore = 2;
// Identifies the sort of regexps where the regexp engine is faster
// than the code used for atom matches.
//
// Only the first kMaxLookaheadForBoyerMoore characters of |pattern| are
// examined, and characters are bucketed by their low 7 bits. Returns true when
// that prefix is longer than kPatternTooShortForBoyerMoore and contains at
// least three times as many characters as distinct buckets.
static bool HasFewDifferentCharacters(Handle<String> pattern) {
  const int prefix_length =
      Min(kMaxLookaheadForBoyerMoore, pattern->length());
  if (prefix_length <= kPatternTooShortForBoyerMoore) return false;

  const int kMod = 128;
  bool seen[kMod] = {false};
  int distinct = 0;
  for (int pos = 0; pos < prefix_length; pos++) {
    const int bucket = pattern->Get(pos) & (kMod - 1);
    if (seen[bucket]) continue;
    seen[bucket] = true;
    distinct++;
    // Bail out as soon as the alphabet is too large for the prefix to count
    // as "low-alphabet".
    if (distinct * 3 > prefix_length) return false;
  }
  return true;
}
// Generic RegExp methods. Dispatches to implementation specific methods.

// Prepares |re| for matching |pattern| under |flags| and returns |re|.
//
// The isolate's compilation cache is consulted first; on a hit the cached data
// is installed on |re| and nothing is parsed. Otherwise the pattern is
// flattened and parsed. If parsing fails, the error is thrown through
// ThrowRegExpException and that call's result is returned. On success |re| is
// set up either as an ATOM (see AtomCompile) or as an IRREGEXP (see
// IrregexpInitialize), and the resulting data array is stored in the
// compilation cache under (|pattern|, |flags|).
MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
                                        Handle<String> pattern,
                                        JSRegExp::Flags flags) {
  Isolate* isolate = re->GetIsolate();
  Zone zone(isolate->allocator());
  CompilationCache* compilation_cache = isolate->compilation_cache();
  MaybeHandle<FixedArray> maybe_cached =
      compilation_cache->LookupRegExp(pattern, flags);
  Handle<FixedArray> cached;
  bool in_cache = maybe_cached.ToHandle(&cached);
  LOG(isolate, RegExpCompileEvent(re, in_cache));

  if (in_cache) {
    re->set_data(*cached);
    return re;
  }

  pattern = String::Flatten(pattern);
  PostponeInterruptsScope postpone(isolate);
  RegExpCompileData parse_result;
  FlatStringReader reader(isolate, pattern);
  if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
                                 &parse_result)) {
    // Throw an exception if we fail to parse the pattern.
    return ThrowRegExpException(re, pattern, parse_result.error);
  }

  bool has_been_compiled = false;

  if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
      !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
    // Parse-tree is a single atom that is equal to the pattern.
    AtomCompile(re, pattern, flags, pattern);
    has_been_compiled = true;
  } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
             !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
    // The parse tree is a single atom whose text may differ from the source
    // pattern, so the atom's own characters become the search string.
    RegExpAtom* atom = parse_result.tree->AsAtom();
    Vector<const uc16> atom_pattern = atom->data();
    Handle<String> atom_string;
    ASSIGN_RETURN_ON_EXCEPTION(
        isolate, atom_string,
        isolate->factory()->NewStringFromTwoByte(atom_pattern),
        Object);
    if (!HasFewDifferentCharacters(atom_string)) {
      AtomCompile(re, pattern, flags, atom_string);
      has_been_compiled = true;
    }
  }
  if (!has_been_compiled) {
    IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
  }
  DCHECK(re->data()->IsFixedArray());
  // Compilation succeeded so the data is set on the regexp
  // and we can store it in the cache.
  Handle<FixedArray> data(FixedArray::cast(re->data()));
  compilation_cache->PutRegExp(pattern, flags, data);

  return re;
}
// Runs |regexp| against |subject| starting at |index|, dispatching on the
// regexp's type tag to the atom or irregexp implementation. Any other tag is
// treated as unreachable.
MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
                                     Handle<String> subject, int index,
                                     Handle<JSObject> last_match_info) {
  const auto tag = regexp->TypeTag();
  if (tag == JSRegExp::ATOM) {
    return AtomExec(regexp, subject, index, last_match_info);
  }
  if (tag == JSRegExp::IRREGEXP) {
    return IrregexpExec(regexp, subject, index, last_match_info);
  }
  UNREACHABLE();
  return MaybeHandle<Object>();
}
// RegExp Atom implementation: Simple string search using indexOf.

// Marks |re| as an ATOM regexp: the factory stores the source |pattern|, the
// |flags| and the literal |match_pattern| to search for on |re|.
void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
                             Handle<String> pattern,
                             JSRegExp::Flags flags,
                             Handle<String> match_pattern) {
  Isolate* isolate = re->GetIsolate();
  isolate->factory()->SetRegExpAtomData(re, JSRegExp::ATOM, pattern, flags,
                                        match_pattern);
}
// Records an atom match in the last-match-info backing store |array|:
// two capture registers holding [from, to), with |subject| stored as both the
// last subject and the last input. No handles may be created while this runs.
static void SetAtomLastCapture(FixedArray* array,
                               String* subject,
                               int from,
                               int to) {
  SealHandleScope shs(array->GetIsolate());
  RegExpImpl::SetLastCaptureCount(array, 2);
  RegExpImpl::SetCapture(array, 0, from);
  RegExpImpl::SetCapture(array, 1, to);
  RegExpImpl::SetLastSubject(array, subject);
  RegExpImpl::SetLastInput(array, subject);
}
// Searches |subject| for the atom pattern stored on |regexp|, starting at
// |index|. Successive matches are written to |output| as (start, end) pairs,
// each search resuming at the end of the previous match, until |output| is
// full or no further match is found.
//
// Returns the number of matches recorded, or RE_FAILURE when the pattern
// cannot fit into the part of the subject that starts at |index|.
int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
                            Handle<String> subject,
                            int index,
                            int32_t* output,
                            int output_size) {
  Isolate* isolate = regexp->GetIsolate();

  DCHECK(0 <= index);
  DCHECK(index <= subject->length());

  subject = String::Flatten(subject);
  DisallowHeapAllocation no_gc;  // ensure vectors stay valid

  String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
  const int needle_len = needle->length();
  DCHECK(needle->IsFlat());
  DCHECK_LT(0, needle_len);

  if (index + needle_len > subject->length()) return RegExpImpl::RE_FAILURE;

  for (int i = 0; i < output_size; i += 2) {
    String::FlatContent needle_content = needle->GetFlatContent();
    String::FlatContent subject_content = subject->GetFlatContent();
    DCHECK(needle_content.IsFlat());
    DCHECK(subject_content.IsFlat());

    // Dispatch on the character width of needle and subject.
    int found;
    if (needle_content.IsOneByte()) {
      if (subject_content.IsOneByte()) {
        found = SearchString(isolate, subject_content.ToOneByteVector(),
                             needle_content.ToOneByteVector(), index);
      } else {
        found = SearchString(isolate, subject_content.ToUC16Vector(),
                             needle_content.ToOneByteVector(), index);
      }
    } else {
      if (subject_content.IsOneByte()) {
        found = SearchString(isolate, subject_content.ToOneByteVector(),
                             needle_content.ToUC16Vector(), index);
      } else {
        found = SearchString(isolate, subject_content.ToUC16Vector(),
                             needle_content.ToUC16Vector(), index);
      }
    }

    if (found == -1) return i / 2;  // Return number of matches.

    output[i] = found;
    output[i + 1] = found + needle_len;
    // Continue searching right after this match.
    index = found + needle_len;
  }
  return output_size / 2;
}
// Executes an ATOM regexp once against |subject| starting at |index|, using
// the isolate's static offsets vector as the two-register output buffer.
// Returns the null value when nothing matches; otherwise records the match in
// |last_match_info| and returns it.
Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, Handle<String> subject,
                                    int index,
                                    Handle<JSObject> last_match_info) {
  Isolate* isolate = re->GetIsolate();

  static const int kNumRegisters = 2;
  STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
  int32_t* offsets = isolate->jsregexp_static_offsets_vector();

  const int res = AtomExecRaw(re, subject, index, offsets, kNumRegisters);
  if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
  DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);

  SealHandleScope shs(isolate);
  const int match_start = offsets[0];
  const int match_end = offsets[1];
  SetAtomLastCapture(FixedArray::cast(last_match_info->elements()), *subject,
                     match_start, match_end);
  return last_match_info;
}
// Irregexp implementation.

// Ensures that the regexp object contains a compiled version of the
// source for either one-byte or two-byte subject strings.
// If the compiled version doesn't already exist, it is compiled
// from the source pattern.
// If compilation fails, an exception is thrown and this function
// returns false.
bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
                                        Handle<String> sample_subject,
                                        bool is_one_byte) {
  Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
#ifdef V8_INTERPRETED_REGEXP
  const bool has_code = compiled_code->IsByteArray();
#else  // V8_INTERPRETED_REGEXP (RegExp native code)
  const bool has_code = compiled_code->IsCode();
#endif
  if (has_code) return true;

  // We could potentially have marked this as flushable, but have kept
  // a saved version if we did not flush it yet.
  Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
  if (!saved_code->IsCode()) {
    // Nothing to reinstate: compile from the source pattern.
    return CompileIrregexp(re, sample_subject, is_one_byte);
  }

  // Reinstate the code in the original place.
  re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
  DCHECK(compiled_code->IsSmi());
  return true;
}
// Compiles the irregexp code of |re| for one-byte or two-byte subjects
// (selected by |is_one_byte|) and stores it in the regexp's data array.
// |sample_subject| is forwarded to RegExpEngine::Compile.
//
// Returns true on success. Returns false after throwing when a previous
// compilation error is recorded on |re|, when the pattern fails to parse, or
// when the engine reports an error message.
bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
                                 Handle<String> sample_subject,
                                 bool is_one_byte) {
  Isolate* isolate = re->GetIsolate();
  Zone zone(isolate->allocator());
  PostponeInterruptsScope postpone(isolate);

  // When arriving here the code entry can only be a smi, either representing
  // an uncompiled regexp, a previous compilation error, or code that has
  // been flushed.
  const int code_slot = JSRegExp::code_index(is_one_byte);
  Object* entry = re->DataAt(code_slot);
  DCHECK(entry->IsSmi());
  int entry_value = Smi::cast(entry)->value();
  DCHECK(entry_value == JSRegExp::kUninitializedValue ||
         entry_value == JSRegExp::kCompilationErrorValue ||
         (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));

  if (entry_value == JSRegExp::kCompilationErrorValue) {
    // A previous compilation failed and threw an error. Its message (not the
    // error object itself) is kept at the saved code index, so recreate the
    // error from that message and throw it again.
    Object* stored_error =
        re->DataAt(JSRegExp::saved_code_index(is_one_byte));
    DCHECK(stored_error->IsString());
    Handle<String> stored_message(String::cast(stored_error));
    ThrowRegExpException(re, stored_message);
    return false;
  }

  JSRegExp::Flags flags = re->GetFlags();
  Handle<String> pattern = String::Flatten(Handle<String>(re->Pattern()));

  RegExpCompileData compile_data;
  FlatStringReader reader(isolate, pattern);
  if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
                                 &compile_data)) {
    // Throw an exception if we fail to parse the pattern.
    // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
    USE(ThrowRegExpException(re, pattern, compile_data.error));
    return false;
  }

  RegExpEngine::CompilationResult result = RegExpEngine::Compile(
      isolate, &zone, &compile_data, flags, pattern, sample_subject,
      is_one_byte);
  if (result.error_message != NULL) {
    // Unable to compile regexp.
    Handle<String> compile_error =
        isolate->factory()
            ->NewStringFromUtf8(CStrVector(result.error_message))
            .ToHandleChecked();
    ThrowRegExpException(re, compile_error);
    return false;
  }

  Handle<FixedArray> data(FixedArray::cast(re->data()));
  data->set(code_slot, result.code);
  SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
  // The stored register count only ever grows.
  if (IrregexpMaxRegisterCount(*data) < result.num_registers) {
    SetIrregexpMaxRegisterCount(*data, result.num_registers);
  }

  return true;
}
// Reads the maximum register count stored (as a Smi) in the irregexp data
// array |re|.
int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
  Object* slot = re->get(JSRegExp::kIrregexpMaxRegisterCountIndex);
  return Smi::cast(slot)->value();
}
// Stores |value| (as a Smi) as the maximum register count in the irregexp
// data array |re|.
void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
  Smi* boxed = Smi::FromInt(value);
  re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, boxed);
}
// Stores the capture name map in the irregexp data array |re|. A null handle
// is recorded as Smi zero.
void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
                                           Handle<FixedArray> value) {
  if (!value.is_null()) {
    re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
    return;
  }
  re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::FromInt(0));
}
// Reads the capture count stored (as a Smi) in the irregexp data array |re|.
int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
  Object* slot = re->get(JSRegExp::kIrregexpCaptureCountIndex);
  return Smi::cast(slot)->value();
}
// Reads the register count of the irregexp data array |re|; this is the value
// kept in the kIrregexpMaxRegisterCountIndex slot.
int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
  Object* slot = re->get(JSRegExp::kIrregexpMaxRegisterCountIndex);
  return Smi::cast(slot)->value();
}
// Returns the code entry of |re| for the requested subject width, cast to a
// ByteArray (the form used by the interpreted regexp build).
ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
  const int slot = JSRegExp::code_index(is_one_byte);
  return ByteArray::cast(re->get(slot));
}
// Returns the code entry of |re| for the requested subject width, cast to a
// Code object (the form used by the native regexp build).
Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
  const int slot = JSRegExp::code_index(is_one_byte);
  return Code::cast(re->get(slot));
}
// Marks |re| as an IRREGEXP regexp with the given |pattern|, |flags| and
// |capture_count|. No code is generated here; the factory call only sets up
// the regexp data, and the code entries are filled in later by
// CompileIrregexp().
void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
                                    Handle<String> pattern,
                                    JSRegExp::Flags flags,
                                    int capture_count) {
  // Initialize compiled code entries to null.
  Isolate* isolate = re->GetIsolate();
  isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern,
                                            flags, capture_count);
}
// Makes sure |regexp| has compiled code matching the representation of
// |subject| and reports how many int32 output slots a match needs.
// Returns -1 if compilation failed.
int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
                                Handle<String> subject) {
  subject = String::Flatten(subject);

  // Check representation of the underlying storage.
  const bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
  if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;

  FixedArray* data = FixedArray::cast(regexp->data());
  // Two slots (start, end) per capture, plus two for the whole match.
  const int capture_registers = (IrregexpNumberOfCaptures(data) + 1) * 2;

#ifdef V8_INTERPRETED_REGEXP
  // Byte-code regexp needs space allocated for all its registers.
  // The result captures are copied to the start of the registers array
  // if the match succeeds.  This way those registers are not clobbered
  // when we set the last match info from last successful match.
  return IrregexpNumberOfRegisters(data) + capture_registers;
#else   // V8_INTERPRETED_REGEXP
  // Native regexp only needs room to output captures. Registers are handled
  // internally.
  return capture_registers;
#endif  // V8_INTERPRETED_REGEXP
}
// Runs the compiled irregexp code of |regexp| on the flat string |subject|,
// starting at |index|, and writes capture offsets into |output| (which holds
// |output_size| entries). Returns RE_SUCCESS, RE_FAILURE or RE_EXCEPTION.
//
// Native build: the match is repeated for as long as the generated code
// reports RETRY, re-preparing the regexp for the subject each time.
// Interpreted build: the interpreter works in the part of |output| that
// follows the capture slots, and the captures are copied to the front only
// when the match succeeds.
int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
Handle<String> subject,
int index,
int32_t* output,
int output_size) {
Isolate* isolate = regexp->GetIsolate();
Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
DCHECK(index >= 0);
DCHECK(index <= subject->length());
DCHECK(subject->IsFlat());
bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
#ifndef V8_INTERPRETED_REGEXP
DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
do {
// NOTE(review): the boolean result of EnsureCompiledIrregexp is not checked
// here - presumably compilation already succeeded in IrregexpPrepare; confirm.
EnsureCompiledIrregexp(regexp, subject, is_one_byte);
Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
// The stack is used to allocate registers for the compiled regexp code.
// This means that in case of failure, the output registers array is left
// untouched and contains the capture results from the previous successful
// match. We can use that to set the last match info lazily.
NativeRegExpMacroAssembler::Result res =
NativeRegExpMacroAssembler::Match(code,
subject,
output,
output_size,
index,
isolate);
if (res != NativeRegExpMacroAssembler::RETRY) {
DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
isolate->has_pending_exception());
// The native result codes share their numeric values with IrregexpResult,
// which is what makes the cast below valid.
STATIC_ASSERT(
static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
STATIC_ASSERT(
static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
== RE_EXCEPTION);
return static_cast<IrregexpResult>(res);
}
// If result is RETRY, the string has changed representation, and we
// must restart from scratch.
// In this case, it means we must make sure we are prepared to handle
// the, potentially, different subject (the string can switch between
// being internal and external, and even between being Latin1 and UC16,
// but the characters are always the same).
IrregexpPrepare(regexp, subject);
is_one_byte = subject->IsOneByteRepresentationUnderneath();
} while (true);
UNREACHABLE();
return RE_EXCEPTION;
#else // V8_INTERPRETED_REGEXP
DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
// We must have done EnsureCompiledIrregexp, so we can get the number of
// registers.
int number_of_capture_registers =
(IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
// The interpreter's working registers start right after the capture slots.
int32_t* raw_output = &output[number_of_capture_registers];
// We do not touch the actual capture result registers until we know there
// has been a match so that we can use those capture results to set the
// last match info.
for (int i = number_of_capture_registers - 1; i >= 0; i--) {
raw_output[i] = -1;
}
Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
isolate);
IrregexpResult result = IrregexpInterpreter::Match(isolate,
byte_codes,
subject,
raw_output,
index);
if (result == RE_SUCCESS) {
// Copy capture results to the start of the registers array.
MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
}
if (result == RE_EXCEPTION) {
// The interpreter does not schedule an exception itself, so a stack
// overflow is raised on the isolate here.
DCHECK(!isolate->has_pending_exception());
isolate->StackOverflow();
}
return result;
#endif // V8_INTERPRETED_REGEXP
}
// Executes an IRREGEXP regexp against |subject| starting at |previous_index|.
//
// Returns |last_match_info| (updated with the captures) on a match, the null
// value when nothing matches, and an empty MaybeHandle when compilation or
// execution ended with a pending exception.
MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
                                             Handle<String> subject,
                                             int previous_index,
                                             Handle<JSObject> last_match_info) {
  Isolate* isolate = regexp->GetIsolate();
  DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);

#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
  if (FLAG_trace_regexp_bytecodes) {
    String* pattern = regexp->Pattern();
    PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
    PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
  }
#endif

  // Prepare space for the return values.
  const int required_registers =
      RegExpImpl::IrregexpPrepare(regexp, subject);
  if (required_registers < 0) {
    // Compiling failed with an exception.
    DCHECK(isolate->has_pending_exception());
    return MaybeHandle<Object>();
  }

  // Use the isolate's static offsets vector when it is large enough;
  // otherwise allocate a temporary buffer that is released on return.
  int32_t* output_registers = NULL;
  if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
    output_registers = NewArray<int32_t>(required_registers);
  }
  std::unique_ptr<int32_t[]> heap_registers_owner(output_registers);
  if (output_registers == NULL) {
    output_registers = isolate->jsregexp_static_offsets_vector();
  }

  const int res = RegExpImpl::IrregexpExecRaw(
      regexp, subject, previous_index, output_registers, required_registers);
  switch (res) {
    case RE_SUCCESS: {
      int capture_count =
          IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
      return SetLastMatchInfo(last_match_info, subject, capture_count,
                              output_registers);
    }
    case RE_EXCEPTION:
      DCHECK(isolate->has_pending_exception());
      return MaybeHandle<Object>();
    default:
      DCHECK(res == RE_FAILURE);
      return isolate->factory()->null_value();
  }
}
// Grows the elements backing store of |array| (through its elements accessor)
// if it currently holds fewer than |minimum_size| entries.
static void EnsureSize(Handle<JSObject> array, uint32_t minimum_size) {
  const uint32_t current_length =
      static_cast<uint32_t>(array->elements()->length());
  if (current_length >= minimum_size) return;
  array->GetElementsAccessor()->GrowCapacityAndConvert(array, minimum_size);
}
// Writes the result of a successful match into |last_match_info|.
//
// The elements store is grown to hold (capture_count + 1) * 2 capture
// registers plus kLastMatchOverhead entries. If |match| is non-NULL its
// register values are copied in; the capture count, last subject and last
// input are always updated. Returns |last_match_info|.
Handle<JSObject> RegExpImpl::SetLastMatchInfo(Handle<JSObject> last_match_info,
                                              Handle<String> subject,
                                              int capture_count,
                                              int32_t* match) {
  DCHECK(last_match_info->HasFastObjectElements());
  const int capture_register_count = (capture_count + 1) * 2;
  EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);

  // Raw pointers are used from here on, so nothing may allocate.
  DisallowHeapAllocation no_allocation;
  FixedArray* array = FixedArray::cast(last_match_info->elements());
  if (match != NULL) {
    for (int reg = 0; reg < capture_register_count; reg++) {
      SetCapture(array, reg, match[reg]);
    }
  }
  SetLastCaptureCount(array, capture_register_count);
  SetLastSubject(array, *subject);
  SetLastInput(array, *subject);
  return last_match_info;
}
// Sets up the register buffer used to collect the matches of a global regexp
// against |subject|.
//
// For ATOM regexps each match needs two registers; for IRREGEXP regexps the
// count comes from IrregexpPrepare(). If that preparation fails, num_matches_
// is set to -1 to signal the exception and the constructor returns early.
//
// The buffer is the isolate's static offsets vector when it is large enough,
// otherwise a freshly allocated array. In the interpreted build only a single
// match is stored at a time.
RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
                                     Handle<String> subject,
                                     Isolate* isolate)
    : register_array_(NULL),
      register_array_size_(0),
      regexp_(regexp),
      subject_(subject) {
#ifdef V8_INTERPRETED_REGEXP
  bool interpreted = true;
#else
  bool interpreted = false;
#endif  // V8_INTERPRETED_REGEXP

  if (regexp_->TypeTag() == JSRegExp::ATOM) {
    static const int kAtomRegistersPerMatch = 2;
    registers_per_match_ = kAtomRegistersPerMatch;
    // There is no distinction between interpreted and native for atom regexps.
    interpreted = false;
  } else {
    registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
    if (registers_per_match_ < 0) {
      num_matches_ = -1;  // Signal exception.
      return;
    }
  }

  DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
  if (!interpreted) {
    register_array_size_ =
        Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
    max_matches_ = register_array_size_ / registers_per_match_;
  } else {
    // Global loop in interpreted regexp is not implemented.  We choose
    // the size of the offsets vector so that it can only store one match.
    register_array_size_ = registers_per_match_;
    max_matches_ = 1;
  }

  if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
    register_array_ = NewArray<int32_t>(register_array_size_);
  } else {
    register_array_ = isolate->jsregexp_static_offsets_vector();
  }

  // Set state so that fetching the results the first time triggers a call
  // to the compiled regexp.
  current_match_index_ = max_matches_ - 1;
  num_matches_ = max_matches_;
  DCHECK(registers_per_match_ >= 2);  // Each match has at least one capture.
  DCHECK_GE(register_array_size_, registers_per_match_);
  // Seed the last slot with a placeholder "previous match" of [-1, 0).
  int32_t* last_match =
      &register_array_[current_match_index_ * registers_per_match_];
  last_match[0] = -1;
  last_match[1] = 0;
}
// Returns the index at which to resume after a zero-length match at
// |last_index|: two positions ahead when the regexp has the unicode flag and a
// lead/trail surrogate pair starts at |last_index|, otherwise one position
// ahead.
int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
  const bool unicode = (regexp_->GetFlags() & JSRegExp::kUnicode) != 0;
  if (unicode && last_index + 1 < subject_->length()) {
    if (unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
        unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
      // Advance over the surrogate pair.
      return last_index + 2;
    }
  }
  return last_index + 1;
}
// -------------------------------------------------------------------
// Implementation of the Irregexp regular expression engine.
//
// The Irregexp regular expression engine is intended to be a complete
// implementation of ECMAScript regular expressions. It generates either
// bytecodes or native code.
// The Irregexp regexp engine is structured in three steps.
// 1) The parser generates an abstract syntax tree. See ast.cc.
// 2) From the AST a node network is created. The nodes are all
// subclasses of RegExpNode. The nodes represent states when
// executing a regular expression. Several optimizations are
// performed on the node network.
// 3) From the nodes we generate either byte codes or native code
// that can actually execute the regular expression (perform
// the search). The code generation step is described in more
// detail below.
// Code generation.
//
// The nodes are divided into four main categories.
// * Choice nodes
// These represent places where the regular expression can
// match in more than one way. For example on entry to an
// alternation (foo|bar) or a repetition (*, +, ? or {}).
// * Action nodes
// These represent places where some action should be
// performed. Examples include recording the current position
// in the input string to a register (in order to implement
// captures) or other actions on register for example in order
// to implement the counters needed for {} repetitions.
// * Matching nodes
// These attempt to match some element part of the input string.
// Examples of elements include character classes, plain strings
// or back references.
// * End nodes
// These are used to implement the actions required on finding
// a successful match or failing to find a match.
//
// The code generated (whether as byte codes or native code) maintains
// some state as it runs. This consists of the following elements:
//
// * The capture registers. Used for string captures.
// * Other registers. Used for counters etc.
// * The current position.
// * The stack of backtracking information. Used when a matching node
// fails to find a match and needs to try an alternative.
//
// Conceptual regular expression execution model:
//
// There is a simple conceptual model of regular expression execution
// which will be presented first. The actual code generated is a more
// efficient simulation of the simple conceptual model:
//
// * Choice nodes are implemented as follows:
// For each choice except the last {
// push current position
// push backtrack code location
// <generate code to test for choice>
// backtrack code location:
// pop current position
// }
// <generate code to test for last choice>
//
// * Action nodes are generated as follows:
// <push affected registers on backtrack stack>
// <generate code to perform action>
// push backtrack code location
// <generate code to test for following nodes>
// backtrack code location:
// <pop affected registers to restore their state>
// <pop backtrack location from stack and go to it>
//
// * Matching nodes are generated as follows:
// if input string matches at current position
// update current position
// <generate code to test for following nodes>
// else
// <pop backtrack location from stack and go to it>
//
// Thus it can be seen that the current position is saved and restored
// by the choice nodes, whereas the registers are saved and restored by
// the action nodes that manipulate them.
//
// The other interesting aspect of this model is that nodes are generated
// at the point where they are needed by a recursive call to Emit(). If
// the node has already been code generated then the Emit() call will
// generate a jump to the previously generated code instead. In order to
// limit recursion it is possible for the Emit() function to put the node
// on a work list for later generation and instead generate a jump. The
// destination of the jump is resolved later when the code is generated.
//
// Actual regular expression code generation.
//
// Code generation is actually more complicated than the above. In order
// to improve the efficiency of the generated code some optimizations are
// performed
//
// * Choice nodes have 1-character lookahead.
// A choice node looks at the following character and eliminates some of
// the choices immediately based on that character. This is not yet
// implemented.
// * Simple greedy loops store reduced backtracking information.
// A quantifier like /.*foo/m will greedily match the whole input. It will
// then need to backtrack to a point where it can match "foo". The naive
// implementation of this would push each character position onto the
// backtracking stack, then pop them off one by one. This would use space
// proportional to the length of the input string. However since the "."
// can only match in one way and always has a constant length (in this case
// of 1) it suffices to store the current position on the top of the stack
// once. Matching now becomes merely incrementing the current position and
// backtracking becomes decrementing the current position and checking the
// result against the stored current position. This is faster and saves
// space.
// * The current state is virtualized.
// This is used to defer expensive operations until it is clear that they
// are needed and to generate code for a node more than once, allowing
// specialized and efficient versions of the code to be created. This is
// explained in the section below.
//
// Execution state virtualization.
//
// Instead of emitting code, nodes that manipulate the state can record their
// manipulation in an object called the Trace. The Trace object can record a
// current position offset, an optional backtrack code location on the top of
// the virtualized backtrack stack and some register changes. When a node is
// to be emitted it can flush the Trace or update it. Flushing the Trace
// will emit code to bring the actual state into line with the virtual state.
// Avoiding flushing the state can postpone some work (e.g. updates of capture
// registers). Postponing work can save time when executing the regular
// expression since it may be found that the work never has to be done as a
// failure to match can occur. In addition it is much faster to jump to a
// known backtrack code location than it is to pop an unknown backtrack
// location from the stack and jump there.
//
// The virtual state found in the Trace affects code generation. For example
// the virtual state contains the difference between the actual current
// position and the virtual current position, and matching code needs to use
// this offset to attempt a match in the correct location of the input
// string. Therefore code generated for a non-trivial trace is specialized
// to that trace. The code generator therefore has the ability to generate
// code for each node several times. In order to limit the size of the
// generated code there is an arbitrary limit on how many specialized sets of
// code may be generated for a given node. If the limit is reached, the
// trace is flushed and a generic version of the code for a node is emitted.
// This is subsequently used for that node. The code emitted for a non-generic
// trace is not recorded in the node and so it cannot currently be reused in
// the event that code generation is requested for an identical trace.
// Fallback AppendToText for RegExpTree. The body is only UNREACHABLE(), so
// control arriving here is treated as a bug; neither |text| nor |zone| is
// touched. RegExpAtom, RegExpCharacterClass and RegExpText define their own
// versions that actually add elements (presumably as overrides of this one).
void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
UNREACHABLE();
}
// Wraps this atom in an ATOM text element and adds it to |text|, passing
// |zone| through to AddElement.
void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
  TextElement atom_element = TextElement::Atom(this);
  text->AddElement(atom_element, zone);
}
// Wraps this character class in a CHAR_CLASS text element and adds it to
// |text|, passing |zone| through to AddElement.
void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
  TextElement class_element = TextElement::CharClass(this);
  text->AddElement(class_element, zone);
}
// Adds every element of this text, in order, to |text|.
void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
  int index = 0;
  // The length is re-read on each iteration rather than cached, exactly as a
  // plain indexed for-loop over elements() would do.
  while (index < elements()->length()) {
    text->AddElement(elements()->at(index), zone);
    ++index;
  }
}
// Factory: builds a text element tagged ATOM that refers to |atom|.
TextElement TextElement::Atom(RegExpAtom* atom) {
  TextElement element(ATOM, atom);
  return element;
}
// Factory: builds a text element tagged CHAR_CLASS that refers to
// |char_class|.
TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
  TextElement element(CHAR_CLASS, char_class);
  return element;
}
// Returns the length of this element: 1 for a character class, and the
// wrapped atom's own length for an atom.
int TextElement::length() const {
  switch (text_type()) {
    case CHAR_CLASS:
      // A character class always contributes a fixed length of 1.
      return 1;
    case ATOM:
      return atom()->length();
  }
  // Every known text type returns above; any other value is a bug.
  UNREACHABLE();
  return 0;
}
// Returns this node's dispatch table, building it on first use.
//
// The table is allocated in zone() and stored in table_ before
// DispatchTableConstructor::BuildTable(this) fills it in.
//
// NOTE(review): once table_ is set, later calls return it without looking at
// |ignore_case|, so only the value passed on the first call affects the
// table's contents.
DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
  if (table_ != NULL) return table_;

  table_ = new(zone()) DispatchTable(zone());
  DispatchTableConstructor builder(table_, ignore_case, zone());
  builder.BuildTable(this);
  return table_;
}
// Tallies how often characters occur, bucketed by their low bits
// (character & RegExpMacroAssembler::kTableMask), and reports each bucket's
// share of all samples on a per-128 scale.
class FrequencyCollator {
 public:
  // Starts with zero samples and labels each bucket with its own index.
  FrequencyCollator() : total_samples_(0) {
    for (int slot = 0; slot < RegExpMacroAssembler::kTableSize; ++slot) {
      frequencies_[slot] = CharacterFrequency(slot);
    }
  }

  // Records one occurrence of |character| in the bucket chosen by masking it
  // with kTableMask.
  void CountCharacter(int character) {
    frequencies_[character & RegExpMacroAssembler::kTableMask].Increment();
    ++total_samples_;
  }

  // Does not measure in percent, but rather per-128 (the table size from the
  // regexp macro assembler).
  // |in_character| must already be masked to a valid bucket index (checked by
  // the DCHECK). Returns 1 if no samples have been counted yet.
  int Frequency(int in_character) {
    DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
    // No samples yet: return 1 rather than dividing by zero.
    if (total_samples_ < 1) return 1;
    int hits = frequencies_[in_character].counter();
    return (hits * 128) / total_samples_;
  }

 private:
  // One bucket: an occurrence count plus the bucket index it was created for.
  class CharacterFrequency {
   public:
    // Default bucket: zero count, character_ set to -1.
    CharacterFrequency() : counter_(0), character_(-1) { }
    explicit CharacterFrequency(int character)
        : counter_(0), character_(character) { }

    void Increment() { ++counter_; }
    int counter() { return counter_; }
    int character() { return character_; }

   private:
    int counter_;
    int character_;
  };

  CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
  int total_samples_;
};
class RegExpCompiler {
public:
RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
JSRegExp::Flags flags, bool is_one_byte);
int AllocateRegister() {
if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
reg_exp_too_big_ = true;
return next_register_;
}
return next_register_++;
}
// Lookarounds to match lone surrogates for unicode character class matches
// are never nested. We can therefore reuse registers.
int UnicodeLookaroundStackRegister() {
if (unicode_lookaround_stack_register_ == kNoRegister) {
unicode_lookaround_stack_register_ = AllocateRegister();
}
return unicode_lookaround_stack_register_;
}
int UnicodeLookaroundPositionRegister() {
if (unicode_lookaround_position_register_ == kNoRegister) {
unicode_lookaround_position_register_ = AllocateRegister();
}
return unicode_lookaround_position_register_;
}
RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
RegExpNode* start,
int capture_count,
Handle<String> pattern);
inline void AddWork(RegExpNode* node) {
if (!node->on_work_list() && !node->label()->is_bound()) {
node->set_on_work_list(true);
work_list_->Add(node);
}