-
Notifications
You must be signed in to change notification settings - Fork 1k
/
qregularexpression.cpp
3092 lines (2529 loc) · 116 KB
/
qregularexpression.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright (C) 2020 Giuseppe D'Angelo <dangelog@gmail.com>.
// Copyright (C) 2020 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>
// Copyright (C) 2021 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#include "qregularexpression.h"
#include <QtCore/qcoreapplication.h>
#include <QtCore/qhashfunctions.h>
#include <QtCore/qlist.h>
#include <QtCore/qmutex.h>
#include <QtCore/qstringlist.h>
#include <QtCore/qdebug.h>
#include <QtCore/qglobal.h>
#include <QtCore/qatomic.h>
#include <QtCore/qdatastream.h>
#if defined(Q_OS_MACOS)
#include <QtCore/private/qcore_mac_p.h>
#endif
#define PCRE2_CODE_UNIT_WIDTH 16
#include <pcre2.h>
QT_BEGIN_NAMESPACE
using namespace Qt::StringLiterals;
/*!
\class QRegularExpression
\inmodule QtCore
\reentrant
\brief The QRegularExpression class provides pattern matching using regular
expressions.
\since 5.0
\ingroup tools
\ingroup shared
\ingroup string-processing
\keyword regular expression
\compares equality
Regular expressions, or \e{regexps}, are a very powerful tool to handle
strings and texts. This is useful in many contexts, e.g.,
\table
\row \li Validation
\li A regexp can test whether a substring meets some criteria,
e.g. is an integer or contains no whitespace.
\row \li Searching
\li A regexp provides more powerful pattern matching than
simple substring matching, e.g., match one of the words
\e{mail}, \e{letter} or \e{correspondence}, but none of the
words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
\row \li Search and Replace
\li A regexp can replace all occurrences of a substring with a
different substring, e.g., replace all occurrences of \e{&}
with \e{\&amp;} except where the \e{&} is already followed by
an \e{amp;}.
\row \li String Splitting
\li A regexp can be used to identify where a string should be
split apart, e.g. splitting tab-delimited strings.
\endtable
This document is by no means a complete reference to pattern matching using
regular expressions, and the following parts will require the reader to
have some basic knowledge about Perl-like regular expressions and their
pattern syntax.
Good references about regular expressions include:
\list
\li \e {Mastering Regular Expressions} (Third Edition) by Jeffrey E. F.
Friedl, ISBN 0-596-52812-4;
\li the \l{https://pcre.org/original/doc/html/pcrepattern.html}
{pcrepattern(3)} man page, describing the pattern syntax supported by PCRE
(the reference implementation of Perl-compatible regular expressions);
\li the \l{http://perldoc.perl.org/perlre.html} {Perl's regular expression
documentation} and the \l{http://perldoc.perl.org/perlretut.html} {Perl's
regular expression tutorial}.
\endlist
\tableofcontents
\section1 Introduction
QRegularExpression implements Perl-compatible regular expressions. It fully
supports Unicode. For an overview of the regular expression syntax
supported by QRegularExpression, please refer to the aforementioned
pcrepattern(3) man page. A regular expression is made up of two things: a
\b{pattern string} and a set of \b{pattern options} that change the
meaning of the pattern string.
You can set the pattern string by passing a string to the QRegularExpression
constructor:
\snippet code/src_corelib_text_qregularexpression.cpp 0
This sets the pattern string to \c{a pattern}. You can also use the
setPattern() function to set a pattern on an existing QRegularExpression
object:
\snippet code/src_corelib_text_qregularexpression.cpp 1
Note that due to C++ literal strings rules, you must escape all backslashes
inside the pattern string with another backslash:
\snippet code/src_corelib_text_qregularexpression.cpp 2
Alternatively, you can use a
\l {https://en.cppreference.com/w/cpp/language/string_literal} {raw string literal},
in which case you don't need to escape backslashes in the pattern, all characters
between \c {R"(...)"} are considered raw characters. As you can see in the following
example, this simplifies writing patterns:
\snippet code/src_corelib_text_qregularexpression.cpp 35
The pattern() function returns the pattern that is currently set for a
QRegularExpression object:
\snippet code/src_corelib_text_qregularexpression.cpp 3
\section1 Pattern Options
The meaning of the pattern string can be modified by setting one or more
\e{pattern options}. For instance, it is possible to set a pattern to match
case insensitively by setting the QRegularExpression::CaseInsensitiveOption.
You can set the options by passing them to the QRegularExpression
constructor, as in:
\snippet code/src_corelib_text_qregularexpression.cpp 4
Alternatively, you can use the setPatternOptions() function on an existing
QRegularExpression object:
\snippet code/src_corelib_text_qregularexpression.cpp 5
It is possible to get the pattern options currently set on a
QRegularExpression object by using the patternOptions() function:
\snippet code/src_corelib_text_qregularexpression.cpp 6
Please refer to the QRegularExpression::PatternOption enum documentation for
more information about each pattern option.
\section1 Match Type and Match Options
The last two arguments of the match() and the globalMatch() functions set
the match type and the match options. The match type is a value of the
QRegularExpression::MatchType enum; the "traditional" matching algorithm is
chosen by using the NormalMatch match type (the default). It is also
possible to enable partial matching of the regular expression against a
subject string: see the \l{partial matching} section for more details.
The match options are a set of one or more QRegularExpression::MatchOption
values. They change the way a specific match of a regular expression
against a subject string is done. Please refer to the
QRegularExpression::MatchOption enum documentation for more details.
\target normal matching
\section1 Normal Matching
In order to perform a match you can simply invoke the match() function
passing a string to match against. We refer to this string as the
\e{subject string}. The result of the match() function is a
QRegularExpressionMatch object that can be used to inspect the results of
the match. For instance:
\snippet code/src_corelib_text_qregularexpression.cpp 7
If a match is successful, the (implicit) capturing group number 0 can be
used to retrieve the substring matched by the entire pattern (see also the
section about \l{extracting captured substrings}):
\snippet code/src_corelib_text_qregularexpression.cpp 8
It's also possible to start a match at an arbitrary offset inside the
subject string by passing the offset as an argument of the
match() function. In the following example \c{"12 abc"}
is not matched because the match is started at offset 1:
\snippet code/src_corelib_text_qregularexpression.cpp 9
\target extracting captured substrings
\section2 Extracting captured substrings
The QRegularExpressionMatch object also contains information about the
substrings captured by the capturing groups in the pattern string. The
\l{QRegularExpressionMatch::}{captured()} function will return the string
captured by the n-th capturing group:
\snippet code/src_corelib_text_qregularexpression.cpp 10
Capturing groups in the pattern are numbered starting from 1, and the
implicit capturing group 0 is used to capture the substring that matched
the entire pattern.
It's also possible to retrieve the starting and the ending offsets (inside
the subject string) of each captured substring, by using the
\l{QRegularExpressionMatch::}{capturedStart()} and the
\l{QRegularExpressionMatch::}{capturedEnd()} functions:
\snippet code/src_corelib_text_qregularexpression.cpp 11
All of these functions have an overload taking a QString as a parameter
in order to extract \e{named} captured substrings. For instance:
\snippet code/src_corelib_text_qregularexpression.cpp 12
\target global matching
\section1 Global Matching
\e{Global matching} is useful to find all the occurrences of a given
regular expression inside a subject string. Suppose that we want to extract
all the words from a given string, where a word is a substring matching
the pattern \c{\w+}.
QRegularExpression::globalMatch returns a QRegularExpressionMatchIterator,
which is a Java-like forward iterator that can be used to iterate over the
results. For instance:
\snippet code/src_corelib_text_qregularexpression.cpp 13
Since it's a Java-like iterator, the QRegularExpressionMatchIterator will
point immediately before the first result. Every result is returned as a
QRegularExpressionMatch object. The
\l{QRegularExpressionMatchIterator::}{hasNext()} function will return true
if there's at least one more result, and
\l{QRegularExpressionMatchIterator::}{next()} will return the next result
and advance the iterator. Continuing from the previous example:
\snippet code/src_corelib_text_qregularexpression.cpp 14
You can also use \l{QRegularExpressionMatchIterator::}{peekNext()} to get
the next result without advancing the iterator.
It is also possible to simply use the result of
QRegularExpression::globalMatch in a range-based for loop, for instance
like this:
\snippet code/src_corelib_text_qregularexpression.cpp 34
It is possible to pass a starting offset and one or more match options to
the globalMatch() function, exactly like normal matching with match().
\target partial matching
\section1 Partial Matching
A \e{partial match} is obtained when the end of the subject string is
reached, but more characters are needed to successfully complete the match.
Note that a partial match is usually much less efficient than a normal
match because many optimizations of the matching algorithm cannot be
employed.
A partial match must be explicitly requested by specifying a match type of
PartialPreferCompleteMatch or PartialPreferFirstMatch when calling
QRegularExpression::match or QRegularExpression::globalMatch. If a partial
match is found, then calling the \l{QRegularExpressionMatch::}{hasMatch()}
function on the QRegularExpressionMatch object returned by match() will
return \c{false}, but \l{QRegularExpressionMatch::}{hasPartialMatch()} will return
\c{true}.
When a partial match is found, no captured substrings are returned, and the
(implicit) capturing group 0 corresponding to the whole match captures the
partially matched substring of the subject string.
Note that asking for a partial match can still lead to a complete match, if
one is found; in this case, \l{QRegularExpressionMatch::}{hasMatch()} will
return \c{true} and \l{QRegularExpressionMatch::}{hasPartialMatch()}
\c{false}. It never happens that a QRegularExpressionMatch reports both a
partial and a complete match.
Partial matching is mainly useful in two scenarios: validating user input
in real time and incremental/multi-segment matching.
\target validating user input
\section2 Validating user input
Suppose that we would like the user to input a date in a specific
format, for instance "MMM dd, yyyy". We can check the input validity with
a pattern like:
\c{^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d\d?, \d\d\d\d$}
(This pattern doesn't catch invalid days, but let's keep it for the
example's purposes).
We would like to validate the input with this regular expression \e{while}
the user is typing it, so that we can report an error in the input as soon
as it is committed (for instance, the user typed the wrong key). In order
to do so we must distinguish three cases:
\list
\li the input cannot possibly match the regular expression;
\li the input does match the regular expression;
\li the input does not match the regular expression right now,
but it will if more characters are added to it.
\endlist
Note that these three cases represent exactly the possible states of a
QValidator (see the QValidator::State enum).
In particular, in the last case we want the regular expression engine to
report a partial match: we are successfully matching the pattern against
the subject string but the matching cannot continue because the end of the
subject is encountered. Notice, however, that the matching algorithm should
continue and try all possibilities, and in case a complete (non-partial)
match is found, then this one should be reported, and the input string
accepted as fully valid.
This behavior is implemented by the PartialPreferCompleteMatch match type.
For instance:
\snippet code/src_corelib_text_qregularexpression.cpp 15
If matching the same regular expression against the subject string leads to
a complete match, it is reported as usual:
\snippet code/src_corelib_text_qregularexpression.cpp 16
Another example with a different pattern, showing the behavior of
preferring a complete match over a partial one:
\snippet code/src_corelib_text_qregularexpression.cpp 17
In this case, the subpattern \c{abc\\w+X} partially matches the subject
string; however, the subpattern \c{def} matches the subject string
completely, and therefore a complete match is reported.
If multiple partial matches are found when matching (but no complete
match), then the QRegularExpressionMatch object will report the first one
that is found. For instance:
\snippet code/src_corelib_text_qregularexpression.cpp 18
\section2 Incremental/multi-segment matching
Incremental matching is another use case of partial matching. Suppose that
we want to find the occurrences of a regular expression inside a large text
(that is, substrings matching the regular expression). In order to do so we
would like to "feed" the large text to the regular expression engine in
smaller chunks. The obvious problem is what happens if the substring that
matches the regular expression spans across two or more chunks.
In this case, the regular expression engine should report a partial match,
so that we can match again adding new data and (eventually) get a complete
match. This implies that the regular expression engine may assume that
there are other characters \e{beyond the end} of the subject string. This
is not to be taken literally -- the engine will never try to access
any character after the last one in the subject.
QRegularExpression implements this behavior when using the
PartialPreferFirstMatch match type. This match type reports a partial match
as soon as it is found, and other match alternatives are not tried
(even if they could lead to a complete match). For instance:
\snippet code/src_corelib_text_qregularexpression.cpp 19
This happens because when matching the first branch of the alternation
operator a partial match is found, and therefore matching stops, without
trying the second branch. Another example:
\snippet code/src_corelib_text_qregularexpression.cpp 20
This shows what could seem a counterintuitive behavior of quantifiers:
since \c{?} is greedy, then the engine tries first to continue the match
after having matched \c{"abc"}; but then the matching reaches the end of the
subject string, and therefore a partial match is reported. This is
even more surprising in the following example:
\snippet code/src_corelib_text_qregularexpression.cpp 21
It's easy to understand this behavior if we remember that the engine
expects the subject string to be only a substring of the whole text we're
looking for a match in (that is, as we said before, the engine
assumes that there are other characters beyond the end of the subject
string).
Since the \c{*} quantifier is greedy, then reporting a complete match could
be an error, because after the current subject \c{"abc"} there may be other
occurrences of \c{"abc"}. For instance, the complete text could have been
"abcabcX", and therefore the \e{right} match to report (in the complete
text) would have been \c{"abcabc"}; by matching only against the leading
\c{"abc"} we instead get a partial match.
\section1 Error Handling
It is possible for a QRegularExpression object to be invalid because of
syntax errors in the pattern string. The isValid() function will return
true if the regular expression is valid, or false otherwise:
\snippet code/src_corelib_text_qregularexpression.cpp 22
You can get more information about the specific error by calling the
errorString() function; moreover, the patternErrorOffset() function
will return the offset inside the pattern string at which the error was found:
\snippet code/src_corelib_text_qregularexpression.cpp 23
If a match is attempted with an invalid QRegularExpression, then the
returned QRegularExpressionMatch object will be invalid as well (that is,
its \l{QRegularExpressionMatch::}{isValid()} function will return false).
The same applies for attempting a global match.
\section1 Unsupported Perl-compatible Regular Expressions Features
QRegularExpression does not support all the features available in
Perl-compatible regular expressions. The most notable one is the fact that
duplicated names for capturing groups are not supported, and using them can
lead to undefined behavior.
This may change in a future version of Qt.
\section1 Debugging Code that Uses QRegularExpression
QRegularExpression internally uses a just in time compiler (JIT) to
optimize the execution of the matching algorithm. The JIT makes extensive
usage of self-modifying code, which can lead debugging tools such as
Valgrind to crash. You must enable all checks for self-modifying code if
you want to debug programs using QRegularExpression (for instance, Valgrind's
\c{--smc-check} command line option). The downside of enabling such checks
is that your program will run considerably slower.
To avoid that, the JIT is disabled by default if you compile Qt in debug
mode. It is possible to override the default and enable or disable the JIT
usage (both in debug or release mode) by setting the
\c{QT_ENABLE_REGEXP_JIT} environment variable to a non-zero or zero value
respectively.
\sa QRegularExpressionMatch, QRegularExpressionMatchIterator
*/
/*!
\class QRegularExpressionMatch
\inmodule QtCore
\reentrant
\brief The QRegularExpressionMatch class provides the results of matching
a QRegularExpression against a string.
\since 5.0
\ingroup tools
\ingroup shared
\ingroup string-processing
\keyword regular expression match
A QRegularExpressionMatch object can be obtained by calling the
QRegularExpression::match() function, or as a single result of a global
match from a QRegularExpressionMatchIterator.
The success or the failure of a match attempt can be inspected by calling
the hasMatch() function. QRegularExpressionMatch also reports a successful
partial match through the hasPartialMatch() function.
In addition, QRegularExpressionMatch returns the substrings captured by the
capturing groups in the pattern string. The implicit capturing group with
index 0 captures the result of the whole match. The captured() function
returns each substring captured, either by the capturing group's index or
by its name:
\snippet code/src_corelib_text_qregularexpression.cpp 29
For each captured substring it is possible to query its starting and ending
offsets in the subject string by calling the capturedStart() and the
capturedEnd() function, respectively. The length of each captured
substring is available using the capturedLength() function.
The convenience function capturedTexts() will return \e{all} the captured
substrings at once (including the substring matched by the entire pattern)
in the order they have been captured by capturing groups; that is,
\c{captured(i) == capturedTexts().at(i)}.
You can retrieve the QRegularExpression object the subject string was
matched against by calling the regularExpression() function; the
match type and the match options are available as well by calling
the matchType() and the matchOptions() respectively.
Please refer to the QRegularExpression documentation for more information
about the Qt regular expression classes.
\sa QRegularExpression
*/
/*!
\class QRegularExpressionMatchIterator
\inmodule QtCore
\reentrant
\brief The QRegularExpressionMatchIterator class provides an iterator on
the results of a global match of a QRegularExpression object against a string.
\since 5.0
\ingroup tools
\ingroup shared
\ingroup string-processing
\keyword regular expression iterator
A QRegularExpressionMatchIterator object is a forward only Java-like
iterator; it can be obtained by calling the
QRegularExpression::globalMatch() function. A new
QRegularExpressionMatchIterator will be positioned before the first result.
You can then call the hasNext() function to check if there are more
results available; if so, the next() function will return the next
result and advance the iterator.
Each result is a QRegularExpressionMatch object holding all the information
for that result (including captured substrings).
For instance:
\snippet code/src_corelib_text_qregularexpression.cpp 30
Moreover, QRegularExpressionMatchIterator offers a peekNext() function
to get the next result \e{without} advancing the iterator.
Starting with Qt 6.0, it is also possible to simply use the result of
QRegularExpression::globalMatch in a range-based for loop, for instance
like this:
\snippet code/src_corelib_text_qregularexpression.cpp 34
You can retrieve the QRegularExpression object the subject string was
matched against by calling the regularExpression() function; the
match type and the match options are available as well by calling
the matchType() and the matchOptions() respectively.
Please refer to the QRegularExpression documentation for more information
about the Qt regular expression classes.
\sa QRegularExpression, QRegularExpressionMatch
*/
/*!
\enum QRegularExpression::PatternOption
The PatternOption enum defines modifiers to the way the pattern string
should be interpreted, and therefore the way the pattern matches against a
subject string.
\value NoPatternOption
No pattern options are set.
\value CaseInsensitiveOption
The pattern should match against the subject string in a case
insensitive way. This option corresponds to the \c{/i} modifier in Perl
regular expressions.
\value DotMatchesEverythingOption
The dot metacharacter (\c{.}) in the pattern string is allowed to match
any character in the subject string, including newlines (normally, the
dot does not match newlines). This option corresponds to the \c{/s}
modifier in Perl regular expressions.
\value MultilineOption
The caret (\c{^}) and the dollar (\c{$}) metacharacters in the pattern
string are allowed to match, respectively, immediately after and
immediately before any newline in the subject string, as well as at the
very beginning and at the very end of the subject string. This option
corresponds to the \c{/m} modifier in Perl regular expressions.
\value ExtendedPatternSyntaxOption
Any whitespace in the pattern string which is not escaped and outside a
character class is ignored. Moreover, an unescaped sharp (\b{#})
outside a character class causes all the following characters, until
the first newline (included), to be ignored. This can be used to
increase the readability of a pattern string as well as put comments
inside regular expressions; this is particularly useful if the pattern
string is loaded from a file or written by the user, because in C++
code it is always possible to use the rules for string literals to put
comments outside the pattern string. This option corresponds to the \c{/x}
modifier in Perl regular expressions.
\value InvertedGreedinessOption
The greediness of the quantifiers is inverted: \c{*}, \c{+}, \c{?},
\c{{m,n}}, etc. become lazy, while their lazy versions (\c{*?},
\c{+?}, \c{??}, \c{{m,n}?}, etc.) become greedy. There is no equivalent
for this option in Perl regular expressions.
\value DontCaptureOption
The non-named capturing groups do not capture substrings; named
capturing groups still work as intended, as well as the implicit
capturing group number 0 corresponding to the entire match. There is no
equivalent for this option in Perl regular expressions.
\value UseUnicodePropertiesOption
The meaning of the \c{\w}, \c{\d}, etc., character classes, as well as
the meaning of their counterparts (\c{\W}, \c{\D}, etc.), is changed
from matching ASCII characters only to matching any character with the
corresponding Unicode property. For instance, \c{\d} is changed to
match any character with the Unicode Nd (decimal digit) property;
\c{\w} to match any character with either the Unicode L (letter) or N
(digit) property, plus underscore, and so on. This option corresponds
to the \c{/u} modifier in Perl regular expressions.
*/
/*!
\enum QRegularExpression::MatchType
The MatchType enum defines the type of the match that should be attempted
against the subject string.
\value NormalMatch
A normal match is done.
\value PartialPreferCompleteMatch
The pattern string is matched partially against the subject string. If
a partial match is found, then it is recorded, and other matching
alternatives are tried as usual. If a complete match is then found,
then it's preferred to the partial match; in this case only the
complete match is reported. If instead no complete match is found (but
only the partial one), then the partial one is reported.
\value PartialPreferFirstMatch
The pattern string is matched partially against the subject string. If
a partial match is found, then matching stops and the partial match is
reported. In this case, other matching alternatives (potentially
leading to a complete match) are not tried. Moreover, this match type
assumes that the subject string is only a substring of a larger text, and
that (in this text) there are other characters beyond the end of the
subject string. This can lead to surprising results; see the discussion
in the \l{partial matching} section for more details.
\value NoMatch
No matching is done. This value is returned as the match type by a
default constructed QRegularExpressionMatch or
QRegularExpressionMatchIterator. Using this match type is not very
useful for the user, as no matching ever happens. This enum value
has been introduced in Qt 5.1.
*/
/*!
\enum QRegularExpression::MatchOption
\value NoMatchOption
No match options are set.
\value AnchoredMatchOption
Use AnchorAtOffsetMatchOption instead.
\value AnchorAtOffsetMatchOption
The match is constrained to start exactly at the offset passed to
match() in order to be successful, even if the pattern string does not
contain any metacharacter that anchors the match at that point.
Note that passing this option does not anchor the end of the match
to the end of the subject; if you want to fully anchor a regular
expression, use anchoredPattern().
This enum value has been introduced in Qt 6.0.
\value DontCheckSubjectStringMatchOption
The subject string is not checked for UTF-16 validity before
attempting a match. Use this option with extreme caution, as
attempting to match an invalid string may crash the program and/or
constitute a security issue. This enum value has been introduced in
Qt 5.4.
*/
/*!
\internal
*/
static int convertToPcreOptions(QRegularExpression::PatternOptions patternOptions)
{
    // Translation table: each row pairs one QRegularExpression pattern option
    // with the PCRE2 compile-time flag that is OR-ed in when it is set.
    static constexpr struct {
        QRegularExpression::PatternOption qtOption;
        unsigned int pcreFlag;
    } optionTable[] = {
        { QRegularExpression::CaseInsensitiveOption,       PCRE2_CASELESS },
        { QRegularExpression::DotMatchesEverythingOption,  PCRE2_DOTALL },
        { QRegularExpression::MultilineOption,             PCRE2_MULTILINE },
        { QRegularExpression::ExtendedPatternSyntaxOption, PCRE2_EXTENDED },
        { QRegularExpression::InvertedGreedinessOption,    PCRE2_UNGREEDY },
        { QRegularExpression::DontCaptureOption,           PCRE2_NO_AUTO_CAPTURE },
        { QRegularExpression::UseUnicodePropertiesOption,  PCRE2_UCP },
    };

    // Options without a row in the table contribute nothing to the result.
    int pcreOptions = 0;
    for (const auto &row : optionTable) {
        if (patternOptions & row.qtOption)
            pcreOptions |= row.pcreFlag;
    }
    return pcreOptions;
}
/*!
\internal
*/
static int convertToPcreOptions(QRegularExpression::MatchOptions matchOptions)
{
    // Translate each supported match option into its PCRE2 flag (or no bits
    // when the option is not set); any other match option is ignored here.
    const auto anchorBits =
            (matchOptions & QRegularExpression::AnchorAtOffsetMatchOption)
            ? PCRE2_ANCHORED : 0u;
    const auto skipUtfCheckBits =
            (matchOptions & QRegularExpression::DontCheckSubjectStringMatchOption)
            ? PCRE2_NO_UTF_CHECK : 0u;

    // The PCRE2 flags are combined as unsigned values and handed back through
    // the int return type, exactly as the callers expect.
    return static_cast<int>(anchorBits | skipUtfCheckBits);
}
// Reference-counted private data of QRegularExpression (derives from
// QSharedData): holds the pattern string, its pattern options, and the state
// of the compiled PCRE2 pattern together with the mutex that guards it.
struct QRegularExpressionPrivate : QSharedData
{
QRegularExpressionPrivate();
~QRegularExpressionPrivate();
// Copy constructor; per the note on compiledPattern below, a copy (i.e. a
// detach) does not carry over the compiled PCRE code pointer.
QRegularExpressionPrivate(const QRegularExpressionPrivate &other);
// Helpers managing the compiled pattern. NOTE(review): their definitions are
// outside this declaration; see the implementations for the exact contract.
void cleanCompiledPattern();
void compilePattern();
void getPatternInfo();
void optimizePattern();
// Argument type for doMatch(). NOTE(review): presumably selects whether the
// subject string is validated before matching (cf. the public
// DontCheckSubjectStringMatchOption) -- confirm against doMatch().
enum CheckSubjectStringOption {
CheckSubjectString,
DontCheckSubjectString
};
// Performs a match, storing the outcome in *priv. `offset` is the position
// in the subject at which to start; `previous` optionally points to an
// earlier match result (nullptr by default). Checking of the subject string
// is enabled by default.
void doMatch(QRegularExpressionMatchPrivate *priv,
qsizetype offset,
CheckSubjectStringOption checkSubjectStringOption = CheckSubjectString,
const QRegularExpressionMatchPrivate *previous = nullptr) const;
// Looks up the index of the capturing group called `name`.
// NOTE(review): the value returned for an unknown name is not visible here.
int captureIndexForName(QAnyStringView name) const;
// sizeof(QSharedData) == 4, so start our members with an enum
// (presumably so that the flags value sits right after the 4-byte base
// instead of leaving padding there -- member order is deliberate).
QRegularExpression::PatternOptions patternOptions;
QString pattern;
// *All* of the following members are managed while holding this mutex,
// except for isDirty which is set to true by QRegularExpression setters
// (right after a detach happened).
mutable QMutex mutex;
// The PCRE code pointer is reference-counted by the QRegularExpressionPrivate
// objects themselves; when the private is copied (i.e. a detach happened)
// it is set to nullptr
pcre2_code_16 *compiledPattern;
// NOTE(review): presumably the error code and the offset inside the pattern
// string reported when compiling the pattern fails -- confirm in
// compilePattern().
int errorCode;
qsizetype errorOffset;
// NOTE(review): presumably the number of capturing groups in the compiled
// pattern and whether it uses CRLF-style newlines, as filled in by
// getPatternInfo() -- confirm there.
int capturingCount;
bool usingCrLfNewlines;
// Set to true by the QRegularExpression setters (see the mutex comment
// above); the only member written without holding the mutex.
bool isDirty;
};
// Shared (QSharedData-based) private state describing one match attempt: the
// inputs of the match (regular expression, subject, match type and options)
// are const members, the outcome is stored in the non-const ones.
struct QRegularExpressionMatchPrivate : QSharedData
{
QRegularExpressionMatchPrivate(const QRegularExpression &re,
const QString &subjectStorage,
QStringView subject,
QRegularExpression::MatchType matchType,
QRegularExpression::MatchOptions matchOptions);
// Defined outside this part of the file.
// NOTE(review): presumably computes the match following this one -- confirm.
QRegularExpressionMatch nextMatch() const;
const QRegularExpression regularExpression;
// subject is what we match upon. If we've been asked to match over
// a QString, then subjectStorage is a copy of that string
// (so that it's kept alive by us)
const QString subjectStorage;
const QStringView subject;
const QRegularExpression::MatchType matchType;
const QRegularExpression::MatchOptions matchOptions;
// the capturedOffsets vector contains pairs of (start, end) positions
// for each captured substring
QList<qsizetype> capturedOffsets;
// The members below default to "no match, not valid".
// NOTE(review): presumably filled in by QRegularExpressionPrivate::doMatch(),
// which receives a pointer to this struct -- confirm against its definition.
int capturedCount = 0;
bool hasMatch = false;
bool hasPartialMatch = false;
bool isValid = false;
};
// Shared (QSharedData-based) private state of a match iterator: a stored
// match result together with the regular expression, match type and match
// options the iteration was set up with.
struct QRegularExpressionMatchIteratorPrivate : QSharedData
{
QRegularExpressionMatchIteratorPrivate(const QRegularExpression &re,
QRegularExpression::MatchType matchType,
QRegularExpression::MatchOptions matchOptions,
const QRegularExpressionMatch &next);
// Defined outside this part of the file.
bool hasNext() const;
// The only non-const member.
// NOTE(review): presumably the match handed out by the next iteration step -- confirm.
QRegularExpressionMatch next;
const QRegularExpression regularExpression;
const QRegularExpression::MatchType matchType;
const QRegularExpression::MatchOptions matchOptions;
};
/*!
\internal
Used to centralize the warning about using an invalid QRegularExpression.
In case the pattern is an illegal UTF-16 string, we can't print it
(pass it to qUtf16Printable, etc.), so we need to check for that.
*/
Q_DECL_COLD_FUNCTION
void qtWarnAboutInvalidRegularExpression(const QString &pattern, const char *where)
{
    // A pattern that is not valid UTF-16 cannot be handed to qUtf16Printable(),
    // so in that case warn without quoting it.
    if (!pattern.isValidUtf16()) {
        qWarning("%s(): called on an invalid QRegularExpression object", where);
        return;
    }

    // Regular case: name the calling function and quote the offending pattern.
    qWarning("%s(): called on an invalid QRegularExpression object "
             "(pattern is '%ls')", where, qUtf16Printable(pattern));
}
/*!
\internal
*/
// Builds a QRegularExpression on top of an already existing private object:
// d is initialized with the address of dd, nothing else is done.
QRegularExpression::QRegularExpression(QRegularExpressionPrivate &dd)
: d(&dd)
{
}
/*!
\internal
*/
QRegularExpressionPrivate::QRegularExpressionPrivate()
: QSharedData(),
patternOptions(),
pattern(),
mutex(),
compiledPattern(nullptr),
errorCode(0),
errorOffset(-1),
capturingCount(0),
usingCrLfNewlines(false),
isDirty(true)
{
}
/*!
\internal
*/
QRegularExpressionPrivate::~QRegularExpressionPrivate()
{
// Releases the compiled PCRE2 code held by this object (cleanCompiledPattern()
// also resets the compile-derived members, which is harmless here).
cleanCompiledPattern();
}
/*!
\internal
Copies the private, which means copying only the pattern and the pattern
options. The compiledPattern pointer is NOT copied (we
do not own it any more), and in general all the members set when
compiling a pattern are set to default values. isDirty is set back to true
so that the pattern has to be recompiled again.
*/
QRegularExpressionPrivate::QRegularExpressionPrivate(const QRegularExpressionPrivate &other)
: QSharedData(other),
patternOptions(other.patternOptions),
pattern(other.pattern),
mutex(),
compiledPattern(nullptr),
errorCode(0),
errorOffset(-1),
capturingCount(0),
usingCrLfNewlines(false),
isDirty(true)
{
}
/*!
\internal
*/
void QRegularExpressionPrivate::cleanCompiledPattern()
{
pcre2_code_free_16(compiledPattern);
compiledPattern = nullptr;
errorCode = 0;
errorOffset = -1;
capturingCount = 0;
usingCrLfNewlines = false;
}
/*!
\internal
*/
void QRegularExpressionPrivate::compilePattern()
{
const QMutexLocker lock(&mutex);
if (!isDirty)
return;
isDirty = false;
cleanCompiledPattern();
int options = convertToPcreOptions(patternOptions);
options |= PCRE2_UTF;
PCRE2_SIZE patternErrorOffset;
compiledPattern = pcre2_compile_16(reinterpret_cast<PCRE2_SPTR16>(pattern.constData()),
pattern.size(),
options,
&errorCode,
&patternErrorOffset,
nullptr);
if (!compiledPattern) {
errorOffset = qsizetype(patternErrorOffset);
return;
} else {
// ignore whatever PCRE2 wrote into errorCode -- leave it to 0 to mean "no error"
errorCode = 0;
}
optimizePattern();
getPatternInfo();
}
/*!
\internal
*/
void QRegularExpressionPrivate::getPatternInfo()
{
    Q_ASSERT(compiledPattern);

    // Number of capturing groups in the pattern.
    pcre2_pattern_info_16(compiledPattern, PCRE2_INFO_CAPTURECOUNT, &capturingCount);

    // Newline convention: ask the compiled pattern first and fall back to the
    // PCRE2 build default if that query does not succeed.
    unsigned int newlineConvention;
    const int newlineQueryResult =
            pcre2_pattern_info_16(compiledPattern, PCRE2_INFO_NEWLINE, &newlineConvention);
    if (newlineQueryResult != 0)
        pcre2_config_16(PCRE2_CONFIG_NEWLINE, &newlineConvention);

    // Conventions under which CRLF counts as a newline.
    switch (newlineConvention) {
    case PCRE2_NEWLINE_CRLF:
    case PCRE2_NEWLINE_ANY:
    case PCRE2_NEWLINE_ANYCRLF:
        usingCrLfNewlines = true;
        break;
    default:
        usingCrLfNewlines = false;
        break;
    }

    // (?J) allows duplicate group names, which Qt does not support: warn about it.
    unsigned int jOptionChanged;
    pcre2_pattern_info_16(compiledPattern, PCRE2_INFO_JCHANGED, &jOptionChanged);
    if (Q_UNLIKELY(jOptionChanged)) {
        qWarning("QRegularExpressionPrivate::getPatternInfo(): the pattern '%ls'\n is using the (?J) option; duplicate capturing group names are not supported by Qt",
                 qUtf16Printable(pattern));
    }
}
/*
Deleter for a pcre2_jit_stack_16, together with the thread_local
std::unique_ptr that uses it to own each thread's JIT stack.
*/
namespace {
// Deleter used by jitStacks: releases a PCRE2 JIT stack, ignoring null pointers.
struct PcreJitStackFree
{
    void operator()(pcre2_jit_stack_16 *stack)
    {
        if (!stack)
            return;
        pcre2_jit_stack_free_16(stack);
    }
};

// One owning pointer per thread (thread_local); constant-initialized, so it
// is empty until a stack gets stored in it.
Q_CONSTINIT static thread_local std::unique_ptr<pcre2_jit_stack_16, PcreJitStackFree> jitStacks;
}
/*!
\internal
*/
// Returns the JIT stack owned by the calling thread's jitStacks pointer, or
// nullptr if that thread has not stored one. The void * argument is ignored.
// NOTE(review): presumably registered with PCRE2 as the JIT stack callback;
// the registration is not visible in this part of the file -- confirm.
static pcre2_jit_stack_16 *qtPcreCallback(void *)
{
return jitStacks.get();
}
/*!
\internal
*/
static bool isJitEnabled()
{
QByteArray jitEnvironment = qgetenv("QT_ENABLE_REGEXP_JIT");
if (!jitEnvironment.isEmpty()) {
bool ok;
int enableJit = jitEnvironment.toInt(&ok);
return ok ? (enableJit != 0) : true;
}
#ifdef QT_DEBUG
return false;
#elif defined(Q_OS_MACOS)
return !qt_mac_runningUnderRosetta();
#else
return true;
#endif
}
/*!
\internal
The purpose of the function is to call pcre2_jit_compile_16, which
JIT-compiles the pattern.
It gets called when a pattern is recompiled by us (in compilePattern()),
under mutex protection.