GNU Octave 10.1.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
 
Loading...
Searching...
No Matches
regexp.cc
Go to the documentation of this file.
1////////////////////////////////////////////////////////////////////////
2//
3// Copyright (C) 2002-2025 The Octave Project Developers
4//
5// See the file COPYRIGHT.md in the top-level directory of this
6// distribution or <https://octave.org/copyright/>.
7//
8// This file is part of Octave.
9//
10// Octave is free software: you can redistribute it and/or modify it
11// under the terms of the GNU General Public License as published by
12// the Free Software Foundation, either version 3 of the License, or
13// (at your option) any later version.
14//
15// Octave is distributed in the hope that it will be useful, but
16// WITHOUT ANY WARRANTY; without even the implied warranty of
17// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18// GNU General Public License for more details.
19//
20// You should have received a copy of the GNU General Public License
21// along with Octave; see the file COPYING. If not, see
22// <https://www.gnu.org/licenses/>.
23//
24////////////////////////////////////////////////////////////////////////
25
26#if defined (HAVE_CONFIG_H)
27# include "config.h"
28#endif
29
30#include <list>
31#include <sstream>
32
33#include "oct-locbuf.h"
34#include "quit.h"
35#include "lo-regexp.h"
36#include "str-vec.h"
37
38#include "defun.h"
39#include "Cell.h"
40#include "error.h"
41#include "errwarn.h"
42#include "oct-map.h"
43#include "ovl.h"
44#include "utils.h"
45
47
48// Replace backslash escapes in a string with the real values. We need
49// two special functions instead of the one in utils.cc because the set
50// of escape sequences used for regexp patterns and replacement strings
51// is different from those used in the *printf functions.
52
53static std::string
54do_regexp_ptn_string_escapes (const std::string& s, bool is_sq_str)
55{
56 std::string retval;
57
58 std::size_t i = 0;
59 std::size_t j = 0;
60 std::size_t len = s.length ();
61
62 retval.resize (len);
63
64 while (j < len)
65 {
66 if (s[j] == '\\' && j+1 < len)
67 {
68 switch (s[++j])
69 {
70 case 'b': // backspace
71 if (is_sq_str)
72 retval[i] = '\b';
73 else
74 {
75 // Pass escape sequence through
76 retval[i] = '\\';
77 retval[++i] = 'b';
78 }
79 break;
80
81 // Translate < and > to PCRE patterns for pseudo-word boundary
82 case '<': // begin word boundary
83 retval.insert (i, "(?<=\\W|^)");
84 i += 8;
85 break;
86
87 case '>': // end word boundary
88 retval.insert (i, "(?=\\W|$)");
89 i += 7;
90 break;
91
92 case 'o': // octal input
93 {
94 bool bad_esc_seq = (j+1 >= len);
95
96 bool brace = false;
97 if (! bad_esc_seq && s[++j] == '{')
98 {
99 brace = true;
100 j++;
101 }
102
103 int tmpi = 0;
104 std::size_t k;
105 for (k = j; k < std::min (j+3+brace, len); k++)
106 {
107 int digit = s[k] - '0';
108 if (digit < 0 || digit > 7)
109 break;
110 tmpi <<= 3;
111 tmpi += digit;
112 }
113 if (bad_esc_seq || (brace && s[k++] != '}'))
114 {
115 tmpi = 0;
116 warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
117 }
118 retval[i] = tmpi;
119 j = k - 1;
120 break;
121 }
122
123 default: // pass escape sequence through
124 retval[i] = '\\';
125 retval[++i] = s[j];
126 break;
127 }
128 }
129 else
130 {
131 retval[i] = s[j];
132 }
133
134 i++;
135 j++;
136 }
137
138 retval.resize (i);
139
140 return retval;
141}
142
143static std::string
144do_regexp_rep_string_escapes (const std::string& s)
145{
146 std::string retval;
147
148 std::size_t i = 0;
149 std::size_t j = 0;
150 std::size_t len = s.length ();
151
152 retval.resize (len);
153
154 while (j < len)
155 {
156 if (s[j] == '\\' && j+1 < len)
157 {
158 switch (s[++j])
159 {
160 case 'a': // alarm
161 retval[i] = '\a';
162 break;
163
164 case 'b': // backspace
165 retval[i] = '\b';
166 break;
167
168 case 'f': // formfeed
169 retval[i] = '\f';
170 break;
171
172 case 'n': // newline
173 retval[i] = '\n';
174 break;
175
176 case 'r': // carriage return
177 retval[i] = '\r';
178 break;
179
180 case 't': // horizontal tab
181 retval[i] = '\t';
182 break;
183
184 case 'v': // vertical tab
185 retval[i] = '\v';
186 break;
187
188 case '0':
189 case '1':
190 case '2':
191 case '3':
192 case '4':
193 case '5':
194 case '6':
195 case '7': // octal input
196 {
197 std::size_t k;
198 int tmpi = s[j] - '0';
199 for (k = j+1; k < std::min (j+3, len); k++)
200 {
201 int digit = s[k] - '0';
202 if (digit < 0 || digit > 7)
203 break;
204 tmpi <<= 3;
205 tmpi += digit;
206 }
207 retval[i] = tmpi;
208 j = k - 1;
209 break;
210 }
211
212 case 'o': // octal input
213 {
214 bool bad_esc_seq = (j+1 >= len);
215
216 bool brace = false;
217 if (! bad_esc_seq && s[++j] == '{')
218 {
219 brace = true;
220 j++;
221 }
222
223 int tmpi = 0;
224 std::size_t k;
225 for (k = j; k < std::min (j+3+brace, len); k++)
226 {
227 int digit = s[k] - '0';
228 if (digit < 0 || digit > 7)
229 break;
230 tmpi <<= 3;
231 tmpi += digit;
232 }
233 if (bad_esc_seq || (brace && s[k++] != '}'))
234 {
235 warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
236 tmpi = 0;
237 }
238 retval[i] = tmpi;
239 j = k - 1;
240 break;
241 }
242
243 case 'x': // hex input
244 {
245 bool bad_esc_seq = (j+1 >= len);
246
247 bool brace = false;
248 if (! bad_esc_seq && s[++j] == '{')
249 {
250 brace = true;
251 j++;
252 }
253
254 int tmpi = 0;
255 std::size_t k;
256 for (k = j; k < std::min (j+2+brace, len); k++)
257 {
258 if (! isxdigit (s[k]))
259 break;
260
261 tmpi <<= 4;
262 int digit = s[k];
263 if (digit >= 'a')
264 tmpi += digit - 'a' + 10;
265 else if (digit >= 'A')
266 tmpi += digit - 'A' + 10;
267 else
268 tmpi += digit - '0';
269 }
270 if (bad_esc_seq || (brace && s[k++] != '}'))
271 {
272 warning (R"(malformed hex escape sequence '\x' -- converting to '\0')");
273 tmpi = 0;
274 }
275 retval[i] = tmpi;
276 j = k - 1;
277 break;
278 }
279
280 // Both dollar sign (for capture buffer) and backslash are
281 // passed through with their escape backslash. The processing
282 // for these must occur during the actual replacement operation
283 // in lo-regexp.cc.
284 case '$': // pass dollar sign through with escape
285 retval[i] = '\\'; retval[++i] = '$';
286 break;
287
288 case '\\': // pass backslash through with escape
289 retval[i] = '\\'; retval[++i] = '\\';
290 break;
291
292 default: // convert escaped character to unescaped char
293 retval[i] = s[j];
294 break;
295 }
296 }
297 else
298 {
299 retval[i] = s[j];
300 }
301
302 i++;
303 j++;
304 }
305
306 retval.resize (i);
307
308 return retval;
309}
310
311static void
312parse_options (regexp::opts& options, const octave_value_list& args,
313 const std::string& who, int skip, bool& extra_args)
314{
315 extra_args = false;
316
317 for (int i = skip; i < args.length (); i++)
318 {
319 std::string str;
320
321 str = args(i).xstring_value ("%s: optional arguments must be strings", who.c_str ());
322
323 std::transform (str.begin (), str.end (), str.begin (), tolower);
324
325 if (str.find ("once", 0) == 0)
326 options.once (true);
327 else if (str.find ("matchcase", 0) == 0)
328 options.case_insensitive (false);
329 else if (str.find ("ignorecase", 0) == 0)
330 options.case_insensitive (true);
331 else if (str.find ("dotall", 0) == 0)
332 options.dotexceptnewline (false);
333 else if (str.find ("stringanchors", 0) == 0)
334 options.lineanchors (false);
335 else if (str.find ("literalspacing", 0) == 0)
336 options.freespacing (false);
337 else if (str.find ("noemptymatch", 0) == 0)
338 options.emptymatch (false);
339 else if (str.find ("dotexceptnewline", 0) == 0)
340 options.dotexceptnewline (true);
341 else if (str.find ("lineanchors", 0) == 0)
342 options.lineanchors (true);
343 else if (str.find ("freespacing", 0) == 0)
344 options.freespacing (true);
345 else if (str.find ("emptymatch", 0) == 0)
346 options.emptymatch (true);
347 else if (str.find ("start", 0) == 0
348 || str.find ("end", 0) == 0
349 || str.find ("tokenextents", 0) == 0
350 || str.find ("match", 0) == 0
351 || str.find ("tokens", 0) == 0
352 || str.find ("names", 0) == 0
353 || str.find ("split", 0) == 0)
354 extra_args = true;
355 else
356 error ("%s: unrecognized option", who.c_str ());
357 }
358}
359
361octregexp (const octave_value_list& args, int nargout,
362 const std::string& who, bool case_insensitive = false)
363{
364 octave_value_list retval;
365
366 int nargin = args.length ();
367
368 // Make sure we have string, pattern
369 const std::string buffer = args(0).string_value ();
370
371 std::string pattern = args(1).string_value ();
372
373 // Rewrite pattern for PCRE
374 pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());
375
376 regexp::opts options;
377 options.case_insensitive (case_insensitive);
378 bool extra_options = false;
379 parse_options (options, args, who, 2, extra_options);
380
381 const regexp::match_data rx_lst
382 = regexp::match (pattern, buffer, options, who);
383
384 string_vector named_pats = rx_lst.named_patterns ();
385
386 std::size_t sz = rx_lst.size ();
387
388 // Converted the linked list in the correct form for the return values
389
390 octave_map nmap (dim_vector ((sz == 0 ? 0 : 1), sz), named_pats);
391
392 retval.resize (7);
393
394 if (sz != 0)
395 {
396 for (int j = 0; j < named_pats.numel (); j++)
397 {
398 Cell ctmp (dim_vector (1, sz));
399 octave_idx_type i = 0;
400
401 for (const auto& match_data : rx_lst)
402 {
403 string_vector named_tokens = match_data.named_tokens ();
404
405 ctmp(i++) = named_tokens(j);
406 }
407
408 nmap.assign (named_pats(j), ctmp);
409 }
410 }
411 retval(5) = nmap;
412
413 if (options.once ())
414 {
415 auto p = rx_lst.begin ();
416
417 retval(4) = (sz ? p->tokens () : Cell ());
418 retval(3) = (sz ? p->match_string () : "");
419 retval(2) = (sz ? p->token_extents () : Matrix ());
420
421 if (sz)
422 {
423 double start = p->start ();
424 double end = p->end ();
425
426 Cell split (dim_vector (1, 2));
427 split(0) = buffer.substr (0, start-1);
428 split(1) = buffer.substr (end);
429
430 retval(6) = split;
431 retval(1) = end;
432 retval(0) = start;
433 }
434 else
435 {
436 retval(6) = buffer;
437 retval(1) = Matrix ();
438 retval(0) = Matrix ();
439 }
440 }
441 else
442 {
443 Cell tokens (dim_vector (1, sz));
444 Cell match_string (dim_vector (1, sz));
445 Cell token_extents (dim_vector (1, sz));
446 NDArray end (dim_vector (1, sz));
447 NDArray start (dim_vector (1, sz));
448 Cell split (dim_vector (1, sz+1));
449 std::size_t sp_start = 0;
450
451 octave_idx_type i = 0;
452 for (const auto& match_data : rx_lst)
453 {
454 double s = match_data.start ();
455 double e = match_data.end ();
456
457 string_vector tmp = match_data.tokens ();
458 tokens(i) = Cell (dim_vector (1, tmp.numel ()), tmp);
459 match_string(i) = match_data.match_string ();
460 token_extents(i) = match_data.token_extents ();
461 end(i) = e;
462 start(i) = s;
463 split(i) = buffer.substr (sp_start, s-sp_start-1);
464 sp_start = e;
465 i++;
466 }
467
468 split(i) = buffer.substr (sp_start);
469
470 retval(6) = split;
471 retval(4) = tokens;
472 retval(3) = match_string;
473 retval(2) = token_extents;
474 retval(1) = end;
475 retval(0) = start;
476 }
477
478 // Alter the order of the output arguments
479
480 if (extra_options)
481 {
482 int n = 0;
483 octave_value_list new_retval;
484 new_retval.resize (nargout);
485
486 bool arg_used[7] {};
487
488 for (int j = 2; j < nargin; j++)
489 {
490 int k = 0;
491 std::string str = args(j).string_value ();
492 std::transform (str.begin (), str.end (), str.begin (), tolower);
493
494 if (str.find ("once", 0) == 0
495 || str.find ("stringanchors", 0) == 0
496 || str.find ("lineanchors", 0) == 0
497 || str.find ("matchcase", 0) == 0
498 || str.find ("ignorecase", 0) == 0
499 || str.find ("dotall", 0) == 0
500 || str.find ("dotexceptnewline", 0) == 0
501 || str.find ("literalspacing", 0) == 0
502 || str.find ("freespacing", 0) == 0
503 || str.find ("noemptymatch", 0) == 0
504 || str.find ("emptymatch", 0) == 0)
505 continue;
506 else if (str.find ("start", 0) == 0)
507 k = 0;
508 else if (str.find ("end", 0) == 0)
509 k = 1;
510 else if (str.find ("tokenextents", 0) == 0)
511 k = 2;
512 else if (str.find ("match", 0) == 0)
513 k = 3;
514 else if (str.find ("tokens", 0) == 0)
515 k = 4;
516 else if (str.find ("names", 0) == 0)
517 k = 5;
518 else if (str.find ("split", 0) == 0)
519 k = 6;
520
521 new_retval(n++) = retval(k);
522 arg_used[k] = true;
523
524 if (n == nargout)
525 break;
526 }
527
528 // Fill in the rest of the arguments
529 if (n < nargout)
530 {
531 for (int j = 0; j < 7; j++)
532 {
533 if (! arg_used[j])
534 new_retval(n++) = retval(j);
535 }
536 }
537
538 retval = new_retval;
539 }
540
541 return retval;
542}
543
545octcellregexp (const octave_value_list& args, int nargout,
546 const std::string& who, bool case_insensitive = false)
547{
548 octave_value_list retval;
549
550 if (args(0).iscell ())
551 {
552 OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
553 octave_value_list new_args = args;
554 Cell cellstr = args(0).cell_value ();
555 if (args(1).iscell ())
556 {
557 Cell cellpat = args(1).cell_value ();
558
559 if (cellpat.numel () == 1)
560 {
561 for (int j = 0; j < nargout; j++)
562 newretval[j].resize (cellstr.dims ());
563
564 new_args(1) = cellpat(0);
565
566 for (octave_idx_type i = 0; i < cellstr.numel (); i++)
567 {
568 new_args(0) = cellstr(i);
569 octave_value_list tmp = octregexp (new_args, nargout, who,
570 case_insensitive);
571
572 for (int j = 0; j < nargout; j++)
573 newretval[j](i) = tmp(j);
574 }
575 }
576 else if (cellstr.numel () == 1)
577 {
578 for (int j = 0; j < nargout; j++)
579 newretval[j].resize (cellpat.dims ());
580
581 new_args(0) = cellstr(0);
582
583 for (octave_idx_type i = 0; i < cellpat.numel (); i++)
584 {
585 new_args(1) = cellpat(i);
586 octave_value_list tmp = octregexp (new_args, nargout, who,
587 case_insensitive);
588
589 for (int j = 0; j < nargout; j++)
590 newretval[j](i) = tmp(j);
591 }
592 }
593 else if (cellstr.numel () == cellpat.numel ())
594 {
595 if (cellstr.dims () != cellpat.dims ())
596 error ("%s: inconsistent cell array dimensions", who.c_str ());
597
598 for (int j = 0; j < nargout; j++)
599 newretval[j].resize (cellstr.dims ());
600
601 for (octave_idx_type i = 0; i < cellstr.numel (); i++)
602 {
603 new_args(0) = cellstr(i);
604 new_args(1) = cellpat(i);
605
606 octave_value_list tmp = octregexp (new_args, nargout, who,
607 case_insensitive);
608
609 for (int j = 0; j < nargout; j++)
610 newretval[j](i) = tmp(j);
611 }
612 }
613 else
614 error ("regexp: cell array arguments must be scalar or equal size");
615 }
616 else
617 {
618 for (int j = 0; j < nargout; j++)
619 newretval[j].resize (cellstr.dims ());
620
621 for (octave_idx_type i = 0; i < cellstr.numel (); i++)
622 {
623 new_args(0) = cellstr(i);
624 octave_value_list tmp = octregexp (new_args, nargout, who,
625 case_insensitive);
626
627 for (int j = 0; j < nargout; j++)
628 newretval[j](i) = tmp(j);
629 }
630 }
631
632 for (int j = 0; j < nargout; j++)
633 retval(j) = octave_value (newretval[j]);
634 }
635 else if (args(1).iscell ())
636 {
637 OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
638 octave_value_list new_args = args;
639 Cell cellpat = args(1).cell_value ();
640
641 for (int j = 0; j < nargout; j++)
642 newretval[j].resize (cellpat.dims ());
643
644 for (octave_idx_type i = 0; i < cellpat.numel (); i++)
645 {
646 new_args(1) = cellpat(i);
647 octave_value_list tmp = octregexp (new_args, nargout, who,
648 case_insensitive);
649
650 for (int j = 0; j < nargout; j++)
651 newretval[j](i) = tmp(j);
652 }
653
654 for (int j = 0; j < nargout; j++)
655 retval(j) = octave_value (newretval[j]);
656 }
657 else
658 retval = octregexp (args, nargout, who, case_insensitive);
659
660 return retval;
661
662}
663
664DEFUN (regexp, args, nargout,
665 doc: /* -*- texinfo -*-
666@deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexp (@var{str}, @var{pat})
667@deftypefnx {} {[@dots{}] =} regexp (@var{str}, @var{pat}, "@var{opt1}", @dots{})
668Regular expression string matching.
669
670Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
671substrings of any matches, or empty values if there are none.
672
673The matched pattern @var{pat} can include any of the standard regex
674operators, including:
675
676@table @code
677@item .
678Match any character
679
680@item * + ? @{@}
681Repetition operators, representing
682
683@table @code
684@item *
685Match zero or more times
686
687@item +
688Match one or more times
689
690@item ?
691Match zero or one times
692
693@item @{@var{n}@}
694Match exactly @var{n} times
695
696@item @{@var{n},@}
697Match @var{n} or more times
698
699@item @{@var{m},@var{n}@}
700Match between @var{m} and @var{n} times
701@end table
702
703@item [@dots{}] [^@dots{}]
704
705List operators. The pattern will match any character listed between
706@qcode{"["} and @qcode{"]"}. If the first character is @qcode{"^"} then the
707pattern is inverted and any character except those listed between brackets
708will match.
709
710Escape sequences defined below can also be used inside list operators. For
711example, a template for a floating point number might be @code{[-+.\d]+}.
712
713@item () (?:)
714Grouping operator. The first form, parentheses only, also creates a token.
715
716@item |
717Alternation operator. Match one of a choice of regular expressions. The
718alternatives must be delimited by the grouping operator @code{()} above.
719
720@item ^ $
721Anchoring operators. Requires pattern to occur at the start (@code{^}) or
722end (@code{$}) of the string.
723@end table
724
725In addition, the following escaped characters have special meaning.
726
727@table @code
728
729@item \d
730Match any digit
731
732@item \D
733Match any non-digit
734
735@item \s
736Match any whitespace character
737
738@item \S
739Match any non-whitespace character
740
741@item \w
742Match any word character
743
744@item \W
745Match any non-word character
746
747@item <
748Match the beginning of a word
749
750@item >
751Match the end of a word
752
753@item \B
754Match within a word
755@end table
756
757Implementation Note: For compatibility with @sc{matlab}, escape sequences
758in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded
759even when @var{pat} has been defined with single quotes. To disable
760expansion use a second backslash before the escape sequence (e.g.,
761"@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate}
762function.
763
764The outputs of @code{regexp} default to the order given below
765
766@table @var
767@item s
768The start indices of each matching substring
769
770@item e
771The end indices of each matching substring
772
773@item te
774The extents of each matched token surrounded by @code{(@dots{})} in
775@var{pat}
776
777@item m
778A cell array of the text of each match
779
780@item t
781A cell array of the text of each token matched
782
783@item nm
784A structure containing the text of each matched named token, with the name
785being used as the fieldname. A named token is denoted by
786@code{(?<name>@dots{})}.
787
788@item sp
789A cell array of the text not returned by match, i.e., what remains if you
790split the string based on @var{pat}.
791@end table
792
793Particular output arguments, or the order of the output arguments, can be
794selected by additional @var{opt} arguments. These are strings and the
795correspondence between the output arguments and the optional argument
796are
797
798@multitable @columnfractions 0.2 0.3 0.3 0.2
799@item @tab @qcode{'start'} @tab @var{s} @tab
800@item @tab @qcode{'end'} @tab @var{e} @tab
801@item @tab @qcode{'tokenExtents'} @tab @var{te} @tab
802@item @tab @qcode{'match'} @tab @var{m} @tab
803@item @tab @qcode{'tokens'} @tab @var{t} @tab
804@item @tab @qcode{'names'} @tab @var{nm} @tab
805@item @tab @qcode{'split'} @tab @var{sp} @tab
806@end multitable
807
808Additional arguments are summarized below.
809
810@table @samp
811@item once
812Return only the first occurrence of the pattern.
813
814@item matchcase
815Make the matching case sensitive. (default)
816
817Alternatively, use (?-i) in the pattern.
818
819@item ignorecase
820Ignore case when matching the pattern to the string.
821
822Alternatively, use (?i) in the pattern.
823
824@item stringanchors
825Match the anchor characters at the beginning and end of the string.
826(default)
827
828Alternatively, use (?-m) in the pattern.
829
830@item lineanchors
831Match the anchor characters at the beginning and end of the line.
832
833Alternatively, use (?m) in the pattern.
834
835@item dotall
836The pattern @code{.} matches all characters including the newline character.
837 (default)
838
839Alternatively, use (?s) in the pattern.
840
841@item dotexceptnewline
842The pattern @code{.} matches all characters except the newline character.
843
844Alternatively, use (?-s) in the pattern.
845
846@item literalspacing
847All characters in the pattern, including whitespace, are significant and are
848used in pattern matching. (default)
849
850Alternatively, use (?-x) in the pattern.
851
852@item freespacing
853The pattern may include arbitrary whitespace and also comments beginning
854with the character @samp{#}.
855
856Alternatively, use (?x) in the pattern.
857
858@item noemptymatch
859Zero-length matches are not returned. (default)
860
861@item emptymatch
862Return zero-length matches.
863
864@code{regexp ('a', 'b*', 'emptymatch')} returns @code{[1 2]} because there
865are zero or more @qcode{'b'} characters at positions 1 and end-of-string.
866
867@end table
868
869Stack Limitation Note: Pattern searches are done with a recursive function
870which can overflow the program stack when there are a high number of matches.
871For example,
872
873@example
874@code{regexp (repmat ('a', 1, 1e5), '(a)+')}
875@end example
876
877@noindent
878may lead to a segfault. As an alternative, consider constructing pattern
879searches that reduce the number of matches (e.g., by creatively using set
880complement), and then further processing the return variables (now reduced in
881size) with successive @code{regexp} searches.
882
883Octave's @code{regexp} implementation is based on the Perl Compatible
884Regular Expressions library (@url{https://www.pcre.org/}). For a more
885comprehensive list of @code{regexp} operator syntax see the
886@url{https://www.pcre.org/current/doc/html/pcre2syntax.html,,
887"PCRE Syntax quick-reference summary"}.
888
889@seealso{regexpi, strfind, regexprep}
890@end deftypefn */)
891{
892 if (args.length () < 2)
893 print_usage ();
894
895 octave_value_list retval;
896
897 if (args(0).iscell () || args(1).iscell ())
898 retval = (octcellregexp (args, (nargout > 0 ? nargout : 1), "regexp"));
899 else
900 retval = octregexp (args, nargout, "regexp");
901
902 return retval;
903}
904
905/*
906## PCRE_ERROR_MATCHLIMIT test
907%!test
908%! s = sprintf ('\t4\n0000\t-0.00\t-0.0000\t4\t-0.00\t-0.0000\t4\n0000\t-0.00\t-0.0000\t0\t-0.00\t-');
909%! ws = warning ("query");
910%! unwind_protect
911%! warning ("off");
912%! regexp (s, '(\s*-*\d+[.]*\d*\s*)+\n');
913%! unwind_protect_cleanup
914%! warning (ws);
915%! end_unwind_protect
916
917## segfault test
918%!assert (regexp ("abcde", "."), [1,2,3,4,5])
919%!assert <*62704> (regexpi('(', '\(?'), 1)
920## Infinite loop test
921%!assert (isempty (regexp ("abcde", "")))
922
923## Check that anchoring of pattern works correctly
924%!assert (regexp ('abcabc', '^abc'), 1)
925%!assert (regexp ('abcabc', 'abc$'), 4)
926%!assert (regexp ('abcabc', '^abc$'), zeros (1,0))
927
928## UTF-8 test with character vector "âé🙂ïõù"
929%!assert (regexp (char ([195, 162, 195, 169, 240, 159, 153, 130, 195, 175, ...
930%! 195, 181, 195, 185]), "."), [1, 3, 5, 9, 11, 13])
931
932%!test
933%! [s, e, te, m, t] = regexp (' No Match ', 'f(.*)uck');
934%! assert (s, zeros (1,0));
935%! assert (e, zeros (1,0));
936%! assert (te, cell (1,0));
937%! assert (m, cell (1,0));
938%! assert (t, cell (1,0));
939
940%!test
941%! [s, e, te, m, t] = regexp (' FiRetrUck ', 'f(.*)uck');
942%! assert (s, zeros (1,0));
943%! assert (e, zeros (1,0));
944%! assert (te, cell (1,0));
945%! assert (m, cell (1,0));
946%! assert (t, cell (1,0));
947
948%!test
949%! [s, e, te, m, t] = regexp (' firetruck ', 'f(.*)uck');
950%! assert (s, 2);
951%! assert (e, 10);
952%! assert (te{1}, [3, 7]);
953%! assert (m{1}, 'firetruck');
954%! assert (t{1}{1}, 'iretr');
955
956%!test
957%! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*');
958%! assert (s, [1, 12]);
959%! assert (e, [5, 17]);
960%! assert (size (te), [1, 2]);
961%! assert (isempty (te{1}));
962%! assert (isempty (te{2}));
963%! assert (m{1}, 'short');
964%! assert (m{2}, 'string');
965%! assert (size (t), [1, 2]);
966%! assert (isempty (t{1}));
967%! assert (isempty (t{2}));
968
969%!test
970%! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*', 'once');
971%! assert (s, 1);
972%! assert (e, 5);
973%! assert (isempty (te));
974%! assert (m, 'short');
975%! assert (isempty (t));
976
977%!test
978%! [m, te, e, s, t] = regexp ('short test string', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
979%! assert (s, 1);
980%! assert (e, 5);
981%! assert (isempty (te));
982%! assert (m, 'short');
983%! assert (isempty (t));
984
985%!test
986%! [s, e, te, m, t, nm] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)');
987%! assert (s, 1);
988%! assert (e, 10);
989%! assert (size (te), [1, 1]);
990%! assert (te{1}, [1,5; 7,10]);
991%! assert (m{1}, 'short test');
992%! assert (size (t), [1, 1]);
993%! assert (t{1}{1}, 'short');
994%! assert (t{1}{2}, 'test');
995%! assert (size (nm), [1, 1]);
996%! assert (! isempty (fieldnames (nm)));
997%! assert (sort (fieldnames (nm)), {'word1';'word2'});
998%! assert (nm.word1, 'short');
999%! assert (nm.word2, 'test');
1000
1001%!test
1002%! [nm, m, te, e, s, t] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1003%! assert (s, 1);
1004%! assert (e, 10);
1005%! assert (size (te), [1, 1]);
1006%! assert (te{1}, [1,5; 7,10]);
1007%! assert (m{1}, 'short test');
1008%! assert (size (t), [1, 1]);
1009%! assert (t{1}{1}, 'short');
1010%! assert (t{1}{2}, 'test');
1011%! assert (size (nm), [1, 1]);
1012%! assert (! isempty (fieldnames (nm)));
1013%! assert (sort (fieldnames (nm)), {'word1';'word2'});
1014%! assert (nm.word1, 'short');
1015%! assert (nm.word2, 'test');
1016
1017%!test
1018%! [t, nm] = regexp ("John Davis\nRogers, James", '(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)', 'tokens', 'names');
1019%! assert (size (t), [1, 2]);
1020%! assert (t{1}{1}, "John");
1021%! assert (t{1}{2}, "Davis");
1022%! assert (t{2}{1}, "Rogers");
1023%! assert (t{2}{2}, "James");
1024%! assert (size (nm), [1, 2]);
1025%! assert (nm(1).first, "John");
1026%! assert (nm(1).last, "Davis");
1027%! assert (nm(2).first, "James");
1028%! assert (nm(2).last, "Rogers");
1029
1030## Tests for nulls in strings properly matching
1031%!test
1032%! str = "A\0B\0\0C";
1033%! ptn = '(\0+)'; # also test null in single-quote pattern
1034%! M = regexp (str, ptn, "match");
1035%! assert (size (M), [1, 2]);
1036%! assert (double (M{1}), [0]);
1037%! assert (double (M{2}), [0, 0]);
1038
1039%!test
1040%! str = "A\0B\0\0C";
1041%! ptn = "(\0+)"; # also test null in double-quote pattern
1042%! T = regexp (str, ptn, "tokens");
1043%! assert (size (T), [1, 2]);
1044%! assert (double (T{1}{1}), [0]);
1045%! assert (double (T{2}{1}), [0, 0]);
1046
1047%!test
1048%! str = "A\0B\0\0C";
1049%! ptn = '(?<namedtoken>\0+)';
1050%! NT = regexp (str, ptn, "names");
1051%! assert (size (NT), [1, 2]);
1052%! assert (double (NT(1).namedtoken), [0]);
1053%! assert (double (NT(2).namedtoken), [0, 0]);
1054
1055## Tests for named tokens
1056%!test
1057%! ## Parenthesis in named token (ie (int)) causes a problem
1058%! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'),
1059%! struct ('typestr', 'int'));
1060
1061%!test <*35683>
1062%! ## Mix of named and unnamed tokens can cause segfault
1063%! str = "abcde";
1064%! ptn = '(?<T1>a)(\w+)(?<T2>d\w+)';
1065%! tokens = regexp (str, ptn, "names");
1066%! assert (isstruct (tokens) && numel (tokens) == 1);
1067%! assert (tokens.T1, "a");
1068%! assert (tokens.T2, "de");
1069
1070## Test options to regexp
1071%!assert (regexp ("abc\nabc", '.'), [1:7])
1072%!assert (regexp ("abc\nabc", '.', 'dotall'), [1:7])
1073%!test
1074%! assert (regexp ("abc\nabc", '(?s).'), [1:7]);
1075%! assert (regexp ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
1076%! assert (regexp ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);
1077
1078%!assert (regexp ("caseCaSe", 'case'), 1)
1079%!assert (regexp ("caseCaSe", 'case', "matchcase"), 1)
1080%!assert (regexp ("caseCaSe", 'case', "ignorecase"), [1,5])
1081%!test
1082%! assert (regexp ("caseCaSe", '(?-i)case'), 1);
1083%! assert (regexp ("caseCaSe", '(?i)case'), [1, 5]);
1084
1085%!assert (regexp ("abc\nabc", 'c$'), 7)
1086%!assert (regexp ("abc\nabc", 'c$', "stringanchors"), 7)
1087%!test
1088%! assert (regexp ("abc\nabc", '(?-m)c$'), 7);
1089%! assert (regexp ("abc\nabc", 'c$',"lineanchors"), [3, 7]);
1090%! assert (regexp ("abc\nabc", '(?m)c$'), [3,7]);
1091
1092%!assert (regexp ("this word", 's w'), 4)
1093%!assert (regexp ("this word", 's w', 'literalspacing'), 4)
1094%!test
1095%! assert (regexp ("this word", '(?-x)s w', 'literalspacing'), 4);
1096%! assert (regexp ("this word", 's w', 'freespacing'), zeros (1,0));
1097%! assert (regexp ("this word", '(?x)s w'), zeros (1,0));
1098
1099%!test
1100%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'noemptymatch');
1101%! assert (s, [1 5]);
1102%! assert (e, [3 5]);
1103%! assert (te, { zeros(0,2), zeros(0,2) });
1104%! assert (m, { "OCT", "V" });
1105%! assert (t, { cell(1,0), cell(1,0) });
1106%! assert (isempty (fieldnames (nm)));
1107%! assert (sp, { "", "A", "E" });
1108
1109%!test
1110%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'noemptymatch');
1111%! assert (s, [1 5]);
1112%! assert (e, [3 5]);
1113%! assert (te, { [1 3], [5 5] });
1114%! assert (m, { "OCT", "V" });
1115%! assert (t, { {"OCT"}, {"V"} });
1116%! assert (isempty (fieldnames (nm)));
1117%! assert (sp, { "", "A", "E" });
1118
1119%!test
1120%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'emptymatch');
1121%! assert (s, [1 4 5 6 7]);
1122%! assert (e, [3 3 5 5 6]);
1123%! assert (te, repmat ({zeros(0,2)}, [1, 5]));
1124%! assert (m, { "OCT", "", "V", "", "" });
1125%! assert (t, repmat({cell(1,0)}, [1, 5]));
1126%! assert (isempty (fieldnames (nm)));
1127%! assert (sp, { "", "", "A", "", "E", "" });
1128
1129%!test
1130%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'emptymatch');
1131%! assert (s, [1 4 5 6 7]);
1132%! assert (e, [3 3 5 5 6]);
1133%! assert (te, { [1 3], [4 3], [5 5], [6 5], [7 6] });
1134%! assert (m, { "OCT", "", "V", "", "" });
1135%! assert (t, { {"OCT"}, {""}, {"V"}, {""}, {""} });
1136%! assert (isempty (fieldnames (nm)));
1137%! assert (sp, { "", "", "A", "", "E", "" });
1138
1139%!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'),
1140%! {6;[1,5,9];zeros(1,0)})
1141%!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}),
1142%! {6;[3,7];[1,9]})
1143%!assert (regexp ('Strings', {'t','s'}), {2, 7})
1144
1145## Test case for lookaround operators
1146%!test
1147%! assert (regexp ('Iraq', 'q(?!u)'), 4);
1148%! assert (regexp ('quit', 'q(?!u)'), zeros (1, 0));
1149%! assert (regexp ('quit', 'q(?=u)' , 'match'), {'q'});
1150%! assert (regexp ("quit", 'q(?=u+)', 'match'), {'q'});
1151%! assert (regexp ("qit", 'q(?=u+)', 'match'), cell (1, 0));
1152%! assert (regexp ("qit", 'q(?=u*)', 'match'), {'q'});
1153%! assert (regexp ('thingamabob', '(?<=a)b'), 9);
1154
1155## Tests for split option.
1156%!shared str
1157%! str = "foo bar foo";
1158%!test
1159%! [a, b] = regexp (str, "f..", "match", "split");
1160%! assert (a, {"foo", "foo"});
1161%! assert (b, {"", " bar ", ""});
1162%!test
1163%! [a, b] = regexp (str, "f..", "match", "split", "once");
1164%! assert (a, "foo");
1165%! assert (b, {"", " bar foo"});
1166%!test
1167%! [a, b] = regexp (str, "fx.", "match", "split");
1168%! assert (a, cell (1, 0));
1169%! assert (b, {"foo bar foo"});
1170%!test
1171%! [a, b] = regexp (str, "fx.", "match", "split", "once");
1172%! assert (a, "");
1173%! assert (b, "foo bar foo");
1174
1175%!shared str
1176%! str = "foo bar";
1177%!test
1178%! [a, b] = regexp (str, "f..", "match", "split");
1179%! assert (a, {"foo"});
1180%! assert (b, {"", " bar"});
1181%!test
1182%! [a, b] = regexp (str, "b..", "match", "split");
1183%! assert (a, {"bar"});
1184%! assert (b, {"foo ", ""});
1185%!test
1186%! [a, b] = regexp (str, "x", "match", "split");
1187%! assert (a, cell (1, 0));
1188%! assert (b, {"foo bar"});
1189%!test
1190%! [a, b] = regexp (str, "[o]+", "match", "split");
1191%! assert (a, {"oo"});
1192%! assert (b, {"f", " bar"});
1193
1194## Test escape sequences are expanded even in single-quoted strings
1195%!assert (regexp ("\n", '\n'), 1)
1196%!assert (regexp ("\n", "\n"), 1)
1197
1198## Test escape sequences are silently converted
1199%!test <*45407>
1200%! assert (regexprep ('s', 's', 'x\.y'), 'x.y');
1201%! assert (regexprep ('s', '(s)', 'x\$1y'), 'x$1y');
1202%! assert (regexprep ('s', '(s)', 'x\\$1y'), 'x\sy');
1203
1204## Test start-of-word / end-of-word patterns for Matlab compatibility
1205%!test <*59992>
1206%! assert (regexp ('foo!+bar', '<\w'), [1, 6]);
1207%! assert (regexp ('foo!+bar', '.>'), [3, 4, 8]);
1208%! assert (regexp ('foo!+bar\nbar!+foo', '.>'), [3, 4, 8, 13, 14, 18]);
1209%! assert (regexp ('foo!+bar\nbar!+foo', '<\w'), [1, 6, 10, 16]);
1210
1211## Test "incomplete" named patterns
1212%!assert <*62705> (regexpi ('<', '\(?<'), 1)
1213%!assert <*62705> (regexpi ('<n>', '\(?<n>'), 1)
1214%!assert <*62705> (regexpi ('<n>', '\(?<n>\)?'), 1)
1215%!assert <62705> (regexpi ('<n>a', '\(?<n>a\)?'), 1)
1216
1217## Test input validation
1218%!error regexp ('string', 'tri', 'BadArg')
1219%!error regexp ('string')
1220
1221*/
1222
1223DEFUN (regexpi, args, nargout,
1224 doc: /* -*- texinfo -*-
1225@deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexpi (@var{str}, @var{pat})
1226@deftypefnx {} {[@dots{}] =} regexpi (@var{str}, @var{pat}, "@var{opt1}", @dots{})
1227
1228Case insensitive regular expression string matching.
1229
1230Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
1231substrings of any matches, or empty values if there are none.
1232@xref{XREFregexp,,@code{regexp}}, for details on the syntax of the search
1233pattern.
1234@seealso{regexp}
1235@end deftypefn */)
1236{
1237 if (args.length () < 2)
1238 print_usage ();
1239
1240 if (args(0).iscell () || args(1).iscell ())
1241 return octcellregexp (args, (nargout > 0 ? nargout : 1), "regexpi", true);
1242 else
1243 return octregexp (args, nargout, "regexpi", true);
1244}
1245
1246/*
1247## segfault test
1248%!assert (regexpi ("abcde", "."), [1,2,3,4,5])
1249
1250## Check that anchoring of pattern works correctly
1251%!assert (regexpi ('abcabc', '^ABC'), 1)
1252%!assert (regexpi ('abcabc', 'ABC$'), 4)
1253%!assert (regexpi ('abcabc', '^ABC$'), zeros (1,0))
1254
1255%!test
1256%! [s, e, te, m, t] = regexpi (' No Match ', 'f(.*)uck');
1257%! assert (s, zeros (1,0));
1258%! assert (e, zeros (1,0));
1259%! assert (te, cell (1,0));
1260%! assert (m, cell (1,0));
1261%! assert (t, cell (1,0));
1262
1263%!test
1264%! [s, e, te, m, t] = regexpi (' FiRetrUck ', 'f(.*)uck');
1265%! assert (s, 2);
1266%! assert (e, 10);
1267%! assert (te{1}, [3, 7]);
1268%! assert (m{1}, 'FiRetrUck');
1269%! assert (t{1}{1}, 'iRetr');
1270
1271%!test
1272%! [s, e, te, m, t] = regexpi (' firetruck ', 'f(.*)uck');
1273%! assert (s, 2);
1274%! assert (e, 10);
1275%! assert (te{1}, [3, 7]);
1276%! assert (m{1}, 'firetruck');
1277%! assert (t{1}{1}, 'iretr');
1278
1279%!test
1280%! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*');
1281%! assert (s, [1, 12]);
1282%! assert (e, [5, 17]);
1283%! assert (size (te), [1, 2]);
1284%! assert (isempty (te{1}));
1285%! assert (isempty (te{2}));
1286%! assert (m{1}, 'ShoRt');
1287%! assert (m{2}, 'String');
1288%! assert (size (t), [1, 2]);
1289%! assert (isempty (t{1}));
1290%! assert (isempty (t{2}));
1291
1292%!test
1293%! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once');
1294%! assert (s, 1);
1295%! assert (e, 5);
1296%! assert (isempty (te));
1297%! assert (m, 'ShoRt');
1298%! assert (isempty (t));
1299
1300%!test
1301%! [m, te, e, s, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1302%! assert (s, 1);
1303%! assert (e, 5);
1304%! assert (isempty (te));
1305%! assert (m, 'ShoRt');
1306%! assert (isempty (t));
1307
1308%!test
1309%! [s, e, te, m, t, nm] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)');
1310%! assert (s, 1);
1311%! assert (e, 10);
1312%! assert (size (te), [1, 1]);
1313%! assert (te{1}, [1,5; 7,10]);
1314%! assert (m{1}, 'ShoRt Test');
1315%! assert (size (t), [1, 1]);
1316%! assert (t{1}{1}, 'ShoRt');
1317%! assert (t{1}{2}, 'Test');
1318%! assert (size (nm), [1, 1]);
1319%! assert (! isempty (fieldnames (nm)));
1320%! assert (sort (fieldnames (nm)), {'word1';'word2'});
1321%! assert (nm.word1, 'ShoRt');
1322%! assert (nm.word2, 'Test');
1323
1324%!test
1325%! [nm, m, te, e, s, t] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1326%! assert (s, 1);
1327%! assert (e, 10);
1328%! assert (size (te), [1, 1]);
1329%! assert (te{1}, [1,5; 7,10]);
1330%! assert (m{1}, 'ShoRt Test');
1331%! assert (size (t), [1, 1]);
1332%! assert (t{1}{1}, 'ShoRt');
1333%! assert (t{1}{2}, 'Test');
1334%! assert (size (nm), [1, 1]);
1335%! assert (! isempty (fieldnames (nm)));
1336%! assert (sort (fieldnames (nm)), {'word1';'word2'});
1337%! assert (nm.word1, 'ShoRt');
1338%! assert (nm.word2, 'Test');
1339
1340%!assert (regexpi ("abc\nabc", '.'), [1:7])
1341%!assert (regexpi ("abc\nabc", '.', 'dotall'), [1:7])
1342%!test
1343%! assert (regexpi ("abc\nabc", '(?s).'), [1:7]);
1344%! assert (regexpi ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
1345%! assert (regexpi ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);
1346
1347%!assert (regexpi ("caseCaSe", 'case'), [1, 5])
1348%!assert (regexpi ("caseCaSe", 'case', "matchcase"), 1)
1349%!assert (regexpi ("caseCaSe", 'case', "ignorecase"), [1, 5])
1350%!test
1351%! assert (regexpi ("caseCaSe", '(?-i)case'), 1);
1352%! assert (regexpi ("caseCaSe", '(?i)case'), [1, 5]);
1353
1354%!assert (regexpi ("abc\nabc", 'C$'), 7)
1355%!assert (regexpi ("abc\nabc", 'C$', "stringanchors"), 7)
1356%!test
1357%! assert (regexpi ("abc\nabc", '(?-m)C$'), 7);
1358%! assert (regexpi ("abc\nabc", 'C$', "lineanchors"), [3, 7]);
1359%! assert (regexpi ("abc\nabc", '(?m)C$'), [3, 7]);
1360
1361%!assert (regexpi ("this word", 'S w'), 4)
1362%!assert (regexpi ("this word", 'S w', 'literalspacing'), 4)
1363%!test
1364%! assert (regexpi ("this word", '(?-x)S w', 'literalspacing'), 4);
1365%! assert (regexpi ("this word", 'S w', 'freespacing'), zeros (1,0));
1366%! assert (regexpi ("this word", '(?x)S w'), zeros (1,0));
1367
1368%!error regexpi ('string', 'tri', 'BadArg')
1369%!error regexpi ('string')
1370
1371%!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'),
1372%! {6;[1,5,9];zeros(1, 0)})
1373%!assert (regexpi ({'asdfg-dfd', '-dfd-dfd-', 'qasfdfdaq'}, '-'),
1374%! {6, [1,5,9], zeros(1,0)})
1375%!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}),
1376%! {6;[3,7];[1,9]})
1377%!assert (regexpi ('Strings', {'t', 's'}), {2, [1, 7]})
1378
1379%!assert (regexpi ("\n", '\n'), 1)
1380%!assert (regexpi ("\n", "\n"), 1)
1381*/
1382
1383static octave_value
1384octregexprep (const octave_value_list& args, const std::string& who)
1385{
1386 int nargin = args.length ();
1387
1388 // Make sure we have string, pattern, replacement
1389 const std::string buffer = args(0).string_value ();
1390
1391 std::string pattern = args(1).string_value ();
1392
1393 // Rewrite pattern for PCRE
1394 pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());
1395
1396 std::string replacement = args(2).string_value ();
1397
1398 // Matlab compatibility.
1399 if (args(2).is_sq_string ())
1400 replacement = do_regexp_rep_string_escapes (replacement);
1401
1402 // Pack options excluding 'tokenize' and various output
1403 // reordering strings into regexp arg list
1404 octave_value_list regexpargs (nargin-3, octave_value ());
1405
1406 int len = 0;
1407 for (int i = 3; i < nargin; i++)
1408 {
1409 const std::string opt = args(i).string_value ();
1410 if (opt != "tokenize" && opt != "start" && opt != "end"
1411 && opt != "tokenextents" && opt != "match" && opt != "tokens"
1412 && opt != "names" && opt != "split" && opt != "warnings")
1413 {
1414 regexpargs(len++) = args(i);
1415 }
1416 }
1417 regexpargs.resize (len);
1418
1419 regexp::opts options;
1420 bool extra_args = false;
1421 parse_options (options, regexpargs, who, 0, extra_args);
1422
1423 return regexp::replace (pattern, buffer, replacement, options, who);
1424}
1425
1426DEFUN (regexprep, args, ,
1427 doc: /* -*- texinfo -*-
1428@deftypefn {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr})
1429@deftypefnx {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr}, "@var{opt1}", @dots{})
1430Replace occurrences of pattern @var{pat} in @var{string} with @var{repstr}.
1431
1432The pattern is a regular expression as documented for @code{regexp}.
1433@xref{XREFregexp,,@code{regexp}}.
1434
1435All strings must be UTF-8 encoded.
1436
1437The replacement string may contain @code{$i}, which substitutes for the ith
1438set of parentheses in the match string. For example,
1439
1440@example
1441regexprep ("Bill Dunn", '(\w+) (\w+)', '$2, $1')
1442@end example
1443
1444@noindent
1445returns @qcode{"Dunn, Bill"}
1446
1447Options in addition to those of @code{regexp} are
1448
1449@table @samp
1450
1451@item once
1452Replace only the first occurrence of @var{pat} in the result.
1453
1454@item warnings
1455This option is present for compatibility but is ignored.
1456
1457@end table
1458
1459Implementation Note: For compatibility with @sc{matlab}, escape sequences
1460in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded
1461even when @var{pat} has been defined with single quotes. To disable
1462expansion use a second backslash before the escape sequence (e.g.,
1463"@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate}
1464function.
1465@seealso{regexp, regexpi, strrep}
1466@end deftypefn */)
1467{
1468 if (args.length () < 3)
1469 print_usage ();
1470
1471 octave_value_list retval;
1472
1473 if (args(0).iscell () || args(1).iscell () || args(2).iscell ())
1474 {
1475 Cell str, pat, rep;
1476 dim_vector dv0;
1477 dim_vector dv1 (1, 1);
1478
1479 if (args(0).iscell ())
1480 str = args(0).cell_value ();
1481 else
1482 str = Cell (args(0));
1483
1484 if (args(1).iscell ())
1485 pat = args(1).cell_value ();
1486 else
1487 pat = Cell (args(1));
1488
1489 if (args(2).iscell ())
1490 rep = args(2).cell_value ();
1491 else
1492 rep = Cell (args(2));
1493
1494 dv0 = str.dims ();
1495 if (pat.numel () != 1)
1496 {
1497 dv1 = pat.dims ();
1498 if (rep.numel () != 1 && dv1 != rep.dims ())
1499 error ("regexprep: inconsistent cell array dimensions");
1500 }
1501 else if (rep.numel () != 1)
1502 dv1 = rep.dims ();
1503
1504 Cell ret (dv0);
1505 octave_value_list new_args = args;
1506
1507 for (octave_idx_type i = 0; i < dv0.numel (); i++)
1508 {
1509 new_args(0) = str(i);
1510 if (pat.numel () == 1)
1511 new_args(1) = pat(0);
1512 if (rep.numel () == 1)
1513 new_args(2) = rep(0);
1514
1515 for (octave_idx_type j = 0; j < dv1.numel (); j++)
1516 {
1517 if (pat.numel () != 1)
1518 new_args(1) = pat(j);
1519 if (rep.numel () != 1)
1520 new_args(2) = rep(j);
1521 new_args(0) = octregexprep (new_args, "regexprep");
1522 }
1523
1524 ret(i) = new_args(0);
1525 }
1526
1527 retval = (args(0).iscell () ? ovl (ret) : ovl (ret(0)));
1528 }
1529 else
1530 retval = octregexprep (args, "regexprep");
1531
1532 return retval;
1533}
1534
1535/*
1536%!test # Replace with empty
1537%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1538%! t = regexprep (xml, '<[!?][^>]*>', '');
1539%! assert (t, ' <tag v="hello">some stuff</tag>');
1540
1541%!test # Replace with non-empty
1542%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1543%! t = regexprep (xml, '<[!?][^>]*>', '?');
1544%! assert (t, '? <tag v="hello">some stuff?</tag>');
1545
1546%!test # Check that 'tokenize' is ignored
1547%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1548%! t = regexprep (xml, '<[!?][^>]*>', '', 'tokenize');
1549%! assert (t, ' <tag v="hello">some stuff</tag>');
1550
1551## Test capture replacement
1552%!test
1553%! data = "Bob Smith\nDavid Hollerith\nSam Jenkins";
1554%! result = "Smith, Bob\nHollerith, David\nJenkins, Sam";
1555%! t = regexprep (data, '(?m)^(\w+)\s+(\w+)$', '$2, $1');
1556%! assert (t, result);
1557
1558## Return the original if no match
1559%!assert (regexprep ('hello', 'world', 'earth'), 'hello')
1560
1561## Test emptymatch option
1562%!assert (regexprep ('World', '^', 'Hello '), 'World')
1563%!assert (regexprep ('World', '^', 'Hello ', 'emptymatch'), 'Hello World')
1564
1565## Test a general replacement
1566%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g")
1567
1568## Make sure replacements work at the beginning and end of string
1569%!assert (regexprep ("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g")
1570%!assert (regexprep ("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_")
1571
1572## Test options "once" and "ignorecase"
1573%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"),
1574%! "a_b]c{d}e-f=g")
1575%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"),
1576%! "a_b_c_d_e_f_g")
1577
1578## Option combinations
1579%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"),
1580%! "a_b]c{d}e-f=g")
1581
1582## End conditions on replacement
1583%!assert (regexprep ("abc", "(b)", ".$1"), "a.bc")
1584%!assert (regexprep ("abc", "(b)", "$1"), "abc")
1585%!assert (regexprep ("abc", "(b)", "$1."), "ab.c")
1586%!assert (regexprep ("abc", "(b)", "$1.."), "ab..c")
1587
1588## Test cell array arguments
1589%!assert (regexprep ("abc", {"b","a"}, "?"), "??c")
1590%!assert (regexprep ({"abc","cba"}, "b", "?"), {"a?c","c?a"})
1591%!assert (regexprep ({"abc","cba"}, {"b","a"}, {"?","!"}), {"!?c","c?!"})
1592
1593## Nasty lookbehind expression
1594%!test
1595%! warning ("off", "Octave:regexp-lookbehind-limit", "local");
1596%! assert (regexprep ('x^(-1)+y(-1)+z(-1)=0', '(?<=[a-z]+)\(\-[1-9]*\)',
1597%! '_minus1'),'x^(-1)+y_minus1+z_minus1=0');
1598
1599## Verify escape sequences in pattern
1600%!assert (regexprep ("\n", '\n', "X"), "X")
1601%!assert (regexprep ("\n", "\n", "X"), "X")
1602
1603## Verify NULLs in pattern and replacement string
1604%!assert (regexprep ("A\0A", "\0", ","), "A,A")
1605%!assert (regexprep ("A\0A", '\0', ","), "A,A")
1606%!assert (regexprep ("A,A", "A", "B\0B"), "B\0B,B\0B")
1607%!assert (regexprep ("A,A", "A", 'B\0B'), "B\0B,B\0B")
1608
1609## Empty matches were broken on ARM architecture
1610%!test <*52810>
1611%! assert (strcmp (regexprep ("\nabc", "^(\t*)(abc)$", "$1$2", "lineanchors"),
1612%! "\nabc"));
1613*/
1614
1615OCTAVE_END_NAMESPACE(octave)
const dim_vector & dims() const
Return a const-reference so that dims ()(i) works efficiently.
Definition Array.h:507
octave_idx_type numel() const
Number of elements in the array.
Definition Array.h:418
Definition Cell.h:41
Vector representing the dimensions (size) of an Array.
Definition dim-vector.h:90
octave_idx_type numel(int n=0) const
Number of elements that a matrix with this dimensions would have.
Definition dim-vector.h:331
void assign(const std::string &k, const Cell &val)
Definition oct-map.h:344
void resize(octave_idx_type n, const octave_value &rfv=octave_value())
Definition ovl.h:115
Cell cell_value() const
Definition ovl.h:103
octave_idx_type length() const
Definition ovl.h:111
string_vector named_patterns() const
Definition lo-regexp.h:210
void dotexceptnewline(bool val)
Definition lo-regexp.h:137
void lineanchors(bool val)
Definition lo-regexp.h:140
void case_insensitive(bool val)
Definition lo-regexp.h:136
void freespacing(bool val)
Definition lo-regexp.h:139
void emptymatch(bool val)
Definition lo-regexp.h:138
void once(bool val)
Definition lo-regexp.h:141
std::string replace(const std::string &buffer, const std::string &replacement) const
Definition lo-regexp.cc:610
match_data match(const std::string &buffer) const
Definition lo-regexp.cc:327
octave_idx_type numel() const
Definition str-vec.h:98
OCTAVE_BEGIN_NAMESPACE(octave) static octave_value daspk_fcn
void print_usage()
Definition defun-int.h:72
#define DEFUN(name, args_name, nargout_name, doc)
Macro to define a builtin function.
Definition defun.h:56
void warning(const char *fmt,...)
Definition error.cc:1078
void error(const char *fmt,...)
Definition error.cc:1003
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition oct-locbuf.h:44
octave_value_list ovl(const OV_Args &... args)
Construct an octave_value_list with less typing.
Definition ovl.h:217
F77_RET_T len
Definition xerbla.cc:61