GNU Octave 7.1.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
regexp.cc
Go to the documentation of this file.
1////////////////////////////////////////////////////////////////////////
2//
3// Copyright (C) 2002-2022 The Octave Project Developers
4//
5// See the file COPYRIGHT.md in the top-level directory of this
6// distribution or <https://octave.org/copyright/>.
7//
8// This file is part of Octave.
9//
10// Octave is free software: you can redistribute it and/or modify it
11// under the terms of the GNU General Public License as published by
12// the Free Software Foundation, either version 3 of the License, or
13// (at your option) any later version.
14//
15// Octave is distributed in the hope that it will be useful, but
16// WITHOUT ANY WARRANTY; without even the implied warranty of
17// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18// GNU General Public License for more details.
19//
20// You should have received a copy of the GNU General Public License
21// along with Octave; see the file COPYING. If not, see
22// <https://www.gnu.org/licenses/>.
23//
24////////////////////////////////////////////////////////////////////////
25
26#if defined (HAVE_CONFIG_H)
27# include "config.h"
28#endif
29
30#include <list>
31#include <sstream>
32
33#include <pcre.h>
34
35#include "base-list.h"
36#include "oct-locbuf.h"
37#include "quit.h"
38#include "lo-regexp.h"
39#include "str-vec.h"
40
41#include "defun.h"
42#include "Cell.h"
43#include "error.h"
44#include "errwarn.h"
45#include "oct-map.h"
46#include "ovl.h"
47#include "utils.h"
48
49OCTAVE_NAMESPACE_BEGIN
50
51// Replace backslash escapes in a string with the real values. We need
52// two special functions instead of the one in utils.cc because the set
53// of escape sequences used for regexp patterns and replacement strings
54// is different from those used in the *printf functions.
55
56static std::string
57do_regexp_ptn_string_escapes (const std::string& s, bool is_sq_str)
58{
59 std::string retval;
60
61 std::size_t i = 0;
62 std::size_t j = 0;
63 std::size_t len = s.length ();
64
65 retval.resize (len);
66
67 while (j < len)
68 {
69 if (s[j] == '\\' && j+1 < len)
70 {
71 switch (s[++j])
72 {
73 case 'b': // backspace
74 if (is_sq_str)
75 retval[i] = '\b';
76 else
77 {
78 // Pass escape sequence through
79 retval[i] = '\\';
80 retval[++i] = 'b';
81 }
82 break;
83
84 // Translate < and > to PCRE patterns for pseudo-word boundary
85 case '<': // begin word boundary
86 retval.insert (i, "(?<=\\W|^)");
87 i += 8;
88 break;
89
90 case '>': // end word boundary
91 retval.insert (i, "(?=\\W|$)");
92 i += 7;
93 break;
94
95 case 'o': // octal input
96 {
97 bool bad_esc_seq = (j+1 >= len);
98
99 bool brace = false;
100 if (! bad_esc_seq && s[++j] == '{')
101 {
102 brace = true;
103 j++;
104 }
105
106 int tmpi = 0;
107 std::size_t k;
108 for (k = j; k < std::min (j+3+brace, len); k++)
109 {
110 int digit = s[k] - '0';
111 if (digit < 0 || digit > 7)
112 break;
113 tmpi <<= 3;
114 tmpi += digit;
115 }
116 if (bad_esc_seq || (brace && s[k++] != '}'))
117 {
118 tmpi = 0;
119 warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
120 }
121 retval[i] = tmpi;
122 j = k - 1;
123 break;
124 }
125
126 default: // pass escape sequence through
127 retval[i] = '\\';
128 retval[++i] = s[j];
129 break;
130 }
131 }
132 else
133 {
134 retval[i] = s[j];
135 }
136
137 i++;
138 j++;
139 }
140
141 retval.resize (i);
142
143 return retval;
144}
145
146static std::string
147do_regexp_rep_string_escapes (const std::string& s)
148{
149 std::string retval;
150
151 std::size_t i = 0;
152 std::size_t j = 0;
153 std::size_t len = s.length ();
154
155 retval.resize (len);
156
157 while (j < len)
158 {
159 if (s[j] == '\\' && j+1 < len)
160 {
161 switch (s[++j])
162 {
163 case 'a': // alarm
164 retval[i] = '\a';
165 break;
166
167 case 'b': // backspace
168 retval[i] = '\b';
169 break;
170
171 case 'f': // formfeed
172 retval[i] = '\f';
173 break;
174
175 case 'n': // newline
176 retval[i] = '\n';
177 break;
178
179 case 'r': // carriage return
180 retval[i] = '\r';
181 break;
182
183 case 't': // horizontal tab
184 retval[i] = '\t';
185 break;
186
187 case 'v': // vertical tab
188 retval[i] = '\v';
189 break;
190
191 case '0':
192 case '1':
193 case '2':
194 case '3':
195 case '4':
196 case '5':
197 case '6':
198 case '7': // octal input
199 {
200 std::size_t k;
201 int tmpi = s[j] - '0';
202 for (k = j+1; k < std::min (j+3, len); k++)
203 {
204 int digit = s[k] - '0';
205 if (digit < 0 || digit > 7)
206 break;
207 tmpi <<= 3;
208 tmpi += digit;
209 }
210 retval[i] = tmpi;
211 j = k - 1;
212 break;
213 }
214
215 case 'o': // octal input
216 {
217 bool bad_esc_seq = (j+1 >= len);
218
219 bool brace = false;
220 if (! bad_esc_seq && s[++j] == '{')
221 {
222 brace = true;
223 j++;
224 }
225
226 int tmpi = 0;
227 std::size_t k;
228 for (k = j; k < std::min (j+3+brace, len); k++)
229 {
230 int digit = s[k] - '0';
231 if (digit < 0 || digit > 7)
232 break;
233 tmpi <<= 3;
234 tmpi += digit;
235 }
236 if (bad_esc_seq || (brace && s[k++] != '}'))
237 {
238 warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
239 tmpi = 0;
240 }
241 retval[i] = tmpi;
242 j = k - 1;
243 break;
244 }
245
246 case 'x': // hex input
247 {
248 bool bad_esc_seq = (j+1 >= len);
249
250 bool brace = false;
251 if (! bad_esc_seq && s[++j] == '{')
252 {
253 brace = true;
254 j++;
255 }
256
257 int tmpi = 0;
258 std::size_t k;
259 for (k = j; k < std::min (j+2+brace, len); k++)
260 {
261 if (! isxdigit (s[k]))
262 break;
263
264 tmpi <<= 4;
265 int digit = s[k];
266 if (digit >= 'a')
267 tmpi += digit - 'a' + 10;
268 else if (digit >= 'A')
269 tmpi += digit - 'A' + 10;
270 else
271 tmpi += digit - '0';
272 }
273 if (bad_esc_seq || (brace && s[k++] != '}'))
274 {
275 warning (R"(malformed hex escape sequence '\x' -- converting to '\0')");
276 tmpi = 0;
277 }
278 retval[i] = tmpi;
279 j = k - 1;
280 break;
281 }
282
283 // Both dollar sign (for capture buffer) and backslash are
284 // passed through with their escape backslash. The processing
285 // for these must occur during the actual replacement operation
286 // in lo-regexp.cc.
287 case '$': // pass dollar sign through with escape
288 retval[i] = '\\'; retval[++i] = '$';
289 break;
290
291 case '\\': // pass backslash through with escape
292 retval[i] = '\\'; retval[++i] = '\\';
293 break;
294
295 default: // convert escaped character to unescaped char
296 retval[i] = s[j];
297 break;
298 }
299 }
300 else
301 {
302 retval[i] = s[j];
303 }
304
305 i++;
306 j++;
307 }
308
309 retval.resize (i);
310
311 return retval;
312}
313
314static void
315parse_options (regexp::opts& options, const octave_value_list& args,
316 const std::string& who, int skip, bool& extra_args)
317{
318 extra_args = false;
319
320 for (int i = skip; i < args.length (); i++)
321 {
322 std::string str;
323
324 str = args(i).xstring_value ("%s: optional arguments must be strings", who.c_str ());
325
326 std::transform (str.begin (), str.end (), str.begin (), tolower);
327
328 if (str.find ("once", 0) == 0)
329 options.once (true);
330 else if (str.find ("matchcase", 0) == 0)
331 options.case_insensitive (false);
332 else if (str.find ("ignorecase", 0) == 0)
333 options.case_insensitive (true);
334 else if (str.find ("dotall", 0) == 0)
335 options.dotexceptnewline (false);
336 else if (str.find ("stringanchors", 0) == 0)
337 options.lineanchors (false);
338 else if (str.find ("literalspacing", 0) == 0)
339 options.freespacing (false);
340 else if (str.find ("noemptymatch", 0) == 0)
341 options.emptymatch (false);
342 else if (str.find ("dotexceptnewline", 0) == 0)
343 options.dotexceptnewline (true);
344 else if (str.find ("lineanchors", 0) == 0)
345 options.lineanchors (true);
346 else if (str.find ("freespacing", 0) == 0)
347 options.freespacing (true);
348 else if (str.find ("emptymatch", 0) == 0)
349 options.emptymatch (true);
350 else if (str.find ("start", 0) == 0
351 || str.find ("end", 0) == 0
352 || str.find ("tokenextents", 0) == 0
353 || str.find ("match", 0) == 0
354 || str.find ("tokens", 0) == 0
355 || str.find ("names", 0) == 0
356 || str.find ("split", 0) == 0)
357 extra_args = true;
358 else
359 error ("%s: unrecognized option", who.c_str ());
360 }
361}
362
364octregexp (const octave_value_list& args, int nargout,
365 const std::string& who, bool case_insensitive = false)
366{
367 octave_value_list retval;
368
369 int nargin = args.length ();
370
371 // Make sure we have string, pattern
372 const std::string buffer = args(0).string_value ();
373
374 std::string pattern = args(1).string_value ();
375
376 // Rewrite pattern for PCRE
377 pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());
378
379 regexp::opts options;
380 options.case_insensitive (case_insensitive);
381 bool extra_options = false;
382 parse_options (options, args, who, 2, extra_options);
383
384 const regexp::match_data rx_lst
385 = regexp::match (pattern, buffer, options, who);
386
387 string_vector named_pats = rx_lst.named_patterns ();
388
389 std::size_t sz = rx_lst.size ();
390
391 // Converted the linked list in the correct form for the return values
392
393 octave_map nmap (dim_vector ((sz == 0 ? 0 : 1), sz), named_pats);
394
395 retval.resize (7);
396
397 if (sz != 0)
398 {
399 for (int j = 0; j < named_pats.numel (); j++)
400 {
401 Cell ctmp (dim_vector (1, sz));
402 octave_idx_type i = 0;
403
404 for (const auto& match_data : rx_lst)
405 {
406 string_vector named_tokens = match_data.named_tokens ();
407
408 ctmp(i++) = named_tokens(j);
409 }
410
411 nmap.assign (named_pats(j), ctmp);
412 }
413 }
414 retval(5) = nmap;
415
416 if (options.once ())
417 {
418 auto p = rx_lst.begin ();
419
420 retval(4) = (sz ? p->tokens () : Cell ());
421 retval(3) = (sz ? p->match_string () : "");
422 retval(2) = (sz ? p->token_extents () : Matrix ());
423
424 if (sz)
425 {
426 double start = p->start ();
427 double end = p->end ();
428
429 Cell split (dim_vector (1, 2));
430 split(0) = buffer.substr (0, start-1);
431 split(1) = buffer.substr (end);
432
433 retval(6) = split;
434 retval(1) = end;
435 retval(0) = start;
436 }
437 else
438 {
439 retval(6) = buffer;
440 retval(1) = Matrix ();
441 retval(0) = Matrix ();
442 }
443 }
444 else
445 {
446 Cell tokens (dim_vector (1, sz));
447 Cell match_string (dim_vector (1, sz));
448 Cell token_extents (dim_vector (1, sz));
449 NDArray end (dim_vector (1, sz));
450 NDArray start (dim_vector (1, sz));
451 Cell split (dim_vector (1, sz+1));
452 std::size_t sp_start = 0;
453
454 octave_idx_type i = 0;
455 for (const auto& match_data : rx_lst)
456 {
457 double s = match_data.start ();
458 double e = match_data.end ();
459
460 string_vector tmp = match_data.tokens ();
461 tokens(i) = Cell (dim_vector (1, tmp.numel ()), tmp);
462 match_string(i) = match_data.match_string ();
463 token_extents(i) = match_data.token_extents ();
464 end(i) = e;
465 start(i) = s;
466 split(i) = buffer.substr (sp_start, s-sp_start-1);
467 sp_start = e;
468 i++;
469 }
470
471 split(i) = buffer.substr (sp_start);
472
473 retval(6) = split;
474 retval(4) = tokens;
475 retval(3) = match_string;
476 retval(2) = token_extents;
477 retval(1) = end;
478 retval(0) = start;
479 }
480
481 // Alter the order of the output arguments
482
483 if (extra_options)
484 {
485 int n = 0;
486 octave_value_list new_retval;
487 new_retval.resize (nargout);
488
489 bool arg_used[7] {};
490
491 for (int j = 2; j < nargin; j++)
492 {
493 int k = 0;
494 std::string str = args(j).string_value ();
495 std::transform (str.begin (), str.end (), str.begin (), tolower);
496
497 if (str.find ("once", 0) == 0
498 || str.find ("stringanchors", 0) == 0
499 || str.find ("lineanchors", 0) == 0
500 || str.find ("matchcase", 0) == 0
501 || str.find ("ignorecase", 0) == 0
502 || str.find ("dotall", 0) == 0
503 || str.find ("dotexceptnewline", 0) == 0
504 || str.find ("literalspacing", 0) == 0
505 || str.find ("freespacing", 0) == 0
506 || str.find ("noemptymatch", 0) == 0
507 || str.find ("emptymatch", 0) == 0)
508 continue;
509 else if (str.find ("start", 0) == 0)
510 k = 0;
511 else if (str.find ("end", 0) == 0)
512 k = 1;
513 else if (str.find ("tokenextents", 0) == 0)
514 k = 2;
515 else if (str.find ("match", 0) == 0)
516 k = 3;
517 else if (str.find ("tokens", 0) == 0)
518 k = 4;
519 else if (str.find ("names", 0) == 0)
520 k = 5;
521 else if (str.find ("split", 0) == 0)
522 k = 6;
523
524 new_retval(n++) = retval(k);
525 arg_used[k] = true;
526
527 if (n == nargout)
528 break;
529 }
530
531 // Fill in the rest of the arguments
532 if (n < nargout)
533 {
534 for (int j = 0; j < 7; j++)
535 {
536 if (! arg_used[j])
537 new_retval(n++) = retval(j);
538 }
539 }
540
541 retval = new_retval;
542 }
543
544 return retval;
545}
546
548octcellregexp (const octave_value_list& args, int nargout,
549 const std::string& who, bool case_insensitive = false)
550{
551 octave_value_list retval;
552
553 if (args(0).iscell ())
554 {
555 OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
556 octave_value_list new_args = args;
557 Cell cellstr = args(0).cell_value ();
558 if (args(1).iscell ())
559 {
560 Cell cellpat = args(1).cell_value ();
561
562 if (cellpat.numel () == 1)
563 {
564 for (int j = 0; j < nargout; j++)
565 newretval[j].resize (cellstr.dims ());
566
567 new_args(1) = cellpat(0);
568
569 for (octave_idx_type i = 0; i < cellstr.numel (); i++)
570 {
571 new_args(0) = cellstr(i);
572 octave_value_list tmp = octregexp (new_args, nargout, who,
573 case_insensitive);
574
575 for (int j = 0; j < nargout; j++)
576 newretval[j](i) = tmp(j);
577 }
578 }
579 else if (cellstr.numel () == 1)
580 {
581 for (int j = 0; j < nargout; j++)
582 newretval[j].resize (cellpat.dims ());
583
584 new_args(0) = cellstr(0);
585
586 for (octave_idx_type i = 0; i < cellpat.numel (); i++)
587 {
588 new_args(1) = cellpat(i);
589 octave_value_list tmp = octregexp (new_args, nargout, who,
590 case_insensitive);
591
592 for (int j = 0; j < nargout; j++)
593 newretval[j](i) = tmp(j);
594 }
595 }
596 else if (cellstr.numel () == cellpat.numel ())
597 {
598 if (cellstr.dims () != cellpat.dims ())
599 error ("%s: inconsistent cell array dimensions", who.c_str ());
600
601 for (int j = 0; j < nargout; j++)
602 newretval[j].resize (cellstr.dims ());
603
604 for (octave_idx_type i = 0; i < cellstr.numel (); i++)
605 {
606 new_args(0) = cellstr(i);
607 new_args(1) = cellpat(i);
608
609 octave_value_list tmp = octregexp (new_args, nargout, who,
610 case_insensitive);
611
612 for (int j = 0; j < nargout; j++)
613 newretval[j](i) = tmp(j);
614 }
615 }
616 else
617 error ("regexp: cell array arguments must be scalar or equal size");
618 }
619 else
620 {
621 for (int j = 0; j < nargout; j++)
622 newretval[j].resize (cellstr.dims ());
623
624 for (octave_idx_type i = 0; i < cellstr.numel (); i++)
625 {
626 new_args(0) = cellstr(i);
627 octave_value_list tmp = octregexp (new_args, nargout, who,
628 case_insensitive);
629
630 for (int j = 0; j < nargout; j++)
631 newretval[j](i) = tmp(j);
632 }
633 }
634
635 for (int j = 0; j < nargout; j++)
636 retval(j) = octave_value (newretval[j]);
637 }
638 else if (args(1).iscell ())
639 {
640 OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
641 octave_value_list new_args = args;
642 Cell cellpat = args(1).cell_value ();
643
644 for (int j = 0; j < nargout; j++)
645 newretval[j].resize (cellpat.dims ());
646
647 for (octave_idx_type i = 0; i < cellpat.numel (); i++)
648 {
649 new_args(1) = cellpat(i);
650 octave_value_list tmp = octregexp (new_args, nargout, who,
651 case_insensitive);
652
653 for (int j = 0; j < nargout; j++)
654 newretval[j](i) = tmp(j);
655 }
656
657 for (int j = 0; j < nargout; j++)
658 retval(j) = octave_value (newretval[j]);
659 }
660 else
661 retval = octregexp (args, nargout, who, case_insensitive);
662
663 return retval;
664
665}
666
667DEFUN (regexp, args, nargout,
668 doc: /* -*- texinfo -*-
669@deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexp (@var{str}, @var{pat})
670@deftypefnx {} {[@dots{}] =} regexp (@var{str}, @var{pat}, "@var{opt1}", @dots{})
671Regular expression string matching.
672
673Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
674substrings of any matches, or empty values if there are none.
675
676The matched pattern @var{pat} can include any of the standard regex
677operators, including:
678
679@table @code
680@item .
681Match any character
682
683@item * + ? @{@}
684Repetition operators, representing
685
686@table @code
687@item *
688Match zero or more times
689
690@item +
691Match one or more times
692
693@item ?
694Match zero or one times
695
696@item @{@var{n}@}
697Match exactly @var{n} times
698
699@item @{@var{n},@}
700Match @var{n} or more times
701
702@item @{@var{m},@var{n}@}
703Match between @var{m} and @var{n} times
704@end table
705
706@item [@dots{}] [^@dots{}]
707
708List operators. The pattern will match any character listed between
709@qcode{"["} and @qcode{"]"}. If the first character is @qcode{"^"} then the
710pattern is inverted and any character except those listed between brackets
711will match.
712
713Escape sequences defined below can also be used inside list operators. For
714example, a template for a floating point number might be @code{[-+.\d]+}.
715
716@item () (?:)
717Grouping operator. The first form, parentheses only, also creates a token.
718
719@item |
720Alternation operator. Match one of a choice of regular expressions. The
721alternatives must be delimited by the grouping operator @code{()} above.
722
723@item ^ $
724Anchoring operators. Requires pattern to occur at the start (@code{^}) or
725end (@code{$}) of the string.
726@end table
727
728In addition, the following escaped characters have special meaning.
729
730@table @code
731
732@item \d
733Match any digit
734
735@item \D
736Match any non-digit
737
738@item \s
739Match any whitespace character
740
741@item \S
742Match any non-whitespace character
743
744@item \w
745Match any word character
746
747@item \W
748Match any non-word character
749
750@item \<
751Match the beginning of a word
752
753@item \>
754Match the end of a word
755
756@item \B
757Match within a word
758@end table
759
760Implementation Note: For compatibility with @sc{matlab}, escape sequences
761in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded
762even when @var{pat} has been defined with single quotes. To disable
763expansion use a second backslash before the escape sequence (e.g.,
764"@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate}
765function.
766
767The outputs of @code{regexp} default to the order given below
768
769@table @var
770@item s
771The start indices of each matching substring
772
773@item e
774The end indices of each matching substring
775
776@item te
777The extents of each matched token surrounded by @code{(@dots{})} in
778@var{pat}
779
780@item m
781A cell array of the text of each match
782
783@item t
784A cell array of the text of each token matched
785
786@item nm
787A structure containing the text of each matched named token, with the name
788being used as the fieldname. A named token is denoted by
789@code{(?<name>@dots{})}.
790
791@item sp
792A cell array of the text not returned by match, i.e., what remains if you
793split the string based on @var{pat}.
794@end table
795
796Particular output arguments, or the order of the output arguments, can be
797selected by additional @var{opt} arguments. These are strings and the
798correspondence between the output arguments and the optional argument
799are
800
801@multitable @columnfractions 0.2 0.3 0.3 0.2
802@item @tab @qcode{'start'} @tab @var{s} @tab
803@item @tab @qcode{'end'} @tab @var{e} @tab
804@item @tab @qcode{'tokenExtents'} @tab @var{te} @tab
805@item @tab @qcode{'match'} @tab @var{m} @tab
806@item @tab @qcode{'tokens'} @tab @var{t} @tab
807@item @tab @qcode{'names'} @tab @var{nm} @tab
808@item @tab @qcode{'split'} @tab @var{sp} @tab
809@end multitable
810
811Additional arguments are summarized below.
812
813@table @samp
814@item once
815Return only the first occurrence of the pattern.
816
817@item matchcase
818Make the matching case sensitive. (default)
819
820Alternatively, use (?-i) in the pattern.
821
822@item ignorecase
823Ignore case when matching the pattern to the string.
824
825Alternatively, use (?i) in the pattern.
826
827@item stringanchors
828Match the anchor characters at the beginning and end of the string.
829(default)
830
831Alternatively, use (?-m) in the pattern.
832
833@item lineanchors
834Match the anchor characters at the beginning and end of the line.
835
836Alternatively, use (?m) in the pattern.
837
838@item dotall
839The pattern @code{.} matches all characters including the newline character.
840 (default)
841
842Alternatively, use (?s) in the pattern.
843
844@item dotexceptnewline
845The pattern @code{.} matches all characters except the newline character.
846
847Alternatively, use (?-s) in the pattern.
848
849@item literalspacing
850All characters in the pattern, including whitespace, are significant and are
851used in pattern matching. (default)
852
853Alternatively, use (?-x) in the pattern.
854
855@item freespacing
856The pattern may include arbitrary whitespace and also comments beginning
857with the character @samp{#}.
858
859Alternatively, use (?x) in the pattern.
860
861@item noemptymatch
862Zero-length matches are not returned. (default)
863
864@item emptymatch
865Return zero-length matches.
866
867@code{regexp ('a', 'b*', 'emptymatch')} returns @code{[1 2]} because there
868are zero or more @qcode{'b'} characters at positions 1 and end-of-string.
869
870@end table
871
872Stack Limitation Note: Pattern searches are done with a recursive function
873which can overflow the program stack when there are a high number of matches.
874For example,
875
876@example
877@code{regexp (repmat ('a', 1, 1e5), '(a)+')}
878@end example
879
880@noindent
881may lead to a segfault. As an alternative, consider constructing pattern
882searches that reduce the number of matches (e.g., by creatively using set
883complement), and then further processing the return variables (now reduced in
884size) with successive @code{regexp} searches.
885@seealso{regexpi, strfind, regexprep}
886@end deftypefn */)
887{
 // Both the string and the pattern arguments are required.
888 if (args.length () < 2)
889 print_usage ();
890
891 octave_value_list retval;
892
 // Cell array inputs are processed element by element.  At least one
 // output is always requested in that case, even when nargout is 0.
893 if (args(0).iscell () || args(1).iscell ())
894 retval = (octcellregexp (args, (nargout > 0 ? nargout : 1), "regexp"));
895 else
896 retval = octregexp (args, nargout, "regexp");
897
898 return retval;
899}
900
901/*
902## PCRE_ERROR_MATCHLIMIT test
903%!test
904%! s = sprintf ('\t4\n0000\t-0.00\t-0.0000\t4\t-0.00\t-0.0000\t4\n0000\t-0.00\t-0.0000\t0\t-0.00\t-');
905%! ws = warning ("query");
906%! unwind_protect
907%! warning ("off");
908%! regexp (s, '(\s*-*\d+[.]*\d*\s*)+\n');
909%! unwind_protect_cleanup
910%! warning (ws);
911%! end_unwind_protect
912
913## segfault test
914%!assert (regexp ("abcde", "."), [1,2,3,4,5])
915## Infinite loop test
916%!assert (isempty (regexp ("abcde", "")))
917
918## Check that anchoring of pattern works correctly
919%!assert (regexp ('abcabc', '^abc'), 1)
920%!assert (regexp ('abcabc', 'abc$'), 4)
921%!assert (regexp ('abcabc', '^abc$'), zeros (1,0))
922
923%!test
924%! [s, e, te, m, t] = regexp (' No Match ', 'f(.*)uck');
925%! assert (s, zeros (1,0));
926%! assert (e, zeros (1,0));
927%! assert (te, cell (1,0));
928%! assert (m, cell (1,0));
929%! assert (t, cell (1,0));
930
931%!test
932%! [s, e, te, m, t] = regexp (' FiRetrUck ', 'f(.*)uck');
933%! assert (s, zeros (1,0));
934%! assert (e, zeros (1,0));
935%! assert (te, cell (1,0));
936%! assert (m, cell (1,0));
937%! assert (t, cell (1,0));
938
939%!test
940%! [s, e, te, m, t] = regexp (' firetruck ', 'f(.*)uck');
941%! assert (s, 2);
942%! assert (e, 10);
943%! assert (te{1}, [3, 7]);
944%! assert (m{1}, 'firetruck');
945%! assert (t{1}{1}, 'iretr');
946
947%!test
948%! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*');
949%! assert (s, [1, 12]);
950%! assert (e, [5, 17]);
951%! assert (size (te), [1, 2]);
952%! assert (isempty (te{1}));
953%! assert (isempty (te{2}));
954%! assert (m{1}, 'short');
955%! assert (m{2}, 'string');
956%! assert (size (t), [1, 2]);
957%! assert (isempty (t{1}));
958%! assert (isempty (t{2}));
959
960%!test
961%! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*', 'once');
962%! assert (s, 1);
963%! assert (e, 5);
964%! assert (isempty (te));
965%! assert (m, 'short');
966%! assert (isempty (t));
967
968%!test
969%! [m, te, e, s, t] = regexp ('short test string', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
970%! assert (s, 1);
971%! assert (e, 5);
972%! assert (isempty (te));
973%! assert (m, 'short');
974%! assert (isempty (t));
975
976%!test
977%! [s, e, te, m, t, nm] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)');
978%! assert (s, 1);
979%! assert (e, 10);
980%! assert (size (te), [1, 1]);
981%! assert (te{1}, [1,5; 7,10]);
982%! assert (m{1}, 'short test');
983%! assert (size (t), [1, 1]);
984%! assert (t{1}{1}, 'short');
985%! assert (t{1}{2}, 'test');
986%! assert (size (nm), [1, 1]);
987%! assert (! isempty (fieldnames (nm)));
988%! assert (sort (fieldnames (nm)), {'word1';'word2'});
989%! assert (nm.word1, 'short');
990%! assert (nm.word2, 'test');
991
992%!test
993%! [nm, m, te, e, s, t] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
994%! assert (s, 1);
995%! assert (e, 10);
996%! assert (size (te), [1, 1]);
997%! assert (te{1}, [1,5; 7,10]);
998%! assert (m{1}, 'short test');
999%! assert (size (t), [1, 1]);
1000%! assert (t{1}{1}, 'short');
1001%! assert (t{1}{2}, 'test');
1002%! assert (size (nm), [1, 1]);
1003%! assert (! isempty (fieldnames (nm)));
1004%! assert (sort (fieldnames (nm)), {'word1';'word2'});
1005%! assert (nm.word1, 'short');
1006%! assert (nm.word2, 'test');
1007
1008%!test
1009%! [t, nm] = regexp ("John Davis\nRogers, James", '(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)', 'tokens', 'names');
1010%! assert (size (t), [1, 2]);
1011%! assert (t{1}{1}, "John");
1012%! assert (t{1}{2}, "Davis");
1013%! assert (t{2}{1}, "Rogers");
1014%! assert (t{2}{2}, "James");
1015%! assert (size (nm), [1, 2]);
1016%! assert (nm(1).first, "John");
1017%! assert (nm(1).last, "Davis");
1018%! assert (nm(2).first, "James");
1019%! assert (nm(2).last, "Rogers");
1020
1021## Tests for nulls in strings properly matching
1022%!test
1023%! str = "A\0B\0\0C";
1024%! ptn = '(\0+)'; # also test null in single-quote pattern
1025%! M = regexp (str, ptn, "match");
1026%! assert (size (M), [1, 2]);
1027%! assert (double (M{1}), [0]);
1028%! assert (double (M{2}), [0, 0]);
1029
1030%!test
1031%! str = "A\0B\0\0C";
1032%! ptn = "(\0+)"; # also test null in double-quote pattern
1033%! T = regexp (str, ptn, "tokens");
1034%! assert (size (T), [1, 2]);
1035%! assert (double (T{1}{1}), [0]);
1036%! assert (double (T{2}{1}), [0, 0]);
1037
1038%!test
1039%! str = "A\0B\0\0C";
1040%! ptn = '(?<namedtoken>\0+)';
1041%! NT = regexp (str, ptn, "names");
1042%! assert (size (NT), [1, 2]);
1043%! assert (double (NT(1).namedtoken), [0]);
1044%! assert (double (NT(2).namedtoken), [0, 0]);
1045
1046## Tests for named tokens
1047%!test
1048%! ## Parenthesis in named token (ie (int)) causes a problem
1049%! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'),
1050%! struct ('typestr', 'int'));
1051
1052%!test <*35683>
1053%! ## Mix of named and unnamed tokens can cause segfault
1054%! str = "abcde";
1055%! ptn = '(?<T1>a)(\w+)(?<T2>d\w+)';
1056%! tokens = regexp (str, ptn, "names");
1057%! assert (isstruct (tokens) && numel (tokens) == 1);
1058%! assert (tokens.T1, "a");
1059%! assert (tokens.T2, "de");
1060
1061## Test options to regexp
1062%!assert (regexp ("abc\nabc", '.'), [1:7])
1063%!assert (regexp ("abc\nabc", '.', 'dotall'), [1:7])
1064%!test
1065%! assert (regexp ("abc\nabc", '(?s).'), [1:7]);
1066%! assert (regexp ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
1067%! assert (regexp ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);
1068
1069%!assert (regexp ("caseCaSe", 'case'), 1)
1070%!assert (regexp ("caseCaSe", 'case', "matchcase"), 1)
1071%!assert (regexp ("caseCaSe", 'case', "ignorecase"), [1,5])
1072%!test
1073%! assert (regexp ("caseCaSe", '(?-i)case'), 1);
1074%! assert (regexp ("caseCaSe", '(?i)case'), [1, 5]);
1075
1076%!assert (regexp ("abc\nabc", 'c$'), 7)
1077%!assert (regexp ("abc\nabc", 'c$', "stringanchors"), 7)
1078%!test
1079%! assert (regexp ("abc\nabc", '(?-m)c$'), 7);
1080%! assert (regexp ("abc\nabc", 'c$',"lineanchors"), [3, 7]);
1081%! assert (regexp ("abc\nabc", '(?m)c$'), [3,7]);
1082
1083%!assert (regexp ("this word", 's w'), 4)
1084%!assert (regexp ("this word", 's w', 'literalspacing'), 4)
1085%!test
1086%! assert (regexp ("this word", '(?-x)s w', 'literalspacing'), 4);
1087%! assert (regexp ("this word", 's w', 'freespacing'), zeros (1,0));
1088%! assert (regexp ("this word", '(?x)s w'), zeros (1,0));
1089
1090%!test
1091%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'noemptymatch');
1092%! assert (s, [1 5]);
1093%! assert (e, [3 5]);
1094%! assert (te, { zeros(0,2), zeros(0,2) });
1095%! assert (m, { "OCT", "V" });
1096%! assert (t, { cell(1,0), cell(1,0) });
1097%! assert (isempty (fieldnames (nm)));
1098%! assert (sp, { "", "A", "E" });
1099
1100%!test
1101%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'noemptymatch');
1102%! assert (s, [1 5]);
1103%! assert (e, [3 5]);
1104%! assert (te, { [1 3], [5 5] });
1105%! assert (m, { "OCT", "V" });
1106%! assert (t, { {"OCT"}, {"V"} });
1107%! assert (isempty (fieldnames (nm)));
1108%! assert (sp, { "", "A", "E" });
1109
1110%!test
1111%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'emptymatch');
1112%! assert (s, [1 4 5 6 7]);
1113%! assert (e, [3 3 5 5 6]);
1114%! assert (te, repmat ({zeros(0,2)}, [1, 5]));
1115%! assert (m, { "OCT", "", "V", "", "" });
1116%! assert (t, repmat({cell(1,0)}, [1, 5]));
1117%! assert (isempty (fieldnames (nm)));
1118%! assert (sp, { "", "", "A", "", "E", "" });
1119
1120%!test
1121%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'emptymatch');
1122%! assert (s, [1 4 5 6 7]);
1123%! assert (e, [3 3 5 5 6]);
1124%! assert (te, { [1 3], [4 3], [5 5], [6 5], [7 6] });
1125%! assert (m, { "OCT", "", "V", "", "" });
1126%! assert (t, { {"OCT"}, {""}, {"V"}, {""}, {""} });
1127%! assert (isempty (fieldnames (nm)));
1128%! assert (sp, { "", "", "A", "", "E", "" });
1129
1130%!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'), {6;[1,5,9];zeros(1,0)})
1131%!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}), {6;[3,7];[1,9]})
1132%!assert (regexp ('Strings', {'t','s'}), {2, 7})
1133
1134## Test case for lookaround operators
1135%!test
1136%! assert (regexp ('Iraq', 'q(?!u)'), 4);
1137%! assert (regexp ('quit', 'q(?!u)'), zeros (1, 0));
1138%! assert (regexp ('quit', 'q(?=u)' , 'match'), {'q'});
1139%! assert (regexp ("quit", 'q(?=u+)', 'match'), {'q'});
1140%! assert (regexp ("qit", 'q(?=u+)', 'match'), cell (1, 0));
1141%! assert (regexp ("qit", 'q(?=u*)', 'match'), {'q'});
1142%! assert (regexp ('thingamabob', '(?<=a)b'), 9);
1143
1144## Tests for split option.
1145%!shared str
1146%! str = "foo bar foo";
1147%!test
1148%! [a, b] = regexp (str, "f..", "match", "split");
1149%! assert (a, {"foo", "foo"});
1150%! assert (b, {"", " bar ", ""});
1151%!test
1152%! [a, b] = regexp (str, "f..", "match", "split", "once");
1153%! assert (a, "foo");
1154%! assert (b, {"", " bar foo"});
1155%!test
1156%! [a, b] = regexp (str, "fx.", "match", "split");
1157%! assert (a, cell (1, 0));
1158%! assert (b, {"foo bar foo"});
1159%!test
1160%! [a, b] = regexp (str, "fx.", "match", "split", "once");
1161%! assert (a, "");;
1162%! assert (b, "foo bar foo");
1163
1164%!shared str
1165%! str = "foo bar";
1166%!test
1167%! [a, b] = regexp (str, "f..", "match", "split");
1168%! assert (a, {"foo"});
1169%! assert (b, {"", " bar"});
1170%!test
1171%! [a, b] = regexp (str, "b..", "match", "split");
1172%! assert (a, {"bar"});
1173%! assert (b, {"foo ", ""});
1174%!test
1175%! [a, b] = regexp (str, "x", "match", "split");
1176%! assert (a, cell (1, 0));
1177%! assert (b, {"foo bar"});
1178%!test
1179%! [a, b] = regexp (str, "[o]+", "match", "split");
1180%! assert (a, {"oo"});
1181%! assert (b, {"f", " bar"});
1182
1183## Test escape sequences are expanded even in single-quoted strings
1184%!assert (regexp ("\n", '\n'), 1)
1185%!assert (regexp ("\n", "\n"), 1)
1186
1187## Test escape sequences are silently converted
1188%!test <*45407>
1189%! assert (regexprep ('s', 's', 'x\.y'), 'x.y');
1190%! assert (regexprep ('s', '(s)', 'x\$1y'), 'x$1y');
1191%! assert (regexprep ('s', '(s)', 'x\\$1y'), 'x\sy');
1192
1193## Test start-of-word / end-of-word patterns for Matlab compatibility
1194%!test <*59992>
1195%! assert (regexp ('foo!+bar', '<\w'), [1, 6]);
1196%! assert (regexp ('foo!+bar', '.>'), [3, 4, 8]);
1197%! assert (regexp ('foo!+bar\nbar!+foo', '.>'), [3, 4, 8, 13, 14, 18]);
1198%! assert (regexp ('foo!+bar\nbar!+foo', '<\w'), [1, 6, 10, 16]);
1199
1200## Test input validation
1201%!error regexp ('string', 'tri', 'BadArg')
1202%!error regexp ('string')
1203
1204*/
1205
1206DEFUN (regexpi, args, nargout,
1207 doc: /* -*- texinfo -*-
1208@deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexpi (@var{str}, @var{pat})
1209@deftypefnx {} {[@dots{}] =} regexpi (@var{str}, @var{pat}, "@var{opt1}", @dots{})
1210
1211Case insensitive regular expression string matching.
1212
1213Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
1214substrings of any matches, or empty values if there are none.
1215@xref{XREFregexp,,@code{regexp}}, for details on the syntax of the search
1216pattern.
1217@seealso{regexp}
1218@end deftypefn */)
1219{
1220 if (args.length () < 2)
1221 print_usage ();
1222
1223 if (args(0).iscell () || args(1).iscell ())
1224 return octcellregexp (args, (nargout > 0 ? nargout : 1), "regexpi", true);
1225 else
1226 return octregexp (args, nargout, "regexpi", true);
1227}
1228
1229/*
1230## segfault test
1231%!assert (regexpi ("abcde", "."), [1,2,3,4,5])
1232
1233## Check that anchoring of pattern works correctly
1234%!assert (regexpi ('abcabc', '^ABC'), 1)
1235%!assert (regexpi ('abcabc', 'ABC$'), 4)
1236%!assert (regexpi ('abcabc', '^ABC$'), zeros (1,0))
1237
1238%!test
1239%! [s, e, te, m, t] = regexpi (' No Match ', 'f(.*)uck');
1240%! assert (s, zeros (1,0));
1241%! assert (e, zeros (1,0));
1242%! assert (te, cell (1,0));
1243%! assert (m, cell (1,0));
1244%! assert (t, cell (1,0));
1245
1246%!test
1247%! [s, e, te, m, t] = regexpi (' FiRetrUck ', 'f(.*)uck');
1248%! assert (s, 2);
1249%! assert (e, 10);
1250%! assert (te{1}, [3, 7]);
1251%! assert (m{1}, 'FiRetrUck');
1252%! assert (t{1}{1}, 'iRetr');
1253
1254%!test
1255%! [s, e, te, m, t] = regexpi (' firetruck ', 'f(.*)uck');
1256%! assert (s, 2);
1257%! assert (e, 10);
1258%! assert (te{1}, [3, 7]);
1259%! assert (m{1}, 'firetruck');
1260%! assert (t{1}{1}, 'iretr');
1261
1262%!test
1263%! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*');
1264%! assert (s, [1, 12]);
1265%! assert (e, [5, 17]);
1266%! assert (size (te), [1, 2]);
1267%! assert (isempty (te{1}));
1268%! assert (isempty (te{2}));
1269%! assert (m{1}, 'ShoRt');
1270%! assert (m{2}, 'String');
1271%! assert (size (t), [1, 2]);
1272%! assert (isempty (t{1}));
1273%! assert (isempty (t{2}));
1274
1275%!test
1276%! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once');
1277%! assert (s, 1);
1278%! assert (e, 5);
1279%! assert (isempty (te));
1280%! assert (m, 'ShoRt');
1281%! assert (isempty (t));
1282
1283%!test
1284%! [m, te, e, s, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1285%! assert (s, 1);
1286%! assert (e, 5);
1287%! assert (isempty (te));
1288%! assert (m, 'ShoRt');
1289%! assert (isempty (t));
1290
1291%!test
1292%! [s, e, te, m, t, nm] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)');
1293%! assert (s, 1);
1294%! assert (e, 10);
1295%! assert (size (te), [1, 1]);
1296%! assert (te{1}, [1,5; 7,10]);
1297%! assert (m{1}, 'ShoRt Test');
1298%! assert (size (t), [1, 1]);
1299%! assert (t{1}{1}, 'ShoRt');
1300%! assert (t{1}{2}, 'Test');
1301%! assert (size (nm), [1, 1]);
1302%! assert (! isempty (fieldnames (nm)));
1303%! assert (sort (fieldnames (nm)), {'word1';'word2'});
1304%! assert (nm.word1, 'ShoRt');
1305%! assert (nm.word2, 'Test');
1306
1307%!test
1308%! [nm, m, te, e, s, t] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1309%! assert (s, 1);
1310%! assert (e, 10);
1311%! assert (size (te), [1, 1]);
1312%! assert (te{1}, [1,5; 7,10]);
1313%! assert (m{1}, 'ShoRt Test');
1314%! assert (size (t), [1, 1]);
1315%! assert (t{1}{1}, 'ShoRt');
1316%! assert (t{1}{2}, 'Test');
1317%! assert (size (nm), [1, 1]);
1318%! assert (! isempty (fieldnames (nm)));
1319%! assert (sort (fieldnames (nm)), {'word1';'word2'});
1320%! assert (nm.word1, 'ShoRt');
1321%! assert (nm.word2, 'Test');
1322
1323%!assert (regexpi ("abc\nabc", '.'), [1:7])
1324%!assert (regexpi ("abc\nabc", '.', 'dotall'), [1:7])
1325%!test
1326%! assert (regexpi ("abc\nabc", '(?s).'), [1:7]);
1327%! assert (regexpi ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
1328%! assert (regexpi ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);
1329
1330%!assert (regexpi ("caseCaSe", 'case'), [1, 5])
1331%!assert (regexpi ("caseCaSe", 'case', "matchcase"), 1)
1332%!assert (regexpi ("caseCaSe", 'case', "ignorecase"), [1, 5])
1333%!test
1334%! assert (regexpi ("caseCaSe", '(?-i)case'), 1);
1335%! assert (regexpi ("caseCaSe", '(?i)case'), [1, 5]);
1336
1337%!assert (regexpi ("abc\nabc", 'C$'), 7)
1338%!assert (regexpi ("abc\nabc", 'C$', "stringanchors"), 7)
1339%!test
1340%! assert (regexpi ("abc\nabc", '(?-m)C$'), 7);
1341%! assert (regexpi ("abc\nabc", 'C$', "lineanchors"), [3, 7]);
1342%! assert (regexpi ("abc\nabc", '(?m)C$'), [3, 7]);
1343
1344%!assert (regexpi ("this word", 'S w'), 4)
1345%!assert (regexpi ("this word", 'S w', 'literalspacing'), 4)
1346%!test
1347%! assert (regexpi ("this word", '(?-x)S w', 'literalspacing'), 4);
1348%! assert (regexpi ("this word", 'S w', 'freespacing'), zeros (1,0));
1349%! assert (regexpi ("this word", '(?x)S w'), zeros (1,0));
1350
1351%!error regexpi ('string', 'tri', 'BadArg')
1352%!error regexpi ('string')
1353
1354%!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'), {6;[1,5,9];zeros(1, 0)})
1355%!assert (regexpi ({'asdfg-dfd', '-dfd-dfd-', 'qasfdfdaq'}, '-'), {6, [1,5,9], zeros(1,0)})
1356%!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}), {6;[3,7];[1,9]})
1357%!assert (regexpi ('Strings', {'t', 's'}), {2, [1, 7]})
1358
1359%!assert (regexpi ("\n", '\n'), 1)
1360%!assert (regexpi ("\n", "\n"), 1)
1361*/
1362
1363static octave_value
1364octregexprep (const octave_value_list& args, const std::string& who)
1365{
1366 int nargin = args.length ();
1367
1368 // Make sure we have string, pattern, replacement
1369 const std::string buffer = args(0).string_value ();
1370
1371 std::string pattern = args(1).string_value ();
1372
1373 // Rewrite pattern for PCRE
1374 pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());
1375
1376 std::string replacement = args(2).string_value ();
1377
1378 // Matlab compatibility.
1379 if (args(2).is_sq_string ())
1380 replacement = do_regexp_rep_string_escapes (replacement);
1381
1382 // Pack options excluding 'tokenize' and various output
1383 // reordering strings into regexp arg list
1384 octave_value_list regexpargs (nargin-3, octave_value ());
1385
1386 int len = 0;
1387 for (int i = 3; i < nargin; i++)
1388 {
1389 const std::string opt = args(i).string_value ();
1390 if (opt != "tokenize" && opt != "start" && opt != "end"
1391 && opt != "tokenextents" && opt != "match" && opt != "tokens"
1392 && opt != "names" && opt != "split" && opt != "warnings")
1393 {
1394 regexpargs(len++) = args(i);
1395 }
1396 }
1397 regexpargs.resize (len);
1398
1399 regexp::opts options;
1400 bool extra_args = false;
1401 parse_options (options, regexpargs, who, 0, extra_args);
1402
1403 return regexp::replace (pattern, buffer, replacement, options, who);
1404}
1405
1406DEFUN (regexprep, args, ,
1407 doc: /* -*- texinfo -*-
1408@deftypefn {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr})
1409@deftypefnx {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr}, "@var{opt1}", @dots{})
1410Replace occurrences of pattern @var{pat} in @var{string} with @var{repstr}.
1411
1412The pattern is a regular expression as documented for @code{regexp}.
1413@xref{XREFregexp,,@code{regexp}}.
1414
1415All strings must be UTF-8 encoded.
1416
1417The replacement string may contain @code{$i}, which substitutes for the ith
1418set of parentheses in the match string. For example,
1419
1420@example
1421regexprep ("Bill Dunn", '(\w+) (\w+)', '$2, $1')
1422@end example
1423
1424@noindent
1425returns @qcode{"Dunn, Bill"}
1426
1427Options in addition to those of @code{regexp} are
1428
1429@table @samp
1430
1431@item once
1432Replace only the first occurrence of @var{pat} in the result.
1433
1434@item warnings
1435This option is present for compatibility but is ignored.
1436
1437@end table
1438
1439Implementation Note: For compatibility with @sc{matlab}, escape sequences
1440in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded
1441even when @var{pat} has been defined with single quotes. To disable
1442expansion use a second backslash before the escape sequence (e.g.,
1443"@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate}
1444function.
1445@seealso{regexp, regexpi, strrep}
1446@end deftypefn */)
1447{
1448 if (args.length () < 3)
1449 print_usage ();
1450
1451 octave_value_list retval;
1452
1453 if (args(0).iscell () || args(1).iscell () || args(2).iscell ())
1454 {
1455 Cell str, pat, rep;
1456 dim_vector dv0;
1457 dim_vector dv1 (1, 1);
1458
1459 if (args(0).iscell ())
1460 str = args(0).cell_value ();
1461 else
1462 str = Cell (args(0));
1463
1464 if (args(1).iscell ())
1465 pat = args(1).cell_value ();
1466 else
1467 pat = Cell (args(1));
1468
1469 if (args(2).iscell ())
1470 rep = args(2).cell_value ();
1471 else
1472 rep = Cell (args(2));
1473
1474 dv0 = str.dims ();
1475 if (pat.numel () != 1)
1476 {
1477 dv1 = pat.dims ();
1478 if (rep.numel () != 1 && dv1 != rep.dims ())
1479 error ("regexprep: inconsistent cell array dimensions");
1480 }
1481 else if (rep.numel () != 1)
1482 dv1 = rep.dims ();
1483
1484 Cell ret (dv0);
1485 octave_value_list new_args = args;
1486
1487 for (octave_idx_type i = 0; i < dv0.numel (); i++)
1488 {
1489 new_args(0) = str(i);
1490 if (pat.numel () == 1)
1491 new_args(1) = pat(0);
1492 if (rep.numel () == 1)
1493 new_args(2) = rep(0);
1494
1495 for (octave_idx_type j = 0; j < dv1.numel (); j++)
1496 {
1497 if (pat.numel () != 1)
1498 new_args(1) = pat(j);
1499 if (rep.numel () != 1)
1500 new_args(2) = rep(j);
1501 new_args(0) = octregexprep (new_args, "regexprep");
1502 }
1503
1504 ret(i) = new_args(0);
1505 }
1506
1507 retval = (args(0).iscell () ? ovl (ret) : ovl (ret(0)));
1508 }
1509 else
1510 retval = octregexprep (args, "regexprep");
1511
1512 return retval;
1513}
1514
1515/*
1516%!test # Replace with empty
1517%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1518%! t = regexprep (xml, '<[!?][^>]*>', '');
1519%! assert (t, ' <tag v="hello">some stuff</tag>');
1520
1521%!test # Replace with non-empty
1522%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1523%! t = regexprep (xml, '<[!?][^>]*>', '?');
1524%! assert (t, '? <tag v="hello">some stuff?</tag>');
1525
1526%!test # Check that 'tokenize' is ignored
1527%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1528%! t = regexprep (xml, '<[!?][^>]*>', '', 'tokenize');
1529%! assert (t, ' <tag v="hello">some stuff</tag>');
1530
1531## Test capture replacement
1532%!test
1533%! data = "Bob Smith\nDavid Hollerith\nSam Jenkins";
1534%! result = "Smith, Bob\nHollerith, David\nJenkins, Sam";
1535%! t = regexprep (data, '(?m)^(\w+)\s+(\w+)$', '$2, $1');
1536%! assert (t, result);
1537
1538## Return the original if no match
1539%!assert (regexprep ('hello', 'world', 'earth'), 'hello')
1540
1541## Test emptymatch option
1542%!assert (regexprep ('World', '^', 'Hello '), 'World')
1543%!assert (regexprep ('World', '^', 'Hello ', 'emptymatch'), 'Hello World')
1544
1545## Test a general replacement
1546%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g")
1547
1548## Make sure replacements work at the beginning and end of string
1549%!assert (regexprep ("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g")
1550%!assert (regexprep ("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_")
1551
1552## Test options "once" and "ignorecase"
1553%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"),
1554%! "a_b]c{d}e-f=g")
1555%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"),
1556%! "a_b_c_d_e_f_g")
1557
1558## Option combinations
1559%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"),
1560%! "a_b]c{d}e-f=g")
1561
1562## End conditions on replacement
1563%!assert (regexprep ("abc", "(b)", ".$1"), "a.bc")
1564%!assert (regexprep ("abc", "(b)", "$1"), "abc")
1565%!assert (regexprep ("abc", "(b)", "$1."), "ab.c")
1566%!assert (regexprep ("abc", "(b)", "$1.."), "ab..c")
1567
1568## Test cell array arguments
1569%!assert (regexprep ("abc", {"b","a"}, "?"), "??c")
1570%!assert (regexprep ({"abc","cba"}, "b", "?"), {"a?c","c?a"})
1571%!assert (regexprep ({"abc","cba"}, {"b","a"}, {"?","!"}), {"!?c","c?!"})
1572
1573## Nasty lookbehind expression
1574%!test
1575%! warning ("off", "Octave:regexp-lookbehind-limit", "local");
1576%! assert (regexprep ('x^(-1)+y(-1)+z(-1)=0', '(?<=[a-z]+)\(\-[1-9]*\)',
1577%! '_minus1'),'x^(-1)+y_minus1+z_minus1=0');
1578
1579## Verify escape sequences in pattern
1580%!assert (regexprep ("\n", '\n', "X"), "X")
1581%!assert (regexprep ("\n", "\n", "X"), "X")
1582
1583## Verify NULLs in pattern and replacement string
1584%!assert (regexprep ("A\0A", "\0", ","), "A,A")
1585%!assert (regexprep ("A\0A", '\0', ","), "A,A")
1586%!assert (regexprep ("A,A", "A", "B\0B"), "B\0B,B\0B")
1587%!assert (regexprep ("A,A", "A", 'B\0B'), "B\0B,B\0B")
1588
1589## Empty matches were broken on ARM architecture
1590%!test <*52810>
1591%! assert (strcmp (regexprep ("\nabc", "^(\t*)(abc)$", "$1$2", "lineanchors"), "\nabc"))
1592*/
1593
1594OCTAVE_NAMESPACE_END
charNDArray min(char d, const charNDArray &m)
Definition: chNDArray.cc:207
octave_idx_type numel(void) const
Number of elements in the array.
Definition: Array.h:411
const dim_vector & dims(void) const
Return a const-reference so that dims ()(i) works efficiently.
Definition: Array.h:487
Definition: Cell.h:43
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:94
octave_idx_type numel(int n=0) const
Number of elements that a matrix with this dimensions would have.
Definition: dim-vector.h:335
void assign(const std::string &k, const Cell &val)
Definition: oct-map.h:365
const_iterator begin(void) const
Definition: oct-map.h:318
Cell cell_value(void) const
Definition: ovl.h:105
void resize(octave_idx_type n, const octave_value &rfv=octave_value())
Definition: ovl.h:117
octave_idx_type length(void) const
Definition: ovl.h:113
octave_idx_type numel(void) const
Definition: str-vec.h:100
OCTINTERP_API void print_usage(void)
Definition: defun-int.h:72
#define DEFUN(name, args_name, nargout_name, doc)
Macro to define a builtin function.
Definition: defun.h:56
void warning(const char *fmt,...)
Definition: error.cc:1055
void error(const char *fmt,...)
Definition: error.cc:980
ColumnVector transform(const Matrix &m, double x, double y, double z)
Definition: graphics.cc:5861
class OCTAVE_API Matrix
Definition: mx-fwd.h:31
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:44
return octave_value(v1.char_array_value() . concat(v2.char_array_value(), ra_idx),((a1.is_sq_string()||a2.is_sq_string()) ? '\'' :'"'))
octave_value_list ovl(const OV_Args &... args)
Construct an octave_value_list with less typing.
Definition: ovl.h:211
static octave_value_list octregexp(const octave_value_list &args, int nargout, const std::string &who, bool case_insensitive=false)
Definition: regexp.cc:364
static void parse_options(regexp::opts &options, const octave_value_list &args, const std::string &who, int skip, bool &extra_args)
Definition: regexp.cc:315
static OCTAVE_NAMESPACE_BEGIN std::string do_regexp_ptn_string_escapes(const std::string &s, bool is_sq_str)
Definition: regexp.cc:57
static octave_value octregexprep(const octave_value_list &args, const std::string &who)
Definition: regexp.cc:1364
static std::string do_regexp_rep_string_escapes(const std::string &s)
Definition: regexp.cc:147
static octave_value_list octcellregexp(const octave_value_list &args, int nargout, const std::string &who, bool case_insensitive=false)
Definition: regexp.cc:548
F77_RET_T len
Definition: xerbla.cc:61