GNU Octave  6.2.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
regexp.cc
Go to the documentation of this file.
1 ////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (C) 2002-2021 The Octave Project Developers
4 //
5 // See the file COPYRIGHT.md in the top-level directory of this
6 // distribution or <https://octave.org/copyright/>.
7 //
8 // This file is part of Octave.
9 //
10 // Octave is free software: you can redistribute it and/or modify it
11 // under the terms of the GNU General Public License as published by
12 // the Free Software Foundation, either version 3 of the License, or
13 // (at your option) any later version.
14 //
15 // Octave is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 // GNU General Public License for more details.
19 //
20 // You should have received a copy of the GNU General Public License
21 // along with Octave; see the file COPYING. If not, see
22 // <https://www.gnu.org/licenses/>.
23 //
24 ////////////////////////////////////////////////////////////////////////
25 
26 #if defined (HAVE_CONFIG_H)
27 # include "config.h"
28 #endif
29 
30 #include <list>
31 #include <sstream>
32 
33 #include <pcre.h>
34 
35 #include "base-list.h"
36 #include "oct-locbuf.h"
37 #include "quit.h"
38 #include "lo-regexp.h"
39 #include "str-vec.h"
40 
41 #include "defun.h"
42 #include "Cell.h"
43 #include "error.h"
44 #include "errwarn.h"
45 #include "oct-map.h"
46 #include "ovl.h"
47 #include "utils.h"
48 
49 // Replace backslash escapes in a string with the real values. We need
50 // two special functions instead of the one in utils.cc because the set
51 // of escape sequences used for regexp patterns and replacement strings
52 // is different from those used in the *printf functions.
53 
54 static std::string
55 do_regexp_ptn_string_escapes (const std::string& s, bool is_sq_str)
56 {
57  std::string retval;
58 
59  size_t i = 0;
60  size_t j = 0;
61  size_t len = s.length ();
62 
63  retval.resize (len+i);
64 
65  while (j < len)
66  {
67  if (s[j] == '\\' && j+1 < len)
68  {
69  switch (s[++j])
70  {
71  case 'b': // backspace
72  if (is_sq_str)
73  retval[i] = '\b';
74  else
75  {
76  // Pass escape sequence through
77  retval[i] = '\\';
78  retval[++i] = 'b';
79  }
80  break;
81 
82  // Translate < and > to PCRE word boundary
83  case '<': // begin word boundary
84  case '>': // end word boundary
85  retval[i] = '\\';
86  retval[++i] = 'b';
87  break;
88 
89  case 'o': // octal input
90  {
91  bool bad_esc_seq = (j+1 >= len);
92 
93  bool brace = false;
94  if (! bad_esc_seq && s[++j] == '{')
95  {
96  brace = true;
97  j++;
98  }
99 
100  int tmpi = 0;
101  size_t k;
102  for (k = j; k < std::min (j+3+brace, len); k++)
103  {
104  int digit = s[k] - '0';
105  if (digit < 0 || digit > 7)
106  break;
107  tmpi <<= 3;
108  tmpi += digit;
109  }
110  if (bad_esc_seq || (brace && s[k++] != '}'))
111  {
112  tmpi = 0;
113  warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
114  }
115  retval[i] = tmpi;
116  j = k - 1;
117  break;
118  }
119 
120  default: // pass escape sequence through
121  retval[i] = '\\';
122  retval[++i] = s[j];
123  break;
124  }
125  }
126  else
127  {
128  retval[i] = s[j];
129  }
130 
131  i++;
132  j++;
133  }
134 
135  retval.resize (i);
136 
137  return retval;
138 }
139 
140 static std::string
141 do_regexp_rep_string_escapes (const std::string& s)
142 {
143  std::string retval;
144 
145  size_t i = 0;
146  size_t j = 0;
147  size_t len = s.length ();
148 
149  retval.resize (len);
150 
151  while (j < len)
152  {
153  if (s[j] == '\\' && j+1 < len)
154  {
155  switch (s[++j])
156  {
157  case 'a': // alarm
158  retval[i] = '\a';
159  break;
160 
161  case 'b': // backspace
162  retval[i] = '\b';
163  break;
164 
165  case 'f': // formfeed
166  retval[i] = '\f';
167  break;
168 
169  case 'n': // newline
170  retval[i] = '\n';
171  break;
172 
173  case 'r': // carriage return
174  retval[i] = '\r';
175  break;
176 
177  case 't': // horizontal tab
178  retval[i] = '\t';
179  break;
180 
181  case 'v': // vertical tab
182  retval[i] = '\v';
183  break;
184 
185  case '0':
186  case '1':
187  case '2':
188  case '3':
189  case '4':
190  case '5':
191  case '6':
192  case '7': // octal input
193  {
194  size_t k;
195  int tmpi = s[j] - '0';
196  for (k = j+1; k < std::min (j+3, len); k++)
197  {
198  int digit = s[k] - '0';
199  if (digit < 0 || digit > 7)
200  break;
201  tmpi <<= 3;
202  tmpi += digit;
203  }
204  retval[i] = tmpi;
205  j = k - 1;
206  break;
207  }
208 
209  case 'o': // octal input
210  {
211  bool bad_esc_seq = (j+1 >= len);
212 
213  bool brace = false;
214  if (! bad_esc_seq && s[++j] == '{')
215  {
216  brace = true;
217  j++;
218  }
219 
220  int tmpi = 0;
221  size_t k;
222  for (k = j; k < std::min (j+3+brace, len); k++)
223  {
224  int digit = s[k] - '0';
225  if (digit < 0 || digit > 7)
226  break;
227  tmpi <<= 3;
228  tmpi += digit;
229  }
230  if (bad_esc_seq || (brace && s[k++] != '}'))
231  {
232  warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
233  tmpi = 0;
234  }
235  retval[i] = tmpi;
236  j = k - 1;
237  break;
238  }
239 
240  case 'x': // hex input
241  {
242  bool bad_esc_seq = (j+1 >= len);
243 
244  bool brace = false;
245  if (! bad_esc_seq && s[++j] == '{')
246  {
247  brace = true;
248  j++;
249  }
250 
251  int tmpi = 0;
252  size_t k;
253  for (k = j; k < std::min (j+2+brace, len); k++)
254  {
255  if (! isxdigit (s[k]))
256  break;
257 
258  tmpi <<= 4;
259  int digit = s[k];
260  if (digit >= 'a')
261  tmpi += digit - 'a' + 10;
262  else if (digit >= 'A')
263  tmpi += digit - 'A' + 10;
264  else
265  tmpi += digit - '0';
266  }
267  if (bad_esc_seq || (brace && s[k++] != '}'))
268  {
269  warning (R"(malformed hex escape sequence '\x' -- converting to '\0')");
270  tmpi = 0;
271  }
272  retval[i] = tmpi;
273  j = k - 1;
274  break;
275  }
276 
277  // Both dollar sign (for capture buffer) and backslash are
278  // passed through with their escape backslash. The processing
279  // for these must occur during the actual replacement operation
280  // in lo-regexp.cc.
281  case '$': // pass dollar sign through with escape
282  retval[i] = '\\'; retval[++i] = '$';
283  break;
284 
285  case '\\': // pass backslash through with escape
286  retval[i] = '\\'; retval[++i] = '\\';
287  break;
288 
289  default: // convert escaped character to unescaped char
290  retval[i] = s[j];
291  break;
292  }
293  }
294  else
295  {
296  retval[i] = s[j];
297  }
298 
299  i++;
300  j++;
301  }
302 
303  retval.resize (i);
304 
305  return retval;
306 }
307 
308 static void
310  const std::string& who, int skip, bool& extra_args)
311 {
312  extra_args = false;
313 
314  for (int i = skip; i < args.length (); i++)
315  {
316  std::string str;
317 
318  str = args(i).xstring_value ("%s: optional arguments must be strings", who.c_str ());
319 
320  std::transform (str.begin (), str.end (), str.begin (), tolower);
321 
322  if (str.find ("once", 0) == 0)
323  options.once (true);
324  else if (str.find ("matchcase", 0) == 0)
325  options.case_insensitive (false);
326  else if (str.find ("ignorecase", 0) == 0)
327  options.case_insensitive (true);
328  else if (str.find ("dotall", 0) == 0)
329  options.dotexceptnewline (false);
330  else if (str.find ("stringanchors", 0) == 0)
331  options.lineanchors (false);
332  else if (str.find ("literalspacing", 0) == 0)
333  options.freespacing (false);
334  else if (str.find ("noemptymatch", 0) == 0)
335  options.emptymatch (false);
336  else if (str.find ("dotexceptnewline", 0) == 0)
337  options.dotexceptnewline (true);
338  else if (str.find ("lineanchors", 0) == 0)
339  options.lineanchors (true);
340  else if (str.find ("freespacing", 0) == 0)
341  options.freespacing (true);
342  else if (str.find ("emptymatch", 0) == 0)
343  options.emptymatch (true);
344  else if (str.find ("start", 0) == 0
345  || str.find ("end", 0) == 0
346  || str.find ("tokenextents", 0) == 0
347  || str.find ("match", 0) == 0
348  || str.find ("tokens", 0) == 0
349  || str.find ("names", 0) == 0
350  || str.find ("split", 0) == 0)
351  extra_args = true;
352  else
353  error ("%s: unrecognized option", who.c_str ());
354  }
355 }
356 
357 static octave_value_list
358 octregexp (const octave_value_list& args, int nargout,
359  const std::string& who, bool case_insensitive = false)
360 {
362 
363  int nargin = args.length ();
364 
365  // Make sure we have string, pattern
366  const std::string buffer = args(0).string_value ();
367 
368  std::string pattern = args(1).string_value ();
369 
370  // Rewrite pattern for PCRE
371  pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());
372 
373  octave::regexp::opts options;
374  options.case_insensitive (case_insensitive);
375  bool extra_options = false;
376  parse_options (options, args, who, 2, extra_options);
377 
378  const octave::regexp::match_data rx_lst
379  = octave::regexp::match (pattern, buffer, options, who);
380 
381  string_vector named_pats = rx_lst.named_patterns ();
382 
383  size_t sz = rx_lst.size ();
384 
385  // Converted the linked list in the correct form for the return values
386 
387  octave_map nmap (dim_vector ((sz == 0 ? 0 : 1), sz), named_pats);
388 
389  retval.resize (7);
390 
391  if (sz != 0)
392  {
393  for (int j = 0; j < named_pats.numel (); j++)
394  {
395  Cell ctmp (dim_vector (1, sz));
396  octave_idx_type i = 0;
397 
398  for (const auto& match_data : rx_lst)
399  {
400  string_vector named_tokens = match_data.named_tokens ();
401 
402  ctmp(i++) = named_tokens(j);
403  }
404 
405  nmap.assign (named_pats(j), ctmp);
406  }
407  }
408  retval(5) = nmap;
409 
410  if (options.once ())
411  {
412  auto p = rx_lst.begin ();
413 
414  retval(4) = (sz ? p->tokens () : Cell ());
415  retval(3) = (sz ? p->match_string () : "");
416  retval(2) = (sz ? p->token_extents () : Matrix ());
417 
418  if (sz)
419  {
420  double start = p->start ();
421  double end = p->end ();
422 
423  Cell split (dim_vector (1, 2));
424  split(0) = buffer.substr (0, start-1);
425  split(1) = buffer.substr (end);
426 
427  retval(6) = split;
428  retval(1) = end;
429  retval(0) = start;
430  }
431  else
432  {
433  retval(6) = buffer;
434  retval(1) = Matrix ();
435  retval(0) = Matrix ();
436  }
437  }
438  else
439  {
440  Cell tokens (dim_vector (1, sz));
441  Cell match_string (dim_vector (1, sz));
442  Cell token_extents (dim_vector (1, sz));
443  NDArray end (dim_vector (1, sz));
444  NDArray start (dim_vector (1, sz));
445  Cell split (dim_vector (1, sz+1));
446  size_t sp_start = 0;
447 
448  octave_idx_type i = 0;
449  for (const auto& match_data : rx_lst)
450  {
451  double s = match_data.start ();
452  double e = match_data.end ();
453 
454  string_vector tmp = match_data.tokens ();
455  tokens(i) = Cell (dim_vector (1, tmp.numel ()), tmp);
456  match_string(i) = match_data.match_string ();
457  token_extents(i) = match_data.token_extents ();
458  end(i) = e;
459  start(i) = s;
460  split(i) = buffer.substr (sp_start, s-sp_start-1);
461  sp_start = e;
462  i++;
463  }
464 
465  split(i) = buffer.substr (sp_start);
466 
467  retval(6) = split;
468  retval(4) = tokens;
469  retval(3) = match_string;
470  retval(2) = token_extents;
471  retval(1) = end;
472  retval(0) = start;
473  }
474 
475  // Alter the order of the output arguments
476 
477  if (extra_options)
478  {
479  int n = 0;
480  octave_value_list new_retval;
481  new_retval.resize (nargout);
482 
483  bool arg_used[7] {};
484 
485  for (int j = 2; j < nargin; j++)
486  {
487  int k = 0;
488  std::string str = args(j).string_value ();
489  std::transform (str.begin (), str.end (), str.begin (), tolower);
490 
491  if (str.find ("once", 0) == 0
492  || str.find ("stringanchors", 0) == 0
493  || str.find ("lineanchors", 0) == 0
494  || str.find ("matchcase", 0) == 0
495  || str.find ("ignorecase", 0) == 0
496  || str.find ("dotall", 0) == 0
497  || str.find ("dotexceptnewline", 0) == 0
498  || str.find ("literalspacing", 0) == 0
499  || str.find ("freespacing", 0) == 0
500  || str.find ("noemptymatch", 0) == 0
501  || str.find ("emptymatch", 0) == 0)
502  continue;
503  else if (str.find ("start", 0) == 0)
504  k = 0;
505  else if (str.find ("end", 0) == 0)
506  k = 1;
507  else if (str.find ("tokenextents", 0) == 0)
508  k = 2;
509  else if (str.find ("match", 0) == 0)
510  k = 3;
511  else if (str.find ("tokens", 0) == 0)
512  k = 4;
513  else if (str.find ("names", 0) == 0)
514  k = 5;
515  else if (str.find ("split", 0) == 0)
516  k = 6;
517 
518  new_retval(n++) = retval(k);
519  arg_used[k] = true;
520 
521  if (n == nargout)
522  break;
523  }
524 
525  // Fill in the rest of the arguments
526  if (n < nargout)
527  {
528  for (int j = 0; j < 7; j++)
529  {
530  if (! arg_used[j])
531  new_retval(n++) = retval(j);
532  }
533  }
534 
535  retval = new_retval;
536  }
537 
538  return retval;
539 }
540 
541 static octave_value_list
542 octcellregexp (const octave_value_list& args, int nargout,
543  const std::string& who, bool case_insensitive = false)
544 {
546 
547  if (args(0).iscell ())
548  {
549  OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
550  octave_value_list new_args = args;
551  Cell cellstr = args(0).cell_value ();
552  if (args(1).iscell ())
553  {
554  Cell cellpat = args(1).cell_value ();
555 
556  if (cellpat.numel () == 1)
557  {
558  for (int j = 0; j < nargout; j++)
559  newretval[j].resize (cellstr.dims ());
560 
561  new_args(1) = cellpat(0);
562 
563  for (octave_idx_type i = 0; i < cellstr.numel (); i++)
564  {
565  new_args(0) = cellstr(i);
566  octave_value_list tmp = octregexp (new_args, nargout, who,
567  case_insensitive);
568 
569  for (int j = 0; j < nargout; j++)
570  newretval[j](i) = tmp(j);
571  }
572  }
573  else if (cellstr.numel () == 1)
574  {
575  for (int j = 0; j < nargout; j++)
576  newretval[j].resize (cellpat.dims ());
577 
578  new_args(0) = cellstr(0);
579 
580  for (octave_idx_type i = 0; i < cellpat.numel (); i++)
581  {
582  new_args(1) = cellpat(i);
583  octave_value_list tmp = octregexp (new_args, nargout, who,
584  case_insensitive);
585 
586  for (int j = 0; j < nargout; j++)
587  newretval[j](i) = tmp(j);
588  }
589  }
590  else if (cellstr.numel () == cellpat.numel ())
591  {
592  if (cellstr.dims () != cellpat.dims ())
593  error ("%s: inconsistent cell array dimensions", who.c_str ());
594 
595  for (int j = 0; j < nargout; j++)
596  newretval[j].resize (cellstr.dims ());
597 
598  for (octave_idx_type i = 0; i < cellstr.numel (); i++)
599  {
600  new_args(0) = cellstr(i);
601  new_args(1) = cellpat(i);
602 
603  octave_value_list tmp = octregexp (new_args, nargout, who,
604  case_insensitive);
605 
606  for (int j = 0; j < nargout; j++)
607  newretval[j](i) = tmp(j);
608  }
609  }
610  else
611  error ("regexp: cell array arguments must be scalar or equal size");
612  }
613  else
614  {
615  for (int j = 0; j < nargout; j++)
616  newretval[j].resize (cellstr.dims ());
617 
618  for (octave_idx_type i = 0; i < cellstr.numel (); i++)
619  {
620  new_args(0) = cellstr(i);
621  octave_value_list tmp = octregexp (new_args, nargout, who,
622  case_insensitive);
623 
624  for (int j = 0; j < nargout; j++)
625  newretval[j](i) = tmp(j);
626  }
627  }
628 
629  for (int j = 0; j < nargout; j++)
630  retval(j) = octave_value (newretval[j]);
631  }
632  else if (args(1).iscell ())
633  {
634  OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
635  octave_value_list new_args = args;
636  Cell cellpat = args(1).cell_value ();
637 
638  for (int j = 0; j < nargout; j++)
639  newretval[j].resize (cellpat.dims ());
640 
641  for (octave_idx_type i = 0; i < cellpat.numel (); i++)
642  {
643  new_args(1) = cellpat(i);
644  octave_value_list tmp = octregexp (new_args, nargout, who,
645  case_insensitive);
646 
647  for (int j = 0; j < nargout; j++)
648  newretval[j](i) = tmp(j);
649  }
650 
651  for (int j = 0; j < nargout; j++)
652  retval(j) = octave_value (newretval[j]);
653  }
654  else
655  retval = octregexp (args, nargout, who, case_insensitive);
656 
657  return retval;
658 
659 }
660 
661 DEFUN (regexp, args, nargout,
662  doc: /* -*- texinfo -*-
663 @deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexp (@var{str}, @var{pat})
664 @deftypefnx {} {[@dots{}] =} regexp (@var{str}, @var{pat}, "@var{opt1}", @dots{})
665 Regular expression string matching.
666 
667 Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
668 substrings of any matches, or empty values if there are none.
669 
670 The matched pattern @var{pat} can include any of the standard regex
671 operators, including:
672 
673 @table @code
674 @item .
675 Match any character
676 
677 @item * + ? @{@}
678 Repetition operators, representing
679 
680 @table @code
681 @item *
682 Match zero or more times
683 
684 @item +
685 Match one or more times
686 
687 @item ?
688 Match zero or one times
689 
690 @item @{@var{n}@}
691 Match exactly @var{n} times
692 
693 @item @{@var{n},@}
694 Match @var{n} or more times
695 
696 @item @{@var{m},@var{n}@}
697 Match between @var{m} and @var{n} times
698 @end table
699 
700 @item [@dots{}] [^@dots{}]
701 
702 List operators. The pattern will match any character listed between
703 @qcode{"["} and @qcode{"]"}. If the first character is @qcode{"^"} then the
704 pattern is inverted and any character except those listed between brackets
705 will match.
706 
707 Escape sequences defined below can also be used inside list operators. For
708 example, a template for a floating point number might be @code{[-+.\d]+}.
709 
710 @item () (?:)
711 Grouping operator. The first form, parentheses only, also creates a token.
712 
713 @item |
714 Alternation operator. Match one of a choice of regular expressions. The
715 alternatives must be delimited by the grouping operator @code{()} above.
716 
717 @item ^ $
718 Anchoring operators. Requires pattern to occur at the start (@code{^}) or
719 end (@code{$}) of the string.
720 @end table
721 
722 In addition, the following escaped characters have special meaning.
723 
724 @table @code
725 
726 @item \d
727 Match any digit
728 
729 @item \D
730 Match any non-digit
731 
732 @item \s
733 Match any whitespace character
734 
735 @item \S
736 Match any non-whitespace character
737 
738 @item \w
739 Match any word character
740 
741 @item \W
742 Match any non-word character
743 
744 @item <
745 Match the beginning of a word
746 
747 @item >
748 Match the end of a word
749 
750 @item \B
751 Match within a word
752 @end table
753 
754 Implementation Note: For compatibility with @sc{matlab}, escape sequences
755 in @var{pat} (e.g., @qcode{"@xbackslashchar{}n"} => newline) are expanded
756 even when @var{pat} has been defined with single quotes. To disable
757 expansion use a second backslash before the escape sequence (e.g.,
758 "@xbackslashchar{}@xbackslashchar{}n") or use the @code{regexptranslate}
759 function.
760 
761 The outputs of @code{regexp} default to the order given below
762 
763 @table @var
764 @item s
765 The start indices of each matching substring
766 
767 @item e
768 The end indices of each matching substring
769 
770 @item te
771 The extents of each matched token surrounded by @code{(@dots{})} in
772 @var{pat}
773 
774 @item m
775 A cell array of the text of each match
776 
777 @item t
778 A cell array of the text of each token matched
779 
780 @item nm
781 A structure containing the text of each matched named token, with the name
782 being used as the fieldname. A named token is denoted by
783 @code{(?<name>@dots{})}.
784 
785 @item sp
786 A cell array of the text not returned by match, i.e., what remains if you
787 split the string based on @var{pat}.
788 @end table
789 
790 Particular output arguments, or the order of the output arguments, can be
791 selected by additional @var{opt} arguments. These are strings and the
792 correspondence between the output arguments and the optional argument
793 are
794 
795 @multitable @columnfractions 0.2 0.3 0.3 0.2
796 @item @tab @qcode{'start'} @tab @var{s} @tab
797 @item @tab @qcode{'end'} @tab @var{e} @tab
798 @item @tab @qcode{'tokenExtents'} @tab @var{te} @tab
799 @item @tab @qcode{'match'} @tab @var{m} @tab
800 @item @tab @qcode{'tokens'} @tab @var{t} @tab
801 @item @tab @qcode{'names'} @tab @var{nm} @tab
802 @item @tab @qcode{'split'} @tab @var{sp} @tab
803 @end multitable
804 
805 Additional arguments are summarized below.
806 
807 @table @samp
808 @item once
809 Return only the first occurrence of the pattern.
810 
811 @item matchcase
812 Make the matching case sensitive. (default)
813 
814 Alternatively, use (?-i) in the pattern.
815 
816 @item ignorecase
817 Ignore case when matching the pattern to the string.
818 
819 Alternatively, use (?i) in the pattern.
820 
821 @item stringanchors
822 Match the anchor characters at the beginning and end of the string.
823 (default)
824 
825 Alternatively, use (?-m) in the pattern.
826 
827 @item lineanchors
828 Match the anchor characters at the beginning and end of the line.
829 
830 Alternatively, use (?m) in the pattern.
831 
832 @item dotall
833 The pattern @code{.} matches all characters including the newline character.
834  (default)
835 
836 Alternatively, use (?s) in the pattern.
837 
838 @item dotexceptnewline
839 The pattern @code{.} matches all characters except the newline character.
840 
841 Alternatively, use (?-s) in the pattern.
842 
843 @item literalspacing
844 All characters in the pattern, including whitespace, are significant and are
845 used in pattern matching. (default)
846 
847 Alternatively, use (?-x) in the pattern.
848 
849 @item freespacing
850 The pattern may include arbitrary whitespace and also comments beginning
851 with the character @samp{#}.
852 
853 Alternatively, use (?x) in the pattern.
854 
855 @item noemptymatch
856 Zero-length matches are not returned. (default)
857 
858 @item emptymatch
859 Return zero-length matches.
860 
861 @code{regexp ('a', 'b*', 'emptymatch')} returns @code{[1 2]} because there
862 are zero or more @qcode{'b'} characters at positions 1 and end-of-string.
863 
864 @end table
865 
866 Stack Limitation Note: Pattern searches are done with a recursive function
867 which can overflow the program stack when there are a high number of matches.
868 For example,
869 
870 @example
871 @code{regexp (repmat ('a', 1, 1e5), '(a)+')}
872 @end example
873 
874 @noindent
875 may lead to a segfault. As an alternative, consider constructing pattern
876 searches that reduce the number of matches (e.g., by creatively using set
877 complement), and then further processing the return variables (now reduced in
878 size) with successive @code{regexp} searches.
879 @seealso{regexpi, strfind, regexprep}
880 @end deftypefn */)
881 {
882  if (args.length () < 2)
883  print_usage ();
884 
886 
887  if (args(0).iscell () || args(1).iscell ())
888  retval = (octcellregexp (args, (nargout > 0 ? nargout : 1), "regexp"));
889  else
890  retval = octregexp (args, nargout, "regexp");
891 
892  return retval;
893 }
894 
895 /*
896 ## PCRE_ERROR_MATCHLIMIT test
897 %!test
898 %! s = sprintf ('\t4\n0000\t-0.00\t-0.0000\t4\t-0.00\t-0.0000\t4\n0000\t-0.00\t-0.0000\t0\t-0.00\t-');
899 %! ws = warning ("query");
900 %! unwind_protect
901 %! warning ("off");
902 %! regexp (s, '(\s*-*\d+[.]*\d*\s*)+\n');
903 %! unwind_protect_cleanup
904 %! warning (ws);
905 %! end_unwind_protect
906 
907 ## segfault test
908 %!assert (regexp ("abcde", "."), [1,2,3,4,5])
909 ## Infinite loop test
910 %!assert (isempty (regexp ("abcde", "")))
911 
912 ## Check that anchoring of pattern works correctly
913 %!assert (regexp ('abcabc', '^abc'), 1)
914 %!assert (regexp ('abcabc', 'abc$'), 4)
915 %!assert (regexp ('abcabc', '^abc$'), zeros (1,0))
916 
917 %!test
918 %! [s, e, te, m, t] = regexp (' No Match ', 'f(.*)uck');
919 %! assert (s, zeros (1,0));
920 %! assert (e, zeros (1,0));
921 %! assert (te, cell (1,0));
922 %! assert (m, cell (1,0));
923 %! assert (t, cell (1,0));
924 
925 %!test
926 %! [s, e, te, m, t] = regexp (' FiRetrUck ', 'f(.*)uck');
927 %! assert (s, zeros (1,0));
928 %! assert (e, zeros (1,0));
929 %! assert (te, cell (1,0));
930 %! assert (m, cell (1,0));
931 %! assert (t, cell (1,0));
932 
933 %!test
934 %! [s, e, te, m, t] = regexp (' firetruck ', 'f(.*)uck');
935 %! assert (s, 2);
936 %! assert (e, 10);
937 %! assert (te{1}, [3, 7]);
938 %! assert (m{1}, 'firetruck');
939 %! assert (t{1}{1}, 'iretr');
940 
941 %!test
942 %! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*');
943 %! assert (s, [1, 12]);
944 %! assert (e, [5, 17]);
945 %! assert (size (te), [1, 2]);
946 %! assert (isempty (te{1}));
947 %! assert (isempty (te{2}));
948 %! assert (m{1}, 'short');
949 %! assert (m{2}, 'string');
950 %! assert (size (t), [1, 2]);
951 %! assert (isempty (t{1}));
952 %! assert (isempty (t{2}));
953 
954 %!test
955 %! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*', 'once');
956 %! assert (s, 1);
957 %! assert (e, 5);
958 %! assert (isempty (te));
959 %! assert (m, 'short');
960 %! assert (isempty (t));
961 
962 %!test
963 %! [m, te, e, s, t] = regexp ('short test string', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
964 %! assert (s, 1);
965 %! assert (e, 5);
966 %! assert (isempty (te));
967 %! assert (m, 'short');
968 %! assert (isempty (t));
969 
970 %!test
971 %! [s, e, te, m, t, nm] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)');
972 %! assert (s, 1);
973 %! assert (e, 10);
974 %! assert (size (te), [1, 1]);
975 %! assert (te{1}, [1,5; 7,10]);
976 %! assert (m{1}, 'short test');
977 %! assert (size (t), [1, 1]);
978 %! assert (t{1}{1}, 'short');
979 %! assert (t{1}{2}, 'test');
980 %! assert (size (nm), [1, 1]);
981 %! assert (! isempty (fieldnames (nm)));
982 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
983 %! assert (nm.word1, 'short');
984 %! assert (nm.word2, 'test');
985 
986 %!test
987 %! [nm, m, te, e, s, t] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
988 %! assert (s, 1);
989 %! assert (e, 10);
990 %! assert (size (te), [1, 1]);
991 %! assert (te{1}, [1,5; 7,10]);
992 %! assert (m{1}, 'short test');
993 %! assert (size (t), [1, 1]);
994 %! assert (t{1}{1}, 'short');
995 %! assert (t{1}{2}, 'test');
996 %! assert (size (nm), [1, 1]);
997 %! assert (! isempty (fieldnames (nm)));
998 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
999 %! assert (nm.word1, 'short');
1000 %! assert (nm.word2, 'test');
1001 
1002 %!test
1003 %! [t, nm] = regexp ("John Davis\nRogers, James", '(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)', 'tokens', 'names');
1004 %! assert (size (t), [1, 2]);
1005 %! assert (t{1}{1}, "John");
1006 %! assert (t{1}{2}, "Davis");
1007 %! assert (t{2}{1}, "Rogers");
1008 %! assert (t{2}{2}, "James");
1009 %! assert (size (nm), [1, 2]);
1010 %! assert (nm(1).first, "John");
1011 %! assert (nm(1).last, "Davis");
1012 %! assert (nm(2).first, "James");
1013 %! assert (nm(2).last, "Rogers");
1014 
1015 ## Tests for nulls in strings properly matching
1016 %!test
1017 %! str = "A\0B\0\0C";
1018 %! ptn = '(\0+)'; # also test null in single-quote pattern
1019 %! M = regexp (str, ptn, "match");
1020 %! assert (size (M), [1, 2]);
1021 %! assert (double (M{1}), [0]);
1022 %! assert (double (M{2}), [0, 0]);
1023 
1024 %!test
1025 %! str = "A\0B\0\0C";
1026 %! ptn = "(\0+)"; # also test null in double-quote pattern
1027 %! T = regexp (str, ptn, "tokens");
1028 %! assert (size (T), [1, 2]);
1029 %! assert (double (T{1}{1}), [0]);
1030 %! assert (double (T{2}{1}), [0, 0]);
1031 
1032 %!test
1033 %! str = "A\0B\0\0C";
1034 %! ptn = '(?<namedtoken>\0+)';
1035 %! NT = regexp (str, ptn, "names");
1036 %! assert (size (NT), [1, 2]);
1037 %! assert (double (NT(1).namedtoken), [0]);
1038 %! assert (double (NT(2).namedtoken), [0, 0]);
1039 
1040 ## Tests for named tokens
1041 %!test
1042 %! ## Parenthesis in named token (ie (int)) causes a problem
1043 %! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'),
1044 %! struct ('typestr', 'int'));
1045 
1046 %!test <*35683>
1047 %! ## Mix of named and unnamed tokens can cause segfault
1048 %! str = "abcde";
1049 %! ptn = '(?<T1>a)(\w+)(?<T2>d\w+)';
1050 %! tokens = regexp (str, ptn, "names");
1051 %! assert (isstruct (tokens) && numel (tokens) == 1);
1052 %! assert (tokens.T1, "a");
1053 %! assert (tokens.T2, "de");
1054 
1055 ## Test options to regexp
1056 %!assert (regexp ("abc\nabc", '.'), [1:7])
1057 %!assert (regexp ("abc\nabc", '.', 'dotall'), [1:7])
1058 %!test
1059 %! assert (regexp ("abc\nabc", '(?s).'), [1:7]);
1060 %! assert (regexp ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
1061 %! assert (regexp ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);
1062 
1063 %!assert (regexp ("caseCaSe", 'case'), 1)
1064 %!assert (regexp ("caseCaSe", 'case', "matchcase"), 1)
1065 %!assert (regexp ("caseCaSe", 'case', "ignorecase"), [1,5])
1066 %!test
1067 %! assert (regexp ("caseCaSe", '(?-i)case'), 1);
1068 %! assert (regexp ("caseCaSe", '(?i)case'), [1, 5]);
1069 
1070 %!assert (regexp ("abc\nabc", 'c$'), 7)
1071 %!assert (regexp ("abc\nabc", 'c$', "stringanchors"), 7)
1072 %!test
1073 %! assert (regexp ("abc\nabc", '(?-m)c$'), 7);
1074 %! assert (regexp ("abc\nabc", 'c$',"lineanchors"), [3, 7]);
1075 %! assert (regexp ("abc\nabc", '(?m)c$'), [3,7]);
1076 
1077 %!assert (regexp ("this word", 's w'), 4)
1078 %!assert (regexp ("this word", 's w', 'literalspacing'), 4)
1079 %!test
1080 %! assert (regexp ("this word", '(?-x)s w', 'literalspacing'), 4);
1081 %! assert (regexp ("this word", 's w', 'freespacing'), zeros (1,0));
1082 %! assert (regexp ("this word", '(?x)s w'), zeros (1,0));
1083 
1084 %!test
1085 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'noemptymatch');
1086 %! assert (s, [1 5]);
1087 %! assert (e, [3 5]);
1088 %! assert (te, { zeros(0,2), zeros(0,2) });
1089 %! assert (m, { "OCT", "V" });
1090 %! assert (t, { cell(1,0), cell(1,0) });
1091 %! assert (isempty (fieldnames (nm)));
1092 %! assert (sp, { "", "A", "E" });
1093 
1094 %!test
1095 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'noemptymatch');
1096 %! assert (s, [1 5]);
1097 %! assert (e, [3 5]);
1098 %! assert (te, { [1 3], [5 5] });
1099 %! assert (m, { "OCT", "V" });
1100 %! assert (t, { {"OCT"}, {"V"} });
1101 %! assert (isempty (fieldnames (nm)));
1102 %! assert (sp, { "", "A", "E" });
1103 
1104 %!test
1105 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'emptymatch');
1106 %! assert (s, [1 4 5 6 7]);
1107 %! assert (e, [3 3 5 5 6]);
1108 %! assert (te, repmat ({zeros(0,2)}, [1, 5]));
1109 %! assert (m, { "OCT", "", "V", "", "" });
1110 %! assert (t, repmat({cell(1,0)}, [1, 5]));
1111 %! assert (isempty (fieldnames (nm)));
1112 %! assert (sp, { "", "", "A", "", "E", "" });
1113 
1114 %!test
1115 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'emptymatch');
1116 %! assert (s, [1 4 5 6 7]);
1117 %! assert (e, [3 3 5 5 6]);
1118 %! assert (te, { [1 3], [4 3], [5 5], [6 5], [7 6] });
1119 %! assert (m, { "OCT", "", "V", "", "" });
1120 %! assert (t, { {"OCT"}, {""}, {"V"}, {""}, {""} });
1121 %! assert (isempty (fieldnames (nm)));
1122 %! assert (sp, { "", "", "A", "", "E", "" });
1123 
1124 %!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'), {6;[1,5,9];zeros(1,0)})
1125 %!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}), {6;[3,7];[1,9]})
1126 %!assert (regexp ('Strings', {'t','s'}), {2, 7})
1127 
1128 ## Test case for lookaround operators
1129 %!test
1130 %! assert (regexp ('Iraq', 'q(?!u)'), 4);
1131 %! assert (regexp ('quit', 'q(?!u)'), zeros (1, 0));
1132 %! assert (regexp ('quit', 'q(?=u)' , 'match'), {'q'});
1133 %! assert (regexp ("quit", 'q(?=u+)', 'match'), {'q'});
1134 %! assert (regexp ("qit", 'q(?=u+)', 'match'), cell (1, 0));
1135 %! assert (regexp ("qit", 'q(?=u*)', 'match'), {'q'});
1136 %! assert (regexp ('thingamabob', '(?<=a)b'), 9);
1137 
1138 ## Tests for split option.
1139 %!shared str
1140 %! str = "foo bar foo";
1141 %!test
1142 %! [a, b] = regexp (str, "f..", "match", "split");
1143 %! assert (a, {"foo", "foo"});
1144 %! assert (b, {"", " bar ", ""});
1145 %!test
1146 %! [a, b] = regexp (str, "f..", "match", "split", "once");
1147 %! assert (a, "foo");
1148 %! assert (b, {"", " bar foo"});
1149 %!test
1150 %! [a, b] = regexp (str, "fx.", "match", "split");
1151 %! assert (a, cell (1, 0));
1152 %! assert (b, {"foo bar foo"});
1153 %!test
1154 %! [a, b] = regexp (str, "fx.", "match", "split", "once");
1155 %! assert (a, "");
1156 %! assert (b, "foo bar foo");
1157 
1158 %!shared str
1159 %! str = "foo bar";
1160 %!test
1161 %! [a, b] = regexp (str, "f..", "match", "split");
1162 %! assert (a, {"foo"});
1163 %! assert (b, {"", " bar"});
1164 %!test
1165 %! [a, b] = regexp (str, "b..", "match", "split");
1166 %! assert (a, {"bar"});
1167 %! assert (b, {"foo ", ""});
1168 %!test
1169 %! [a, b] = regexp (str, "x", "match", "split");
1170 %! assert (a, cell (1, 0));
1171 %! assert (b, {"foo bar"});
1172 %!test
1173 %! [a, b] = regexp (str, "[o]+", "match", "split");
1174 %! assert (a, {"oo"});
1175 %! assert (b, {"f", " bar"});
1176 
1177 ## Test escape sequences are expanded even in single-quoted strings
1178 %!assert (regexp ("\n", '\n'), 1)
1179 %!assert (regexp ("\n", "\n"), 1)
1180 
1181 ## Test escape sequences are silently converted
1182 %!test <*45407>
1183 %! assert (regexprep ('s', 's', 'x\.y'), 'x.y');
1184 %! assert (regexprep ('s', '(s)', 'x\$1y'), 'x$1y');
1185 %! assert (regexprep ('s', '(s)', 'x\\$1y'), 'x\sy');
1186 
1187 ## Test input validation
1188 %!error regexp ('string', 'tri', 'BadArg')
1189 %!error regexp ('string')
1190 
1191 */
1192 
1193 DEFUN (regexpi, args, nargout,
1194  doc: /* -*- texinfo -*-
1195 @deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexpi (@var{str}, @var{pat})
1196 @deftypefnx {} {[@dots{}] =} regexpi (@var{str}, @var{pat}, "@var{opt1}", @dots{})
1197 
1198 Case insensitive regular expression string matching.
1199 
1200 Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
1201 substrings of any matches, or empty values if there are none.
1202 @xref{XREFregexp,,regexp}, for details on the syntax of the search pattern.
1203 @seealso{regexp}
1204 @end deftypefn */)
1205 {
1206  if (args.length () < 2)
1207  print_usage ();
1208 
1209  if (args(0).iscell () || args(1).iscell ())
1210  return octcellregexp (args, (nargout > 0 ? nargout : 1), "regexpi", true);
1211  else
1212  return octregexp (args, nargout, "regexpi", true);
1213 }
1214 
1215 /*
1216 ## segfault test
1217 %!assert (regexpi ("abcde", "."), [1,2,3,4,5])
1218 
1219 ## Check that anchoring of pattern works correctly
1220 %!assert (regexpi ('abcabc', '^ABC'), 1)
1221 %!assert (regexpi ('abcabc', 'ABC$'), 4)
1222 %!assert (regexpi ('abcabc', '^ABC$'), zeros (1,0))
1223 
1224 %!test
1225 %! [s, e, te, m, t] = regexpi (' No Match ', 'f(.*)uck');
1226 %! assert (s, zeros (1,0));
1227 %! assert (e, zeros (1,0));
1228 %! assert (te, cell (1,0));
1229 %! assert (m, cell (1,0));
1230 %! assert (t, cell (1,0));
1231 
1232 %!test
1233 %! [s, e, te, m, t] = regexpi (' FiRetrUck ', 'f(.*)uck');
1234 %! assert (s, 2);
1235 %! assert (e, 10);
1236 %! assert (te{1}, [3, 7]);
1237 %! assert (m{1}, 'FiRetrUck');
1238 %! assert (t{1}{1}, 'iRetr');
1239 
1240 %!test
1241 %! [s, e, te, m, t] = regexpi (' firetruck ', 'f(.*)uck');
1242 %! assert (s, 2);
1243 %! assert (e, 10);
1244 %! assert (te{1}, [3, 7]);
1245 %! assert (m{1}, 'firetruck');
1246 %! assert (t{1}{1}, 'iretr');
1247 
1248 %!test
1249 %! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*');
1250 %! assert (s, [1, 12]);
1251 %! assert (e, [5, 17]);
1252 %! assert (size (te), [1, 2]);
1253 %! assert (isempty (te{1}));
1254 %! assert (isempty (te{2}));
1255 %! assert (m{1}, 'ShoRt');
1256 %! assert (m{2}, 'String');
1257 %! assert (size (t), [1, 2]);
1258 %! assert (isempty (t{1}));
1259 %! assert (isempty (t{2}));
1260 
1261 %!test
1262 %! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once');
1263 %! assert (s, 1);
1264 %! assert (e, 5);
1265 %! assert (isempty (te));
1266 %! assert (m, 'ShoRt');
1267 %! assert (isempty (t));
1268 
1269 %!test
1270 %! [m, te, e, s, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1271 %! assert (s, 1);
1272 %! assert (e, 5);
1273 %! assert (isempty (te));
1274 %! assert (m, 'ShoRt');
1275 %! assert (isempty (t));
1276 
1277 %!test
1278 %! [s, e, te, m, t, nm] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)');
1279 %! assert (s, 1);
1280 %! assert (e, 10);
1281 %! assert (size (te), [1, 1]);
1282 %! assert (te{1}, [1,5; 7,10]);
1283 %! assert (m{1}, 'ShoRt Test');
1284 %! assert (size (t), [1, 1]);
1285 %! assert (t{1}{1}, 'ShoRt');
1286 %! assert (t{1}{2}, 'Test');
1287 %! assert (size (nm), [1, 1]);
1288 %! assert (! isempty (fieldnames (nm)));
1289 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
1290 %! assert (nm.word1, 'ShoRt');
1291 %! assert (nm.word2, 'Test');
1292 
1293 %!test
1294 %! [nm, m, te, e, s, t] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1295 %! assert (s, 1);
1296 %! assert (e, 10);
1297 %! assert (size (te), [1, 1]);
1298 %! assert (te{1}, [1,5; 7,10]);
1299 %! assert (m{1}, 'ShoRt Test');
1300 %! assert (size (t), [1, 1]);
1301 %! assert (t{1}{1}, 'ShoRt');
1302 %! assert (t{1}{2}, 'Test');
1303 %! assert (size (nm), [1, 1]);
1304 %! assert (! isempty (fieldnames (nm)));
1305 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
1306 %! assert (nm.word1, 'ShoRt');
1307 %! assert (nm.word2, 'Test');
1308 
1309 %!assert (regexpi ("abc\nabc", '.'), [1:7])
1310 %!assert (regexpi ("abc\nabc", '.', 'dotall'), [1:7])
1311 %!test
1312 %! assert (regexpi ("abc\nabc", '(?s).'), [1:7]);
1313 %! assert (regexpi ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
1314 %! assert (regexpi ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);
1315 
1316 %!assert (regexpi ("caseCaSe", 'case'), [1, 5])
1317 %!assert (regexpi ("caseCaSe", 'case', "matchcase"), 1)
1318 %!assert (regexpi ("caseCaSe", 'case', "ignorecase"), [1, 5])
1319 %!test
1320 %! assert (regexpi ("caseCaSe", '(?-i)case'), 1);
1321 %! assert (regexpi ("caseCaSe", '(?i)case'), [1, 5]);
1322 
1323 %!assert (regexpi ("abc\nabc", 'C$'), 7)
1324 %!assert (regexpi ("abc\nabc", 'C$', "stringanchors"), 7)
1325 %!test
1326 %! assert (regexpi ("abc\nabc", '(?-m)C$'), 7);
1327 %! assert (regexpi ("abc\nabc", 'C$', "lineanchors"), [3, 7]);
1328 %! assert (regexpi ("abc\nabc", '(?m)C$'), [3, 7]);
1329 
1330 %!assert (regexpi ("this word", 'S w'), 4)
1331 %!assert (regexpi ("this word", 'S w', 'literalspacing'), 4)
1332 %!test
1333 %! assert (regexpi ("this word", '(?-x)S w', 'literalspacing'), 4);
1334 %! assert (regexpi ("this word", 'S w', 'freespacing'), zeros (1,0));
1335 %! assert (regexpi ("this word", '(?x)S w'), zeros (1,0));
1336 
1337 %!error regexpi ('string', 'tri', 'BadArg')
1338 %!error regexpi ('string')
1339 
1340 %!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'), {6;[1,5,9];zeros(1, 0)})
1341 %!assert (regexpi ({'asdfg-dfd', '-dfd-dfd-', 'qasfdfdaq'}, '-'), {6, [1,5,9], zeros(1,0)})
1342 %!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}), {6;[3,7];[1,9]})
1343 %!assert (regexpi ('Strings', {'t', 's'}), {2, [1, 7]})
1344 
1345 %!assert (regexpi ("\n", '\n'), 1)
1346 %!assert (regexpi ("\n", "\n"), 1)
1347 */
1348 
1349 static octave_value
1350 octregexprep (const octave_value_list& args, const std::string& who)
1351 {
1352  int nargin = args.length ();
1353 
1354  // Make sure we have string, pattern, replacement
1355  const std::string buffer = args(0).string_value ();
1356 
1357  std::string pattern = args(1).string_value ();
1358 
1359  // Rewrite pattern for PCRE
1360  pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());
1361 
1362  std::string replacement = args(2).string_value ();
1363 
1364  // Matlab compatibility.
1365  if (args(2).is_sq_string ())
1366  replacement = do_regexp_rep_string_escapes (replacement);
1367 
1368  // Pack options excluding 'tokenize' and various output
1369  // reordering strings into regexp arg list
1370  octave_value_list regexpargs (nargin-3, octave_value ());
1371 
1372  int len = 0;
1373  for (int i = 3; i < nargin; i++)
1374  {
1375  const std::string opt = args(i).string_value ();
1376  if (opt != "tokenize" && opt != "start" && opt != "end"
1377  && opt != "tokenextents" && opt != "match" && opt != "tokens"
1378  && opt != "names" && opt != "split" && opt != "warnings")
1379  {
1380  regexpargs(len++) = args(i);
1381  }
1382  }
1383  regexpargs.resize (len);
1384 
1385  octave::regexp::opts options;
1386  bool extra_args = false;
1387  parse_options (options, regexpargs, who, 0, extra_args);
1388 
1389  return octave::regexp::replace (pattern, buffer, replacement, options, who);
1390 }
1391 
1392 DEFUN (regexprep, args, ,
1393  doc: /* -*- texinfo -*-
1394 @deftypefn {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr})
1395 @deftypefnx {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr}, "@var{opt1}", @dots{})
1396 Replace occurrences of pattern @var{pat} in @var{string} with @var{repstr}.
1397 
1398 The pattern is a regular expression as documented for @code{regexp}.
1399 @xref{XREFregexp,,regexp}.
1400 
1401 All strings must be UTF-8 encoded.
1402 
1403 The replacement string may contain @code{$i}, which substitutes for the ith
1404 set of parentheses in the match string. For example,
1405 
1406 @example
1407 regexprep ("Bill Dunn", '(\w+) (\w+)', '$2, $1')
1408 @end example
1409 
1410 @noindent
1411 returns @qcode{"Dunn, Bill"}
1412 
1413 Options in addition to those of @code{regexp} are
1414 
1415 @table @samp
1416 
1417 @item once
1418 Replace only the first occurrence of @var{pat} in the result.
1419 
1420 @item warnings
1421 This option is present for compatibility but is ignored.
1422 
1423 @end table
1424 
1425 Implementation Note: For compatibility with @sc{matlab}, escape sequences
1426 in @var{pat} (e.g., @qcode{"@xbackslashchar{}n"} => newline) are expanded
1427 even when @var{pat} has been defined with single quotes. To disable
1428 expansion use a second backslash before the escape sequence (e.g.,
1429 "@xbackslashchar{}@xbackslashchar{}n") or use the @code{regexptranslate}
1430 function.
1431 @seealso{regexp, regexpi, strrep}
1432 @end deftypefn */)
1433 {
1434  if (args.length () < 3)
1435  print_usage ();
1436 
1438 
1439  if (args(0).iscell () || args(1).iscell () || args(2).iscell ())
1440  {
1441  Cell str, pat, rep;
1442  dim_vector dv0;
1443  dim_vector dv1 (1, 1);
1444 
1445  if (args(0).iscell ())
1446  str = args(0).cell_value ();
1447  else
1448  str = Cell (args(0));
1449 
1450  if (args(1).iscell ())
1451  pat = args(1).cell_value ();
1452  else
1453  pat = Cell (args(1));
1454 
1455  if (args(2).iscell ())
1456  rep = args(2).cell_value ();
1457  else
1458  rep = Cell (args(2));
1459 
1460  dv0 = str.dims ();
1461  if (pat.numel () != 1)
1462  {
1463  dv1 = pat.dims ();
1464  if (rep.numel () != 1 && dv1 != rep.dims ())
1465  error ("regexprep: inconsistent cell array dimensions");
1466  }
1467  else if (rep.numel () != 1)
1468  dv1 = rep.dims ();
1469 
1470  Cell ret (dv0);
1471  octave_value_list new_args = args;
1472 
1473  for (octave_idx_type i = 0; i < dv0.numel (); i++)
1474  {
1475  new_args(0) = str(i);
1476  if (pat.numel () == 1)
1477  new_args(1) = pat(0);
1478  if (rep.numel () == 1)
1479  new_args(2) = rep(0);
1480 
1481  for (octave_idx_type j = 0; j < dv1.numel (); j++)
1482  {
1483  if (pat.numel () != 1)
1484  new_args(1) = pat(j);
1485  if (rep.numel () != 1)
1486  new_args(2) = rep(j);
1487  new_args(0) = octregexprep (new_args, "regexprep");
1488  }
1489 
1490  ret(i) = new_args(0);
1491  }
1492 
1493  retval = (args(0).iscell () ? ovl (ret) : ovl (ret(0)));
1494  }
1495  else
1496  retval = octregexprep (args, "regexprep");
1497 
1498  return retval;
1499 }
1500 
1501 /*
1502 %!test # Replace with empty
1503 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1504 %! t = regexprep (xml, '<[!?][^>]*>', '');
1505 %! assert (t, ' <tag v="hello">some stuff</tag>');
1506 
1507 %!test # Replace with non-empty
1508 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1509 %! t = regexprep (xml, '<[!?][^>]*>', '?');
1510 %! assert (t, '? <tag v="hello">some stuff?</tag>');
1511 
1512 %!test # Check that 'tokenize' is ignored
1513 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1514 %! t = regexprep (xml, '<[!?][^>]*>', '', 'tokenize');
1515 %! assert (t, ' <tag v="hello">some stuff</tag>');
1516 
1517 ## Test capture replacement
1518 %!test
1519 %! data = "Bob Smith\nDavid Hollerith\nSam Jenkins";
1520 %! result = "Smith, Bob\nHollerith, David\nJenkins, Sam";
1521 %! t = regexprep (data, '(?m)^(\w+)\s+(\w+)$', '$2, $1');
1522 %! assert (t, result);
1523 
1524 ## Return the original if no match
1525 %!assert (regexprep ('hello', 'world', 'earth'), 'hello')
1526 
1527 ## Test emptymatch option
1528 %!assert (regexprep ('World', '^', 'Hello '), 'World')
1529 %!assert (regexprep ('World', '^', 'Hello ', 'emptymatch'), 'Hello World')
1530 
1531 ## Test a general replacement
1532 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g")
1533 
1534 ## Make sure replacements work at the beginning and end of string
1535 %!assert (regexprep ("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g")
1536 %!assert (regexprep ("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_")
1537 
1538 ## Test options "once" and "ignorecase"
1539 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"),
1540 %! "a_b]c{d}e-f=g")
1541 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"),
1542 %! "a_b_c_d_e_f_g")
1543 
1544 ## Option combinations
1545 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"),
1546 %! "a_b]c{d}e-f=g")
1547 
1548 ## End conditions on replacement
1549 %!assert (regexprep ("abc", "(b)", ".$1"), "a.bc")
1550 %!assert (regexprep ("abc", "(b)", "$1"), "abc")
1551 %!assert (regexprep ("abc", "(b)", "$1."), "ab.c")
1552 %!assert (regexprep ("abc", "(b)", "$1.."), "ab..c")
1553 
1554 ## Test cell array arguments
1555 %!assert (regexprep ("abc", {"b","a"}, "?"), "??c")
1556 %!assert (regexprep ({"abc","cba"}, "b", "?"), {"a?c","c?a"})
1557 %!assert (regexprep ({"abc","cba"}, {"b","a"}, {"?","!"}), {"!?c","c?!"})
1558 
1559 ## Nasty lookbehind expression
1560 %!test
1561 %! warning ("off", "Octave:regexp-lookbehind-limit", "local");
1562 %! assert (regexprep ('x^(-1)+y(-1)+z(-1)=0', '(?<=[a-z]+)\(\-[1-9]*\)',
1563 %! '_minus1'),'x^(-1)+y_minus1+z_minus1=0');
1564 
1565 ## Verify escape sequences in pattern
1566 %!assert (regexprep ("\n", '\n', "X"), "X")
1567 %!assert (regexprep ("\n", "\n", "X"), "X")
1568 
1569 ## Verify NULLs in pattern and replacement string
1570 %!assert (regexprep ("A\0A", "\0", ","), "A,A")
1571 %!assert (regexprep ("A\0A", '\0', ","), "A,A")
1572 %!assert (regexprep ("A,A", "A", "B\0B"), "B\0B,B\0B")
1573 %!assert (regexprep ("A,A", "A", 'B\0B'), "B\0B,B\0B")
1574 
1575 ## Empty matches were broken on ARM architecture
1576 %!test <*52810>
1577 %! assert (strcmp (regexprep ("\nabc", "^(\t*)(abc)$", "$1$2", "lineanchors"), "\nabc"))
1578 */
charNDArray min(char d, const charNDArray &m)
Definition: chNDArray.cc:207
void resize(const dim_vector &dv, const T &rfv)
Size of the specified dimension.
Definition: Array.cc:1011
octave_idx_type numel(void) const
Number of elements in the array.
Definition: Array.h:377
const dim_vector & dims(void) const
Return a const-reference so that dims ()(i) works efficiently.
Definition: Array.h:453
Definition: Cell.h:43
Definition: dMatrix.h:42
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:95
octave_idx_type numel(int n=0) const
Number of elements that a matrix with this dimensions would have.
Definition: dim-vector.h:401
size_t size(void) const
Definition: base-list.h:52
iterator begin(void)
Definition: base-list.h:65
string_vector named_patterns(void) const
Definition: lo-regexp.h:213
void lineanchors(bool val)
Definition: lo-regexp.h:143
void case_insensitive(bool val)
Definition: lo-regexp.h:139
void emptymatch(bool val)
Definition: lo-regexp.h:141
void once(bool val)
Definition: lo-regexp.h:144
void freespacing(bool val)
Definition: lo-regexp.h:142
void dotexceptnewline(bool val)
Definition: lo-regexp.h:140
match_data match(const std::string &buffer)
Definition: lo-regexp.cc:250
std::string replace(const std::string &buffer, const std::string &replacement)
Definition: lo-regexp.cc:468
void assign(const std::string &k, const Cell &val)
Definition: oct-map.h:365
Cell cell_value(void) const
Definition: ovl.h:105
void resize(octave_idx_type n, const octave_value &rfv=octave_value())
Definition: ovl.h:117
octave_idx_type length(void) const
Definition: ovl.h:113
octave_idx_type numel(void) const
Definition: str-vec.h:100
OCTINTERP_API void print_usage(void)
Definition: defun.cc:53
#define DEFUN(name, args_name, nargout_name, doc)
Macro to define a builtin function.
Definition: defun.h:56
void warning(const char *fmt,...)
Definition: error.cc:1050
void error(const char *fmt,...)
Definition: error.cc:968
ColumnVector transform(const Matrix &m, double x, double y, double z)
Definition: graphics.cc:5814
octave_idx_type n
Definition: mx-inlines.cc:753
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:44
return octave_value(v1.char_array_value() . concat(v2.char_array_value(), ra_idx),((a1.is_sq_string()||a2.is_sq_string()) ? '\'' :'"'))
octave_value::octave_value(const Array< char > &chm, char type) return retval
Definition: ov.cc:811
octave_value_list ovl(const OV_Args &... args)
Construct an octave_value_list with less typing.
Definition: ovl.h:211
static octave_value_list octregexp(const octave_value_list &args, int nargout, const std::string &who, bool case_insensitive=false)
Definition: regexp.cc:358
static void parse_options(octave::regexp::opts &options, const octave_value_list &args, const std::string &who, int skip, bool &extra_args)
Definition: regexp.cc:309
static octave_value octregexprep(const octave_value_list &args, const std::string &who)
Definition: regexp.cc:1350
static std::string do_regexp_rep_string_escapes(const std::string &s)
Definition: regexp.cc:141
static std::string do_regexp_ptn_string_escapes(const std::string &s, bool is_sq_str)
Definition: regexp.cc:55
static octave_value_list octcellregexp(const octave_value_list &args, int nargout, const std::string &who, bool case_insensitive=false)
Definition: regexp.cc:542
F77_RET_T len
Definition: xerbla.cc:61