GNU Octave  9.1.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
regexp.cc
Go to the documentation of this file.
1 ////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (C) 2002-2024 The Octave Project Developers
4 //
5 // See the file COPYRIGHT.md in the top-level directory of this
6 // distribution or <https://octave.org/copyright/>.
7 //
8 // This file is part of Octave.
9 //
10 // Octave is free software: you can redistribute it and/or modify it
11 // under the terms of the GNU General Public License as published by
12 // the Free Software Foundation, either version 3 of the License, or
13 // (at your option) any later version.
14 //
15 // Octave is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 // GNU General Public License for more details.
19 //
20 // You should have received a copy of the GNU General Public License
21 // along with Octave; see the file COPYING. If not, see
22 // <https://www.gnu.org/licenses/>.
23 //
24 ////////////////////////////////////////////////////////////////////////
25 
26 #if defined (HAVE_CONFIG_H)
27 # include "config.h"
28 #endif
29 
30 #include <list>
31 #include <sstream>
32 
33 #include "base-list.h"
34 #include "oct-locbuf.h"
35 #include "quit.h"
36 #include "lo-regexp.h"
37 #include "str-vec.h"
38 
39 #include "defun.h"
40 #include "Cell.h"
41 #include "error.h"
42 #include "errwarn.h"
43 #include "oct-map.h"
44 #include "ovl.h"
45 #include "utils.h"
46 
48 
49 // Replace backslash escapes in a string with the real values. We need
50 // two special functions instead of the one in utils.cc because the set
51 // of escape sequences used for regexp patterns and replacement strings
52 // is different from those used in the *printf functions.
53 
54 static std::string
55 do_regexp_ptn_string_escapes (const std::string& s, bool is_sq_str)
56 {
57  std::string retval;
58 
59  std::size_t i = 0;
60  std::size_t j = 0;
61  std::size_t len = s.length ();
62 
63  retval.resize (len);
64 
65  while (j < len)
66  {
67  if (s[j] == '\\' && j+1 < len)
68  {
69  switch (s[++j])
70  {
71  case 'b': // backspace
72  if (is_sq_str)
73  retval[i] = '\b';
74  else
75  {
76  // Pass escape sequence through
77  retval[i] = '\\';
78  retval[++i] = 'b';
79  }
80  break;
81 
82  // Translate < and > to PCRE patterns for pseudo-word boundary
83  case '<': // begin word boundary
84  retval.insert (i, "(?<=\\W|^)");
85  i += 8;
86  break;
87 
88  case '>': // end word boundary
89  retval.insert (i, "(?=\\W|$)");
90  i += 7;
91  break;
92 
93  case 'o': // octal input
94  {
95  bool bad_esc_seq = (j+1 >= len);
96 
97  bool brace = false;
98  if (! bad_esc_seq && s[++j] == '{')
99  {
100  brace = true;
101  j++;
102  }
103 
104  int tmpi = 0;
105  std::size_t k;
106  for (k = j; k < std::min (j+3+brace, len); k++)
107  {
108  int digit = s[k] - '0';
109  if (digit < 0 || digit > 7)
110  break;
111  tmpi <<= 3;
112  tmpi += digit;
113  }
114  if (bad_esc_seq || (brace && s[k++] != '}'))
115  {
116  tmpi = 0;
117  warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
118  }
119  retval[i] = tmpi;
120  j = k - 1;
121  break;
122  }
123 
124  default: // pass escape sequence through
125  retval[i] = '\\';
126  retval[++i] = s[j];
127  break;
128  }
129  }
130  else
131  {
132  retval[i] = s[j];
133  }
134 
135  i++;
136  j++;
137  }
138 
139  retval.resize (i);
140 
141  return retval;
142 }
143 
144 static std::string
145 do_regexp_rep_string_escapes (const std::string& s)
146 {
147  std::string retval;
148 
149  std::size_t i = 0;
150  std::size_t j = 0;
151  std::size_t len = s.length ();
152 
153  retval.resize (len);
154 
155  while (j < len)
156  {
157  if (s[j] == '\\' && j+1 < len)
158  {
159  switch (s[++j])
160  {
161  case 'a': // alarm
162  retval[i] = '\a';
163  break;
164 
165  case 'b': // backspace
166  retval[i] = '\b';
167  break;
168 
169  case 'f': // formfeed
170  retval[i] = '\f';
171  break;
172 
173  case 'n': // newline
174  retval[i] = '\n';
175  break;
176 
177  case 'r': // carriage return
178  retval[i] = '\r';
179  break;
180 
181  case 't': // horizontal tab
182  retval[i] = '\t';
183  break;
184 
185  case 'v': // vertical tab
186  retval[i] = '\v';
187  break;
188 
189  case '0':
190  case '1':
191  case '2':
192  case '3':
193  case '4':
194  case '5':
195  case '6':
196  case '7': // octal input
197  {
198  std::size_t k;
199  int tmpi = s[j] - '0';
200  for (k = j+1; k < std::min (j+3, len); k++)
201  {
202  int digit = s[k] - '0';
203  if (digit < 0 || digit > 7)
204  break;
205  tmpi <<= 3;
206  tmpi += digit;
207  }
208  retval[i] = tmpi;
209  j = k - 1;
210  break;
211  }
212 
213  case 'o': // octal input
214  {
215  bool bad_esc_seq = (j+1 >= len);
216 
217  bool brace = false;
218  if (! bad_esc_seq && s[++j] == '{')
219  {
220  brace = true;
221  j++;
222  }
223 
224  int tmpi = 0;
225  std::size_t k;
226  for (k = j; k < std::min (j+3+brace, len); k++)
227  {
228  int digit = s[k] - '0';
229  if (digit < 0 || digit > 7)
230  break;
231  tmpi <<= 3;
232  tmpi += digit;
233  }
234  if (bad_esc_seq || (brace && s[k++] != '}'))
235  {
236  warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
237  tmpi = 0;
238  }
239  retval[i] = tmpi;
240  j = k - 1;
241  break;
242  }
243 
244  case 'x': // hex input
245  {
246  bool bad_esc_seq = (j+1 >= len);
247 
248  bool brace = false;
249  if (! bad_esc_seq && s[++j] == '{')
250  {
251  brace = true;
252  j++;
253  }
254 
255  int tmpi = 0;
256  std::size_t k;
257  for (k = j; k < std::min (j+2+brace, len); k++)
258  {
259  if (! isxdigit (s[k]))
260  break;
261 
262  tmpi <<= 4;
263  int digit = s[k];
264  if (digit >= 'a')
265  tmpi += digit - 'a' + 10;
266  else if (digit >= 'A')
267  tmpi += digit - 'A' + 10;
268  else
269  tmpi += digit - '0';
270  }
271  if (bad_esc_seq || (brace && s[k++] != '}'))
272  {
273  warning (R"(malformed hex escape sequence '\x' -- converting to '\0')");
274  tmpi = 0;
275  }
276  retval[i] = tmpi;
277  j = k - 1;
278  break;
279  }
280 
281  // Both dollar sign (for capture buffer) and backslash are
282  // passed through with their escape backslash. The processing
283  // for these must occur during the actual replacement operation
284  // in lo-regexp.cc.
285  case '$': // pass dollar sign through with escape
286  retval[i] = '\\'; retval[++i] = '$';
287  break;
288 
289  case '\\': // pass backslash through with escape
290  retval[i] = '\\'; retval[++i] = '\\';
291  break;
292 
293  default: // convert escaped character to unescaped char
294  retval[i] = s[j];
295  break;
296  }
297  }
298  else
299  {
300  retval[i] = s[j];
301  }
302 
303  i++;
304  j++;
305  }
306 
307  retval.resize (i);
308 
309  return retval;
310 }
311 
312 static void
313 parse_options (regexp::opts& options, const octave_value_list& args,
314  const std::string& who, int skip, bool& extra_args)
315 {
316  extra_args = false;
317 
318  for (int i = skip; i < args.length (); i++)
319  {
320  std::string str;
321 
322  str = args(i).xstring_value ("%s: optional arguments must be strings", who.c_str ());
323 
324  std::transform (str.begin (), str.end (), str.begin (), tolower);
325 
326  if (str.find ("once", 0) == 0)
327  options.once (true);
328  else if (str.find ("matchcase", 0) == 0)
329  options.case_insensitive (false);
330  else if (str.find ("ignorecase", 0) == 0)
331  options.case_insensitive (true);
332  else if (str.find ("dotall", 0) == 0)
333  options.dotexceptnewline (false);
334  else if (str.find ("stringanchors", 0) == 0)
335  options.lineanchors (false);
336  else if (str.find ("literalspacing", 0) == 0)
337  options.freespacing (false);
338  else if (str.find ("noemptymatch", 0) == 0)
339  options.emptymatch (false);
340  else if (str.find ("dotexceptnewline", 0) == 0)
341  options.dotexceptnewline (true);
342  else if (str.find ("lineanchors", 0) == 0)
343  options.lineanchors (true);
344  else if (str.find ("freespacing", 0) == 0)
345  options.freespacing (true);
346  else if (str.find ("emptymatch", 0) == 0)
347  options.emptymatch (true);
348  else if (str.find ("start", 0) == 0
349  || str.find ("end", 0) == 0
350  || str.find ("tokenextents", 0) == 0
351  || str.find ("match", 0) == 0
352  || str.find ("tokens", 0) == 0
353  || str.find ("names", 0) == 0
354  || str.find ("split", 0) == 0)
355  extra_args = true;
356  else
357  error ("%s: unrecognized option", who.c_str ());
358  }
359 }
360 
361 static octave_value_list
362 octregexp (const octave_value_list& args, int nargout,
363  const std::string& who, bool case_insensitive = false)
364 {
365  octave_value_list retval;
366 
367  int nargin = args.length ();
368 
369  // Make sure we have string, pattern
370  const std::string buffer = args(0).string_value ();
371 
372  std::string pattern = args(1).string_value ();
373 
374  // Rewrite pattern for PCRE
375  pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());
376 
377  regexp::opts options;
378  options.case_insensitive (case_insensitive);
379  bool extra_options = false;
380  parse_options (options, args, who, 2, extra_options);
381 
382  const regexp::match_data rx_lst
383  = regexp::match (pattern, buffer, options, who);
384 
385  string_vector named_pats = rx_lst.named_patterns ();
386 
387  std::size_t sz = rx_lst.size ();
388 
389  // Converted the linked list in the correct form for the return values
390 
391  octave_map nmap (dim_vector ((sz == 0 ? 0 : 1), sz), named_pats);
392 
393  retval.resize (7);
394 
395  if (sz != 0)
396  {
397  for (int j = 0; j < named_pats.numel (); j++)
398  {
399  Cell ctmp (dim_vector (1, sz));
400  octave_idx_type i = 0;
401 
402  for (const auto& match_data : rx_lst)
403  {
404  string_vector named_tokens = match_data.named_tokens ();
405 
406  ctmp(i++) = named_tokens(j);
407  }
408 
409  nmap.assign (named_pats(j), ctmp);
410  }
411  }
412  retval(5) = nmap;
413 
414  if (options.once ())
415  {
416  auto p = rx_lst.begin ();
417 
418  retval(4) = (sz ? p->tokens () : Cell ());
419  retval(3) = (sz ? p->match_string () : "");
420  retval(2) = (sz ? p->token_extents () : Matrix ());
421 
422  if (sz)
423  {
424  double start = p->start ();
425  double end = p->end ();
426 
427  Cell split (dim_vector (1, 2));
428  split(0) = buffer.substr (0, start-1);
429  split(1) = buffer.substr (end);
430 
431  retval(6) = split;
432  retval(1) = end;
433  retval(0) = start;
434  }
435  else
436  {
437  retval(6) = buffer;
438  retval(1) = Matrix ();
439  retval(0) = Matrix ();
440  }
441  }
442  else
443  {
444  Cell tokens (dim_vector (1, sz));
445  Cell match_string (dim_vector (1, sz));
446  Cell token_extents (dim_vector (1, sz));
447  NDArray end (dim_vector (1, sz));
448  NDArray start (dim_vector (1, sz));
449  Cell split (dim_vector (1, sz+1));
450  std::size_t sp_start = 0;
451 
452  octave_idx_type i = 0;
453  for (const auto& match_data : rx_lst)
454  {
455  double s = match_data.start ();
456  double e = match_data.end ();
457 
458  string_vector tmp = match_data.tokens ();
459  tokens(i) = Cell (dim_vector (1, tmp.numel ()), tmp);
460  match_string(i) = match_data.match_string ();
461  token_extents(i) = match_data.token_extents ();
462  end(i) = e;
463  start(i) = s;
464  split(i) = buffer.substr (sp_start, s-sp_start-1);
465  sp_start = e;
466  i++;
467  }
468 
469  split(i) = buffer.substr (sp_start);
470 
471  retval(6) = split;
472  retval(4) = tokens;
473  retval(3) = match_string;
474  retval(2) = token_extents;
475  retval(1) = end;
476  retval(0) = start;
477  }
478 
479  // Alter the order of the output arguments
480 
481  if (extra_options)
482  {
483  int n = 0;
484  octave_value_list new_retval;
485  new_retval.resize (nargout);
486 
487  bool arg_used[7] {};
488 
489  for (int j = 2; j < nargin; j++)
490  {
491  int k = 0;
492  std::string str = args(j).string_value ();
493  std::transform (str.begin (), str.end (), str.begin (), tolower);
494 
495  if (str.find ("once", 0) == 0
496  || str.find ("stringanchors", 0) == 0
497  || str.find ("lineanchors", 0) == 0
498  || str.find ("matchcase", 0) == 0
499  || str.find ("ignorecase", 0) == 0
500  || str.find ("dotall", 0) == 0
501  || str.find ("dotexceptnewline", 0) == 0
502  || str.find ("literalspacing", 0) == 0
503  || str.find ("freespacing", 0) == 0
504  || str.find ("noemptymatch", 0) == 0
505  || str.find ("emptymatch", 0) == 0)
506  continue;
507  else if (str.find ("start", 0) == 0)
508  k = 0;
509  else if (str.find ("end", 0) == 0)
510  k = 1;
511  else if (str.find ("tokenextents", 0) == 0)
512  k = 2;
513  else if (str.find ("match", 0) == 0)
514  k = 3;
515  else if (str.find ("tokens", 0) == 0)
516  k = 4;
517  else if (str.find ("names", 0) == 0)
518  k = 5;
519  else if (str.find ("split", 0) == 0)
520  k = 6;
521 
522  new_retval(n++) = retval(k);
523  arg_used[k] = true;
524 
525  if (n == nargout)
526  break;
527  }
528 
529  // Fill in the rest of the arguments
530  if (n < nargout)
531  {
532  for (int j = 0; j < 7; j++)
533  {
534  if (! arg_used[j])
535  new_retval(n++) = retval(j);
536  }
537  }
538 
539  retval = new_retval;
540  }
541 
542  return retval;
543 }
544 
545 static octave_value_list
546 octcellregexp (const octave_value_list& args, int nargout,
547  const std::string& who, bool case_insensitive = false)
548 {
549  octave_value_list retval;
550 
551  if (args(0).iscell ())
552  {
553  OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
554  octave_value_list new_args = args;
555  Cell cellstr = args(0).cell_value ();
556  if (args(1).iscell ())
557  {
558  Cell cellpat = args(1).cell_value ();
559 
560  if (cellpat.numel () == 1)
561  {
562  for (int j = 0; j < nargout; j++)
563  newretval[j].resize (cellstr.dims ());
564 
565  new_args(1) = cellpat(0);
566 
567  for (octave_idx_type i = 0; i < cellstr.numel (); i++)
568  {
569  new_args(0) = cellstr(i);
570  octave_value_list tmp = octregexp (new_args, nargout, who,
571  case_insensitive);
572 
573  for (int j = 0; j < nargout; j++)
574  newretval[j](i) = tmp(j);
575  }
576  }
577  else if (cellstr.numel () == 1)
578  {
579  for (int j = 0; j < nargout; j++)
580  newretval[j].resize (cellpat.dims ());
581 
582  new_args(0) = cellstr(0);
583 
584  for (octave_idx_type i = 0; i < cellpat.numel (); i++)
585  {
586  new_args(1) = cellpat(i);
587  octave_value_list tmp = octregexp (new_args, nargout, who,
588  case_insensitive);
589 
590  for (int j = 0; j < nargout; j++)
591  newretval[j](i) = tmp(j);
592  }
593  }
594  else if (cellstr.numel () == cellpat.numel ())
595  {
596  if (cellstr.dims () != cellpat.dims ())
597  error ("%s: inconsistent cell array dimensions", who.c_str ());
598 
599  for (int j = 0; j < nargout; j++)
600  newretval[j].resize (cellstr.dims ());
601 
602  for (octave_idx_type i = 0; i < cellstr.numel (); i++)
603  {
604  new_args(0) = cellstr(i);
605  new_args(1) = cellpat(i);
606 
607  octave_value_list tmp = octregexp (new_args, nargout, who,
608  case_insensitive);
609 
610  for (int j = 0; j < nargout; j++)
611  newretval[j](i) = tmp(j);
612  }
613  }
614  else
615  error ("regexp: cell array arguments must be scalar or equal size");
616  }
617  else
618  {
619  for (int j = 0; j < nargout; j++)
620  newretval[j].resize (cellstr.dims ());
621 
622  for (octave_idx_type i = 0; i < cellstr.numel (); i++)
623  {
624  new_args(0) = cellstr(i);
625  octave_value_list tmp = octregexp (new_args, nargout, who,
626  case_insensitive);
627 
628  for (int j = 0; j < nargout; j++)
629  newretval[j](i) = tmp(j);
630  }
631  }
632 
633  for (int j = 0; j < nargout; j++)
634  retval(j) = octave_value (newretval[j]);
635  }
636  else if (args(1).iscell ())
637  {
638  OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
639  octave_value_list new_args = args;
640  Cell cellpat = args(1).cell_value ();
641 
642  for (int j = 0; j < nargout; j++)
643  newretval[j].resize (cellpat.dims ());
644 
645  for (octave_idx_type i = 0; i < cellpat.numel (); i++)
646  {
647  new_args(1) = cellpat(i);
648  octave_value_list tmp = octregexp (new_args, nargout, who,
649  case_insensitive);
650 
651  for (int j = 0; j < nargout; j++)
652  newretval[j](i) = tmp(j);
653  }
654 
655  for (int j = 0; j < nargout; j++)
656  retval(j) = octave_value (newretval[j]);
657  }
658  else
659  retval = octregexp (args, nargout, who, case_insensitive);
660 
661  return retval;
662 
663 }
664 
665 DEFUN (regexp, args, nargout,
666  doc: /* -*- texinfo -*-
667 @deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexp (@var{str}, @var{pat})
668 @deftypefnx {} {[@dots{}] =} regexp (@var{str}, @var{pat}, "@var{opt1}", @dots{})
669 Regular expression string matching.
670 
671 Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
672 substrings of any matches, or empty values if there are none.
673 
674 The matched pattern @var{pat} can include any of the standard regex
675 operators, including:
676 
677 @table @code
678 @item .
679 Match any character
680 
681 @item * + ? @{@}
682 Repetition operators, representing
683 
684 @table @code
685 @item *
686 Match zero or more times
687 
688 @item +
689 Match one or more times
690 
691 @item ?
692 Match zero or one times
693 
694 @item @{@var{n}@}
695 Match exactly @var{n} times
696 
697 @item @{@var{n},@}
698 Match @var{n} or more times
699 
700 @item @{@var{m},@var{n}@}
701 Match between @var{m} and @var{n} times
702 @end table
703 
704 @item [@dots{}] [^@dots{}]
705 
706 List operators. The pattern will match any character listed between
707 @qcode{"["} and @qcode{"]"}. If the first character is @qcode{"^"} then the
708 pattern is inverted and any character except those listed between brackets
709 will match.
710 
711 Escape sequences defined below can also be used inside list operators. For
712 example, a template for a floating point number might be @code{[-+.\d]+}.
713 
714 @item () (?:)
715 Grouping operator. The first form, parentheses only, also creates a token.
716 
717 @item |
718 Alternation operator. Match one of a choice of regular expressions. The
719 alternatives must be delimited by the grouping operator @code{()} above.
720 
721 @item ^ $
722 Anchoring operators. Requires pattern to occur at the start (@code{^}) or
723 end (@code{$}) of the string.
724 @end table
725 
726 In addition, the following escaped characters have special meaning.
727 
728 @table @code
729 
730 @item \d
731 Match any digit
732 
733 @item \D
734 Match any non-digit
735 
736 @item \s
737 Match any whitespace character
738 
739 @item \S
740 Match any non-whitespace character
741 
742 @item \w
743 Match any word character
744 
745 @item \W
746 Match any non-word character
747 
748 @item <
749 Match the beginning of a word
750 
751 @item >
752 Match the end of a word
753 
754 @item \B
755 Match within a word
756 @end table
757 
758 Implementation Note: For compatibility with @sc{matlab}, escape sequences
759 in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded
760 even when @var{pat} has been defined with single quotes. To disable
761 expansion use a second backslash before the escape sequence (e.g.,
762 "@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate}
763 function.
764 
765 The outputs of @code{regexp} default to the order given below
766 
767 @table @var
768 @item s
769 The start indices of each matching substring
770 
771 @item e
772 The end indices of each matching substring
773 
774 @item te
775 The extents of each matched token surrounded by @code{(@dots{})} in
776 @var{pat}
777 
778 @item m
779 A cell array of the text of each match
780 
781 @item t
782 A cell array of the text of each token matched
783 
784 @item nm
785 A structure containing the text of each matched named token, with the name
786 being used as the fieldname. A named token is denoted by
787 @code{(?<name>@dots{})}.
788 
789 @item sp
790 A cell array of the text not returned by match, i.e., what remains if you
791 split the string based on @var{pat}.
792 @end table
793 
794 Particular output arguments, or the order of the output arguments, can be
795 selected by additional @var{opt} arguments. These are strings and the
796 correspondence between the output arguments and the optional argument
797 are
798 
799 @multitable @columnfractions 0.2 0.3 0.3 0.2
800 @item @tab @qcode{'start'} @tab @var{s} @tab
801 @item @tab @qcode{'end'} @tab @var{e} @tab
802 @item @tab @qcode{'tokenExtents'} @tab @var{te} @tab
803 @item @tab @qcode{'match'} @tab @var{m} @tab
804 @item @tab @qcode{'tokens'} @tab @var{t} @tab
805 @item @tab @qcode{'names'} @tab @var{nm} @tab
806 @item @tab @qcode{'split'} @tab @var{sp} @tab
807 @end multitable
808 
809 Additional arguments are summarized below.
810 
811 @table @samp
812 @item once
813 Return only the first occurrence of the pattern.
814 
815 @item matchcase
816 Make the matching case sensitive. (default)
817 
818 Alternatively, use (?-i) in the pattern.
819 
820 @item ignorecase
821 Ignore case when matching the pattern to the string.
822 
823 Alternatively, use (?i) in the pattern.
824 
825 @item stringanchors
826 Match the anchor characters at the beginning and end of the string.
827 (default)
828 
829 Alternatively, use (?-m) in the pattern.
830 
831 @item lineanchors
832 Match the anchor characters at the beginning and end of the line.
833 
834 Alternatively, use (?m) in the pattern.
835 
836 @item dotall
837 The pattern @code{.} matches all characters including the newline character.
838  (default)
839 
840 Alternatively, use (?s) in the pattern.
841 
842 @item dotexceptnewline
843 The pattern @code{.} matches all characters except the newline character.
844 
845 Alternatively, use (?-s) in the pattern.
846 
847 @item literalspacing
848 All characters in the pattern, including whitespace, are significant and are
849 used in pattern matching. (default)
850 
851 Alternatively, use (?-x) in the pattern.
852 
853 @item freespacing
854 The pattern may include arbitrary whitespace and also comments beginning
855 with the character @samp{#}.
856 
857 Alternatively, use (?x) in the pattern.
858 
859 @item noemptymatch
860 Zero-length matches are not returned. (default)
861 
862 @item emptymatch
863 Return zero-length matches.
864 
865 @code{regexp ('a', 'b*', 'emptymatch')} returns @code{[1 2]} because there
866 are zero or more @qcode{'b'} characters at positions 1 and end-of-string.
867 
868 @end table
869 
870 Stack Limitation Note: Pattern searches are done with a recursive function
871 which can overflow the program stack when there are a high number of matches.
872 For example,
873 
874 @example
875 @code{regexp (repmat ('a', 1, 1e5), '(a)+')}
876 @end example
877 
878 @noindent
879 may lead to a segfault. As an alternative, consider constructing pattern
880 searches that reduce the number of matches (e.g., by creatively using set
881 complement), and then further processing the return variables (now reduced in
882 size) with successive @code{regexp} searches.
883 
884 Octave's @code{regexp} implementation is based on the Perl Compatible
885 Regular Expressions library (@url{https://www.pcre.org/}). For a more
886 comprehensive list of @code{regexp} operator syntax see the
887 @url{https://www.pcre.org/current/doc/html/pcre2syntax.html,,
888 "PCRE Syntax quick-reference summary"}.
889 
890 @seealso{regexpi, strfind, regexprep}
891 @end deftypefn */)
892 {
893  if (args.length () < 2)
894  print_usage ();
895 
896  octave_value_list retval;
897 
898  if (args(0).iscell () || args(1).iscell ())
899  retval = (octcellregexp (args, (nargout > 0 ? nargout : 1), "regexp"));
900  else
901  retval = octregexp (args, nargout, "regexp");
902 
903  return retval;
904 }
905 
906 /*
907 ## PCRE_ERROR_MATCHLIMIT test
908 %!test
909 %! s = sprintf ('\t4\n0000\t-0.00\t-0.0000\t4\t-0.00\t-0.0000\t4\n0000\t-0.00\t-0.0000\t0\t-0.00\t-');
910 %! ws = warning ("query");
911 %! unwind_protect
912 %! warning ("off");
913 %! regexp (s, '(\s*-*\d+[.]*\d*\s*)+\n');
914 %! unwind_protect_cleanup
915 %! warning (ws);
916 %! end_unwind_protect
917 
918 ## segfault test
919 %!assert (regexp ("abcde", "."), [1,2,3,4,5])
920 %!assert <*62704> (regexpi('(', '\(?'), 1)
921 ## Infinite loop test
922 %!assert (isempty (regexp ("abcde", "")))
923 
924 ## Check that anchoring of pattern works correctly
925 %!assert (regexp ('abcabc', '^abc'), 1)
926 %!assert (regexp ('abcabc', 'abc$'), 4)
927 %!assert (regexp ('abcabc', '^abc$'), zeros (1,0))
928 
929 ## UTF-8 test with character vector "âé🙂ïõù"
930 %!assert (regexp (char ([195, 162, 195, 169, 240, 159, 153, 130, 195, 175, ...
931 %! 195, 181, 195, 185]), "."), [1, 3, 5, 9, 11, 13])
932 
933 %!test
934 %! [s, e, te, m, t] = regexp (' No Match ', 'f(.*)uck');
935 %! assert (s, zeros (1,0));
936 %! assert (e, zeros (1,0));
937 %! assert (te, cell (1,0));
938 %! assert (m, cell (1,0));
939 %! assert (t, cell (1,0));
940 
941 %!test
942 %! [s, e, te, m, t] = regexp (' FiRetrUck ', 'f(.*)uck');
943 %! assert (s, zeros (1,0));
944 %! assert (e, zeros (1,0));
945 %! assert (te, cell (1,0));
946 %! assert (m, cell (1,0));
947 %! assert (t, cell (1,0));
948 
949 %!test
950 %! [s, e, te, m, t] = regexp (' firetruck ', 'f(.*)uck');
951 %! assert (s, 2);
952 %! assert (e, 10);
953 %! assert (te{1}, [3, 7]);
954 %! assert (m{1}, 'firetruck');
955 %! assert (t{1}{1}, 'iretr');
956 
957 %!test
958 %! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*');
959 %! assert (s, [1, 12]);
960 %! assert (e, [5, 17]);
961 %! assert (size (te), [1, 2]);
962 %! assert (isempty (te{1}));
963 %! assert (isempty (te{2}));
964 %! assert (m{1}, 'short');
965 %! assert (m{2}, 'string');
966 %! assert (size (t), [1, 2]);
967 %! assert (isempty (t{1}));
968 %! assert (isempty (t{2}));
969 
970 %!test
971 %! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*', 'once');
972 %! assert (s, 1);
973 %! assert (e, 5);
974 %! assert (isempty (te));
975 %! assert (m, 'short');
976 %! assert (isempty (t));
977 
978 %!test
979 %! [m, te, e, s, t] = regexp ('short test string', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
980 %! assert (s, 1);
981 %! assert (e, 5);
982 %! assert (isempty (te));
983 %! assert (m, 'short');
984 %! assert (isempty (t));
985 
986 %!test
987 %! [s, e, te, m, t, nm] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)');
988 %! assert (s, 1);
989 %! assert (e, 10);
990 %! assert (size (te), [1, 1]);
991 %! assert (te{1}, [1,5; 7,10]);
992 %! assert (m{1}, 'short test');
993 %! assert (size (t), [1, 1]);
994 %! assert (t{1}{1}, 'short');
995 %! assert (t{1}{2}, 'test');
996 %! assert (size (nm), [1, 1]);
997 %! assert (! isempty (fieldnames (nm)));
998 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
999 %! assert (nm.word1, 'short');
1000 %! assert (nm.word2, 'test');
1001 
1002 %!test
1003 %! [nm, m, te, e, s, t] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1004 %! assert (s, 1);
1005 %! assert (e, 10);
1006 %! assert (size (te), [1, 1]);
1007 %! assert (te{1}, [1,5; 7,10]);
1008 %! assert (m{1}, 'short test');
1009 %! assert (size (t), [1, 1]);
1010 %! assert (t{1}{1}, 'short');
1011 %! assert (t{1}{2}, 'test');
1012 %! assert (size (nm), [1, 1]);
1013 %! assert (! isempty (fieldnames (nm)));
1014 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
1015 %! assert (nm.word1, 'short');
1016 %! assert (nm.word2, 'test');
1017 
1018 %!test
1019 %! [t, nm] = regexp ("John Davis\nRogers, James", '(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)', 'tokens', 'names');
1020 %! assert (size (t), [1, 2]);
1021 %! assert (t{1}{1}, "John");
1022 %! assert (t{1}{2}, "Davis");
1023 %! assert (t{2}{1}, "Rogers");
1024 %! assert (t{2}{2}, "James");
1025 %! assert (size (nm), [1, 2]);
1026 %! assert (nm(1).first, "John");
1027 %! assert (nm(1).last, "Davis");
1028 %! assert (nm(2).first, "James");
1029 %! assert (nm(2).last, "Rogers");
1030 
1031 ## Tests for nulls in strings properly matching
1032 %!test
1033 %! str = "A\0B\0\0C";
1034 %! ptn = '(\0+)'; # also test null in single-quote pattern
1035 %! M = regexp (str, ptn, "match");
1036 %! assert (size (M), [1, 2]);
1037 %! assert (double (M{1}), [0]);
1038 %! assert (double (M{2}), [0, 0]);
1039 
1040 %!test
1041 %! str = "A\0B\0\0C";
1042 %! ptn = "(\0+)"; # also test null in double-quote pattern
1043 %! T = regexp (str, ptn, "tokens");
1044 %! assert (size (T), [1, 2]);
1045 %! assert (double (T{1}{1}), [0]);
1046 %! assert (double (T{2}{1}), [0, 0]);
1047 
1048 %!test
1049 %! str = "A\0B\0\0C";
1050 %! ptn = '(?<namedtoken>\0+)';
1051 %! NT = regexp (str, ptn, "names");
1052 %! assert (size (NT), [1, 2]);
1053 %! assert (double (NT(1).namedtoken), [0]);
1054 %! assert (double (NT(2).namedtoken), [0, 0]);
1055 
1056 ## Tests for named tokens
1057 %!test
1058 %! ## Parenthesis in named token (ie (int)) causes a problem
1059 %! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'),
1060 %! struct ('typestr', 'int'));
1061 
1062 %!test <*35683>
1063 %! ## Mix of named and unnamed tokens can cause segfault
1064 %! str = "abcde";
1065 %! ptn = '(?<T1>a)(\w+)(?<T2>d\w+)';
1066 %! tokens = regexp (str, ptn, "names");
1067 %! assert (isstruct (tokens) && numel (tokens) == 1);
1068 %! assert (tokens.T1, "a");
1069 %! assert (tokens.T2, "de");
1070 
1071 ## Test options to regexp
1072 %!assert (regexp ("abc\nabc", '.'), [1:7])
1073 %!assert (regexp ("abc\nabc", '.', 'dotall'), [1:7])
1074 %!test
1075 %! assert (regexp ("abc\nabc", '(?s).'), [1:7]);
1076 %! assert (regexp ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
1077 %! assert (regexp ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);
1078 
1079 %!assert (regexp ("caseCaSe", 'case'), 1)
1080 %!assert (regexp ("caseCaSe", 'case', "matchcase"), 1)
1081 %!assert (regexp ("caseCaSe", 'case', "ignorecase"), [1,5])
1082 %!test
1083 %! assert (regexp ("caseCaSe", '(?-i)case'), 1);
1084 %! assert (regexp ("caseCaSe", '(?i)case'), [1, 5]);
1085 
1086 %!assert (regexp ("abc\nabc", 'c$'), 7)
1087 %!assert (regexp ("abc\nabc", 'c$', "stringanchors"), 7)
1088 %!test
1089 %! assert (regexp ("abc\nabc", '(?-m)c$'), 7);
1090 %! assert (regexp ("abc\nabc", 'c$',"lineanchors"), [3, 7]);
1091 %! assert (regexp ("abc\nabc", '(?m)c$'), [3,7]);
1092 
1093 %!assert (regexp ("this word", 's w'), 4)
1094 %!assert (regexp ("this word", 's w', 'literalspacing'), 4)
1095 %!test
1096 %! assert (regexp ("this word", '(?-x)s w', 'literalspacing'), 4);
1097 %! assert (regexp ("this word", 's w', 'freespacing'), zeros (1,0));
1098 %! assert (regexp ("this word", '(?x)s w'), zeros (1,0));
1099 
1100 %!test
1101 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'noemptymatch');
1102 %! assert (s, [1 5]);
1103 %! assert (e, [3 5]);
1104 %! assert (te, { zeros(0,2), zeros(0,2) });
1105 %! assert (m, { "OCT", "V" });
1106 %! assert (t, { cell(1,0), cell(1,0) });
1107 %! assert (isempty (fieldnames (nm)));
1108 %! assert (sp, { "", "A", "E" });
1109 
1110 %!test
1111 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'noemptymatch');
1112 %! assert (s, [1 5]);
1113 %! assert (e, [3 5]);
1114 %! assert (te, { [1 3], [5 5] });
1115 %! assert (m, { "OCT", "V" });
1116 %! assert (t, { {"OCT"}, {"V"} });
1117 %! assert (isempty (fieldnames (nm)));
1118 %! assert (sp, { "", "A", "E" });
1119 
1120 %!test
1121 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'emptymatch');
1122 %! assert (s, [1 4 5 6 7]);
1123 %! assert (e, [3 3 5 5 6]);
1124 %! assert (te, repmat ({zeros(0,2)}, [1, 5]));
1125 %! assert (m, { "OCT", "", "V", "", "" });
1126 %! assert (t, repmat({cell(1,0)}, [1, 5]));
1127 %! assert (isempty (fieldnames (nm)));
1128 %! assert (sp, { "", "", "A", "", "E", "" });
1129 
1130 %!test
1131 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'emptymatch');
1132 %! assert (s, [1 4 5 6 7]);
1133 %! assert (e, [3 3 5 5 6]);
1134 %! assert (te, { [1 3], [4 3], [5 5], [6 5], [7 6] });
1135 %! assert (m, { "OCT", "", "V", "", "" });
1136 %! assert (t, { {"OCT"}, {""}, {"V"}, {""}, {""} });
1137 %! assert (isempty (fieldnames (nm)));
1138 %! assert (sp, { "", "", "A", "", "E", "" });
1139 
1140 %!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'),
1141 %! {6;[1,5,9];zeros(1,0)})
1142 %!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}),
1143 %! {6;[3,7];[1,9]})
1144 %!assert (regexp ('Strings', {'t','s'}), {2, 7})
1145 
1146 ## Test case for lookaround operators
1147 %!test
1148 %! assert (regexp ('Iraq', 'q(?!u)'), 4);
1149 %! assert (regexp ('quit', 'q(?!u)'), zeros (1, 0));
1150 %! assert (regexp ('quit', 'q(?=u)' , 'match'), {'q'});
1151 %! assert (regexp ("quit", 'q(?=u+)', 'match'), {'q'});
1152 %! assert (regexp ("qit", 'q(?=u+)', 'match'), cell (1, 0));
1153 %! assert (regexp ("qit", 'q(?=u*)', 'match'), {'q'});
1154 %! assert (regexp ('thingamabob', '(?<=a)b'), 9);
1155 
1156 ## Tests for split option.
1157 %!shared str
1158 %! str = "foo bar foo";
1159 %!test
1160 %! [a, b] = regexp (str, "f..", "match", "split");
1161 %! assert (a, {"foo", "foo"});
1162 %! assert (b, {"", " bar ", ""});
1163 %!test
1164 %! [a, b] = regexp (str, "f..", "match", "split", "once");
1165 %! assert (a, "foo");
1166 %! assert (b, {"", " bar foo"});
1167 %!test
1168 %! [a, b] = regexp (str, "fx.", "match", "split");
1169 %! assert (a, cell (1, 0));
1170 %! assert (b, {"foo bar foo"});
1171 %!test
1172 %! [a, b] = regexp (str, "fx.", "match", "split", "once");
1173 %! assert (a, "");
1174 %! assert (b, "foo bar foo");
1175 
1176 %!shared str
1177 %! str = "foo bar";
1178 %!test
1179 %! [a, b] = regexp (str, "f..", "match", "split");
1180 %! assert (a, {"foo"});
1181 %! assert (b, {"", " bar"});
1182 %!test
1183 %! [a, b] = regexp (str, "b..", "match", "split");
1184 %! assert (a, {"bar"});
1185 %! assert (b, {"foo ", ""});
1186 %!test
1187 %! [a, b] = regexp (str, "x", "match", "split");
1188 %! assert (a, cell (1, 0));
1189 %! assert (b, {"foo bar"});
1190 %!test
1191 %! [a, b] = regexp (str, "[o]+", "match", "split");
1192 %! assert (a, {"oo"});
1193 %! assert (b, {"f", " bar"});
1194 
1195 ## Test escape sequences are expanded even in single-quoted strings
1196 %!assert (regexp ("\n", '\n'), 1)
1197 %!assert (regexp ("\n", "\n"), 1)
1198 
1199 ## Test escape sequences are silently converted
1200 %!test <*45407>
1201 %! assert (regexprep ('s', 's', 'x\.y'), 'x.y');
1202 %! assert (regexprep ('s', '(s)', 'x\$1y'), 'x$1y');
1203 %! assert (regexprep ('s', '(s)', 'x\\$1y'), 'x\sy');
1204 
1205 ## Test start-of-word / end-of-word patterns for Matlab compatibility
1206 %!test <*59992>
1207 %! assert (regexp ('foo!+bar', '<\w'), [1, 6]);
1208 %! assert (regexp ('foo!+bar', '.>'), [3, 4, 8]);
1209 %! assert (regexp ('foo!+bar\nbar!+foo', '.>'), [3, 4, 8, 13, 14, 18]);
1210 %! assert (regexp ('foo!+bar\nbar!+foo', '<\w'), [1, 6, 10, 16]);
1211 
1212 ## Test "incomplete" named patterns
1213 %!assert <*62705> (regexpi ('<', '\(?<'), 1)
1214 %!assert <*62705> (regexpi ('<n>', '\(?<n>'), 1)
1215 %!assert <*62705> (regexpi ('<n>', '\(?<n>\)?'), 1)
1216 %!assert <62705> (regexpi ('<n>a', '\(?<n>a\)?'), 1)
1217 
1218 ## Test input validation
1219 %!error regexp ('string', 'tri', 'BadArg')
1220 %!error regexp ('string')
1221 
1222 */
1223 
1224 DEFUN (regexpi, args, nargout,
1225  doc: /* -*- texinfo -*-
1226 @deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexpi (@var{str}, @var{pat})
1227 @deftypefnx {} {[@dots{}] =} regexpi (@var{str}, @var{pat}, "@var{opt1}", @dots{})
1228 
1229 Case insensitive regular expression string matching.
1230 
1231 Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
1232 substrings of any matches, or empty values if there are none.
1233 @xref{XREFregexp,,@code{regexp}}, for details on the syntax of the search
1234 pattern.
1235 @seealso{regexp}
1236 @end deftypefn */)
1237 {
1238  if (args.length () < 2)
1239  print_usage ();
1240 
1241  if (args(0).iscell () || args(1).iscell ())
1242  return octcellregexp (args, (nargout > 0 ? nargout : 1), "regexpi", true);
1243  else
1244  return octregexp (args, nargout, "regexpi", true);
1245 }
1246 
1247 /*
1248 ## segfault test
1249 %!assert (regexpi ("abcde", "."), [1,2,3,4,5])
1250 
1251 ## Check that anchoring of pattern works correctly
1252 %!assert (regexpi ('abcabc', '^ABC'), 1)
1253 %!assert (regexpi ('abcabc', 'ABC$'), 4)
1254 %!assert (regexpi ('abcabc', '^ABC$'), zeros (1,0))
1255 
1256 %!test
1257 %! [s, e, te, m, t] = regexpi (' No Match ', 'f(.*)uck');
1258 %! assert (s, zeros (1,0));
1259 %! assert (e, zeros (1,0));
1260 %! assert (te, cell (1,0));
1261 %! assert (m, cell (1,0));
1262 %! assert (t, cell (1,0));
1263 
1264 %!test
1265 %! [s, e, te, m, t] = regexpi (' FiRetrUck ', 'f(.*)uck');
1266 %! assert (s, 2);
1267 %! assert (e, 10);
1268 %! assert (te{1}, [3, 7]);
1269 %! assert (m{1}, 'FiRetrUck');
1270 %! assert (t{1}{1}, 'iRetr');
1271 
1272 %!test
1273 %! [s, e, te, m, t] = regexpi (' firetruck ', 'f(.*)uck');
1274 %! assert (s, 2);
1275 %! assert (e, 10);
1276 %! assert (te{1}, [3, 7]);
1277 %! assert (m{1}, 'firetruck');
1278 %! assert (t{1}{1}, 'iretr');
1279 
1280 %!test
1281 %! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*');
1282 %! assert (s, [1, 12]);
1283 %! assert (e, [5, 17]);
1284 %! assert (size (te), [1, 2]);
1285 %! assert (isempty (te{1}));
1286 %! assert (isempty (te{2}));
1287 %! assert (m{1}, 'ShoRt');
1288 %! assert (m{2}, 'String');
1289 %! assert (size (t), [1, 2]);
1290 %! assert (isempty (t{1}));
1291 %! assert (isempty (t{2}));
1292 
1293 %!test
1294 %! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once');
1295 %! assert (s, 1);
1296 %! assert (e, 5);
1297 %! assert (isempty (te));
1298 %! assert (m, 'ShoRt');
1299 %! assert (isempty (t));
1300 
1301 %!test
1302 %! [m, te, e, s, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1303 %! assert (s, 1);
1304 %! assert (e, 5);
1305 %! assert (isempty (te));
1306 %! assert (m, 'ShoRt');
1307 %! assert (isempty (t));
1308 
1309 %!test
1310 %! [s, e, te, m, t, nm] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)');
1311 %! assert (s, 1);
1312 %! assert (e, 10);
1313 %! assert (size (te), [1, 1]);
1314 %! assert (te{1}, [1,5; 7,10]);
1315 %! assert (m{1}, 'ShoRt Test');
1316 %! assert (size (t), [1, 1]);
1317 %! assert (t{1}{1}, 'ShoRt');
1318 %! assert (t{1}{2}, 'Test');
1319 %! assert (size (nm), [1, 1]);
1320 %! assert (! isempty (fieldnames (nm)));
1321 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
1322 %! assert (nm.word1, 'ShoRt');
1323 %! assert (nm.word2, 'Test');
1324 
1325 %!test
1326 %! [nm, m, te, e, s, t] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1327 %! assert (s, 1);
1328 %! assert (e, 10);
1329 %! assert (size (te), [1, 1]);
1330 %! assert (te{1}, [1,5; 7,10]);
1331 %! assert (m{1}, 'ShoRt Test');
1332 %! assert (size (t), [1, 1]);
1333 %! assert (t{1}{1}, 'ShoRt');
1334 %! assert (t{1}{2}, 'Test');
1335 %! assert (size (nm), [1, 1]);
1336 %! assert (! isempty (fieldnames (nm)));
1337 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
1338 %! assert (nm.word1, 'ShoRt');
1339 %! assert (nm.word2, 'Test');
1340 
1341 %!assert (regexpi ("abc\nabc", '.'), [1:7])
1342 %!assert (regexpi ("abc\nabc", '.', 'dotall'), [1:7])
1343 %!test
1344 %! assert (regexpi ("abc\nabc", '(?s).'), [1:7]);
1345 %! assert (regexpi ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
1346 %! assert (regexpi ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);
1347 
1348 %!assert (regexpi ("caseCaSe", 'case'), [1, 5])
1349 %!assert (regexpi ("caseCaSe", 'case', "matchcase"), 1)
1350 %!assert (regexpi ("caseCaSe", 'case', "ignorecase"), [1, 5])
1351 %!test
1352 %! assert (regexpi ("caseCaSe", '(?-i)case'), 1);
1353 %! assert (regexpi ("caseCaSe", '(?i)case'), [1, 5]);
1354 
1355 %!assert (regexpi ("abc\nabc", 'C$'), 7)
1356 %!assert (regexpi ("abc\nabc", 'C$', "stringanchors"), 7)
1357 %!test
1358 %! assert (regexpi ("abc\nabc", '(?-m)C$'), 7);
1359 %! assert (regexpi ("abc\nabc", 'C$', "lineanchors"), [3, 7]);
1360 %! assert (regexpi ("abc\nabc", '(?m)C$'), [3, 7]);
1361 
1362 %!assert (regexpi ("this word", 'S w'), 4)
1363 %!assert (regexpi ("this word", 'S w', 'literalspacing'), 4)
1364 %!test
1365 %! assert (regexpi ("this word", '(?-x)S w', 'literalspacing'), 4);
1366 %! assert (regexpi ("this word", 'S w', 'freespacing'), zeros (1,0));
1367 %! assert (regexpi ("this word", '(?x)S w'), zeros (1,0));
1368 
1369 %!error regexpi ('string', 'tri', 'BadArg')
1370 %!error regexpi ('string')
1371 
1372 %!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'),
1373 %! {6;[1,5,9];zeros(1, 0)})
1374 %!assert (regexpi ({'asdfg-dfd', '-dfd-dfd-', 'qasfdfdaq'}, '-'),
1375 %! {6, [1,5,9], zeros(1,0)})
1376 %!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}),
1377 %! {6;[3,7];[1,9]})
1378 %!assert (regexpi ('Strings', {'t', 's'}), {2, [1, 7]})
1379 
1380 %!assert (regexpi ("\n", '\n'), 1)
1381 %!assert (regexpi ("\n", "\n"), 1)
1382 */
1383 
1384 static octave_value
1385 octregexprep (const octave_value_list& args, const std::string& who)
1386 {
1387  int nargin = args.length ();
1388 
1389  // Make sure we have string, pattern, replacement
1390  const std::string buffer = args(0).string_value ();
1391 
1392  std::string pattern = args(1).string_value ();
1393 
1394  // Rewrite pattern for PCRE
1395  pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());
1396 
1397  std::string replacement = args(2).string_value ();
1398 
1399  // Matlab compatibility.
1400  if (args(2).is_sq_string ())
1401  replacement = do_regexp_rep_string_escapes (replacement);
1402 
1403  // Pack options excluding 'tokenize' and various output
1404  // reordering strings into regexp arg list
1405  octave_value_list regexpargs (nargin-3, octave_value ());
1406 
1407  int len = 0;
1408  for (int i = 3; i < nargin; i++)
1409  {
1410  const std::string opt = args(i).string_value ();
1411  if (opt != "tokenize" && opt != "start" && opt != "end"
1412  && opt != "tokenextents" && opt != "match" && opt != "tokens"
1413  && opt != "names" && opt != "split" && opt != "warnings")
1414  {
1415  regexpargs(len++) = args(i);
1416  }
1417  }
1418  regexpargs.resize (len);
1419 
1420  regexp::opts options;
1421  bool extra_args = false;
1422  parse_options (options, regexpargs, who, 0, extra_args);
1423 
1424  return regexp::replace (pattern, buffer, replacement, options, who);
1425 }
1426 
1427 DEFUN (regexprep, args, ,
1428  doc: /* -*- texinfo -*-
1429 @deftypefn {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr})
1430 @deftypefnx {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr}, "@var{opt1}", @dots{})
1431 Replace occurrences of pattern @var{pat} in @var{string} with @var{repstr}.
1432 
1433 The pattern is a regular expression as documented for @code{regexp}.
1434 @xref{XREFregexp,,@code{regexp}}.
1435 
1436 All strings must be UTF-8 encoded.
1437 
1438 The replacement string may contain @code{$i}, which substitutes for the ith
1439 set of parentheses in the match string. For example,
1440 
1441 @example
1442 regexprep ("Bill Dunn", '(\w+) (\w+)', '$2, $1')
1443 @end example
1444 
1445 @noindent
1446 returns @qcode{"Dunn, Bill"}
1447 
1448 Options in addition to those of @code{regexp} are
1449 
1450 @table @samp
1451 
1452 @item once
1453 Replace only the first occurrence of @var{pat} in the result.
1454 
1455 @item warnings
1456 This option is present for compatibility but is ignored.
1457 
1458 @end table
1459 
1460 Implementation Note: For compatibility with @sc{matlab}, escape sequences
1461 in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded
1462 even when @var{pat} has been defined with single quotes. To disable
1463 expansion use a second backslash before the escape sequence (e.g.,
1464 "@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate}
1465 function.
1466 @seealso{regexp, regexpi, strrep}
1467 @end deftypefn */)
1468 {
1469  if (args.length () < 3)
1470  print_usage ();
1471 
1472  octave_value_list retval;
1473 
1474  if (args(0).iscell () || args(1).iscell () || args(2).iscell ())
1475  {
1476  Cell str, pat, rep;
1477  dim_vector dv0;
1478  dim_vector dv1 (1, 1);
1479 
1480  if (args(0).iscell ())
1481  str = args(0).cell_value ();
1482  else
1483  str = Cell (args(0));
1484 
1485  if (args(1).iscell ())
1486  pat = args(1).cell_value ();
1487  else
1488  pat = Cell (args(1));
1489 
1490  if (args(2).iscell ())
1491  rep = args(2).cell_value ();
1492  else
1493  rep = Cell (args(2));
1494 
1495  dv0 = str.dims ();
1496  if (pat.numel () != 1)
1497  {
1498  dv1 = pat.dims ();
1499  if (rep.numel () != 1 && dv1 != rep.dims ())
1500  error ("regexprep: inconsistent cell array dimensions");
1501  }
1502  else if (rep.numel () != 1)
1503  dv1 = rep.dims ();
1504 
1505  Cell ret (dv0);
1506  octave_value_list new_args = args;
1507 
1508  for (octave_idx_type i = 0; i < dv0.numel (); i++)
1509  {
1510  new_args(0) = str(i);
1511  if (pat.numel () == 1)
1512  new_args(1) = pat(0);
1513  if (rep.numel () == 1)
1514  new_args(2) = rep(0);
1515 
1516  for (octave_idx_type j = 0; j < dv1.numel (); j++)
1517  {
1518  if (pat.numel () != 1)
1519  new_args(1) = pat(j);
1520  if (rep.numel () != 1)
1521  new_args(2) = rep(j);
1522  new_args(0) = octregexprep (new_args, "regexprep");
1523  }
1524 
1525  ret(i) = new_args(0);
1526  }
1527 
1528  retval = (args(0).iscell () ? ovl (ret) : ovl (ret(0)));
1529  }
1530  else
1531  retval = octregexprep (args, "regexprep");
1532 
1533  return retval;
1534 }
1535 
1536 /*
1537 %!test # Replace with empty
1538 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1539 %! t = regexprep (xml, '<[!?][^>]*>', '');
1540 %! assert (t, ' <tag v="hello">some stuff</tag>');
1541 
1542 %!test # Replace with non-empty
1543 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1544 %! t = regexprep (xml, '<[!?][^>]*>', '?');
1545 %! assert (t, '? <tag v="hello">some stuff?</tag>');
1546 
1547 %!test # Check that 'tokenize' is ignored
1548 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1549 %! t = regexprep (xml, '<[!?][^>]*>', '', 'tokenize');
1550 %! assert (t, ' <tag v="hello">some stuff</tag>');
1551 
1552 ## Test capture replacement
1553 %!test
1554 %! data = "Bob Smith\nDavid Hollerith\nSam Jenkins";
1555 %! result = "Smith, Bob\nHollerith, David\nJenkins, Sam";
1556 %! t = regexprep (data, '(?m)^(\w+)\s+(\w+)$', '$2, $1');
1557 %! assert (t, result);
1558 
1559 ## Return the original if no match
1560 %!assert (regexprep ('hello', 'world', 'earth'), 'hello')
1561 
1562 ## Test emptymatch option
1563 %!assert (regexprep ('World', '^', 'Hello '), 'World')
1564 %!assert (regexprep ('World', '^', 'Hello ', 'emptymatch'), 'Hello World')
1565 
1566 ## Test a general replacement
1567 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g")
1568 
1569 ## Make sure replacements work at the beginning and end of string
1570 %!assert (regexprep ("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g")
1571 %!assert (regexprep ("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_")
1572 
1573 ## Test options "once" and "ignorecase"
1574 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"),
1575 %! "a_b]c{d}e-f=g")
1576 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"),
1577 %! "a_b_c_d_e_f_g")
1578 
1579 ## Option combinations
1580 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"),
1581 %! "a_b]c{d}e-f=g")
1582 
1583 ## End conditions on replacement
1584 %!assert (regexprep ("abc", "(b)", ".$1"), "a.bc")
1585 %!assert (regexprep ("abc", "(b)", "$1"), "abc")
1586 %!assert (regexprep ("abc", "(b)", "$1."), "ab.c")
1587 %!assert (regexprep ("abc", "(b)", "$1.."), "ab..c")
1588 
1589 ## Test cell array arguments
1590 %!assert (regexprep ("abc", {"b","a"}, "?"), "??c")
1591 %!assert (regexprep ({"abc","cba"}, "b", "?"), {"a?c","c?a"})
1592 %!assert (regexprep ({"abc","cba"}, {"b","a"}, {"?","!"}), {"!?c","c?!"})
1593 
1594 ## Nasty lookbehind expression
1595 %!test
1596 %! warning ("off", "Octave:regexp-lookbehind-limit", "local");
1597 %! assert (regexprep ('x^(-1)+y(-1)+z(-1)=0', '(?<=[a-z]+)\(\-[1-9]*\)',
1598 %! '_minus1'),'x^(-1)+y_minus1+z_minus1=0');
1599 
1600 ## Verify escape sequences in pattern
1601 %!assert (regexprep ("\n", '\n', "X"), "X")
1602 %!assert (regexprep ("\n", "\n", "X"), "X")
1603 
1604 ## Verify NULLs in pattern and replacement string
1605 %!assert (regexprep ("A\0A", "\0", ","), "A,A")
1606 %!assert (regexprep ("A\0A", '\0', ","), "A,A")
1607 %!assert (regexprep ("A,A", "A", "B\0B"), "B\0B,B\0B")
1608 %!assert (regexprep ("A,A", "A", 'B\0B'), "B\0B,B\0B")
1609 
1610 ## Empty matches were broken on ARM architecture
1611 %!test <*52810>
1612 %! assert (strcmp (regexprep ("\nabc", "^(\t*)(abc)$", "$1$2", "lineanchors"),
1613 %! "\nabc"));
1614 */
1615 
1616 OCTAVE_END_NAMESPACE(octave)
charNDArray min(char d, const charNDArray &m)
Definition: chNDArray.cc:207
const dim_vector & dims() const
Return a const-reference so that dims ()(i) works efficiently.
Definition: Array.h:503
octave_idx_type numel() const
Number of elements in the array.
Definition: Array.h:414
Definition: Cell.h:43
Definition: dMatrix.h:42
std::size_t size() const
Definition: base-list.h:52
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:94
octave_idx_type numel(int n=0) const
Number of elements that a matrix with this dimensions would have.
Definition: dim-vector.h:335
void resize(octave_idx_type n, const octave_value &rfv=octave_value())
Definition: ovl.h:117
Cell cell_value() const
Definition: ovl.h:105
octave_idx_type length() const
Definition: ovl.h:113
string_vector named_patterns() const
Definition: lo-regexp.h:213
void dotexceptnewline(bool val)
Definition: lo-regexp.h:140
void lineanchors(bool val)
Definition: lo-regexp.h:143
void case_insensitive(bool val)
Definition: lo-regexp.h:139
void freespacing(bool val)
Definition: lo-regexp.h:142
void emptymatch(bool val)
Definition: lo-regexp.h:141
void once(bool val)
Definition: lo-regexp.h:144
std::string replace(const std::string &buffer, const std::string &replacement) const
Definition: lo-regexp.cc:612
match_data match(const std::string &buffer) const
Definition: lo-regexp.cc:328
octave_idx_type numel() const
Definition: str-vec.h:100
OCTAVE_BEGIN_NAMESPACE(octave) static octave_value daspk_fcn
void print_usage(void)
Definition: defun-int.h:72
#define DEFUN(name, args_name, nargout_name, doc)
Macro to define a builtin function.
Definition: defun.h:56
void warning(const char *fmt,...)
Definition: error.cc:1063
void() error(const char *fmt,...)
Definition: error.cc:988
ColumnVector transform(const Matrix &m, double x, double y, double z)
Definition: graphics.cc:5468
octave_idx_type n
Definition: mx-inlines.cc:761
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:44
return octave_value(v1.char_array_value() . concat(v2.char_array_value(), ra_idx),((a1.is_sq_string()||a2.is_sq_string()) ? '\'' :'"'))
octave_value_list ovl(const OV_Args &... args)
Construct an octave_value_list with less typing.
Definition: ovl.h:219
F77_RET_T len
Definition: xerbla.cc:61