GNU Octave  8.1.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
regexp.cc
Go to the documentation of this file.
1 ////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (C) 2002-2023 The Octave Project Developers
4 //
5 // See the file COPYRIGHT.md in the top-level directory of this
6 // distribution or <https://octave.org/copyright/>.
7 //
8 // This file is part of Octave.
9 //
10 // Octave is free software: you can redistribute it and/or modify it
11 // under the terms of the GNU General Public License as published by
12 // the Free Software Foundation, either version 3 of the License, or
13 // (at your option) any later version.
14 //
15 // Octave is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 // GNU General Public License for more details.
19 //
20 // You should have received a copy of the GNU General Public License
21 // along with Octave; see the file COPYING. If not, see
22 // <https://www.gnu.org/licenses/>.
23 //
24 ////////////////////////////////////////////////////////////////////////
25 
26 #if defined (HAVE_CONFIG_H)
27 # include "config.h"
28 #endif
29 
30 #include <list>
31 #include <sstream>
32 
33 #include "base-list.h"
34 #include "oct-locbuf.h"
35 #include "quit.h"
36 #include "lo-regexp.h"
37 #include "str-vec.h"
38 
39 #include "defun.h"
40 #include "Cell.h"
41 #include "error.h"
42 #include "errwarn.h"
43 #include "oct-map.h"
44 #include "ovl.h"
45 #include "utils.h"
46 
48 
49 // Replace backslash escapes in a string with the real values. We need
50 // two special functions instead of the one in utils.cc because the set
51 // of escape sequences used for regexp patterns and replacement strings
52 // is different from those used in the *printf functions.
53 
54 static std::string
55 do_regexp_ptn_string_escapes (const std::string& s, bool is_sq_str)
56 {
57  std::string retval;
58 
59  std::size_t i = 0;
60  std::size_t j = 0;
61  std::size_t len = s.length ();
62 
63  retval.resize (len);
64 
65  while (j < len)
66  {
67  if (s[j] == '\\' && j+1 < len)
68  {
69  switch (s[++j])
70  {
71  case 'b': // backspace
72  if (is_sq_str)
73  retval[i] = '\b';
74  else
75  {
76  // Pass escape sequence through
77  retval[i] = '\\';
78  retval[++i] = 'b';
79  }
80  break;
81 
82  // Translate < and > to PCRE patterns for pseudo-word boundary
83  case '<': // begin word boundary
84  retval.insert (i, "(?<=\\W|^)");
85  i += 8;
86  break;
87 
88  case '>': // end word boundary
89  retval.insert (i, "(?=\\W|$)");
90  i += 7;
91  break;
92 
93  case 'o': // octal input
94  {
95  bool bad_esc_seq = (j+1 >= len);
96 
97  bool brace = false;
98  if (! bad_esc_seq && s[++j] == '{')
99  {
100  brace = true;
101  j++;
102  }
103 
104  int tmpi = 0;
105  std::size_t k;
106  for (k = j; k < std::min (j+3+brace, len); k++)
107  {
108  int digit = s[k] - '0';
109  if (digit < 0 || digit > 7)
110  break;
111  tmpi <<= 3;
112  tmpi += digit;
113  }
114  if (bad_esc_seq || (brace && s[k++] != '}'))
115  {
116  tmpi = 0;
117  warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
118  }
119  retval[i] = tmpi;
120  j = k - 1;
121  break;
122  }
123 
124  default: // pass escape sequence through
125  retval[i] = '\\';
126  retval[++i] = s[j];
127  break;
128  }
129  }
130  else
131  {
132  retval[i] = s[j];
133  }
134 
135  i++;
136  j++;
137  }
138 
139  retval.resize (i);
140 
141  return retval;
142 }
143 
144 static std::string
145 do_regexp_rep_string_escapes (const std::string& s)
146 {
147  std::string retval;
148 
149  std::size_t i = 0;
150  std::size_t j = 0;
151  std::size_t len = s.length ();
152 
153  retval.resize (len);
154 
155  while (j < len)
156  {
157  if (s[j] == '\\' && j+1 < len)
158  {
159  switch (s[++j])
160  {
161  case 'a': // alarm
162  retval[i] = '\a';
163  break;
164 
165  case 'b': // backspace
166  retval[i] = '\b';
167  break;
168 
169  case 'f': // formfeed
170  retval[i] = '\f';
171  break;
172 
173  case 'n': // newline
174  retval[i] = '\n';
175  break;
176 
177  case 'r': // carriage return
178  retval[i] = '\r';
179  break;
180 
181  case 't': // horizontal tab
182  retval[i] = '\t';
183  break;
184 
185  case 'v': // vertical tab
186  retval[i] = '\v';
187  break;
188 
189  case '0':
190  case '1':
191  case '2':
192  case '3':
193  case '4':
194  case '5':
195  case '6':
196  case '7': // octal input
197  {
198  std::size_t k;
199  int tmpi = s[j] - '0';
200  for (k = j+1; k < std::min (j+3, len); k++)
201  {
202  int digit = s[k] - '0';
203  if (digit < 0 || digit > 7)
204  break;
205  tmpi <<= 3;
206  tmpi += digit;
207  }
208  retval[i] = tmpi;
209  j = k - 1;
210  break;
211  }
212 
213  case 'o': // octal input
214  {
215  bool bad_esc_seq = (j+1 >= len);
216 
217  bool brace = false;
218  if (! bad_esc_seq && s[++j] == '{')
219  {
220  brace = true;
221  j++;
222  }
223 
224  int tmpi = 0;
225  std::size_t k;
226  for (k = j; k < std::min (j+3+brace, len); k++)
227  {
228  int digit = s[k] - '0';
229  if (digit < 0 || digit > 7)
230  break;
231  tmpi <<= 3;
232  tmpi += digit;
233  }
234  if (bad_esc_seq || (brace && s[k++] != '}'))
235  {
236  warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
237  tmpi = 0;
238  }
239  retval[i] = tmpi;
240  j = k - 1;
241  break;
242  }
243 
244  case 'x': // hex input
245  {
246  bool bad_esc_seq = (j+1 >= len);
247 
248  bool brace = false;
249  if (! bad_esc_seq && s[++j] == '{')
250  {
251  brace = true;
252  j++;
253  }
254 
255  int tmpi = 0;
256  std::size_t k;
257  for (k = j; k < std::min (j+2+brace, len); k++)
258  {
259  if (! isxdigit (s[k]))
260  break;
261 
262  tmpi <<= 4;
263  int digit = s[k];
264  if (digit >= 'a')
265  tmpi += digit - 'a' + 10;
266  else if (digit >= 'A')
267  tmpi += digit - 'A' + 10;
268  else
269  tmpi += digit - '0';
270  }
271  if (bad_esc_seq || (brace && s[k++] != '}'))
272  {
273  warning (R"(malformed hex escape sequence '\x' -- converting to '\0')");
274  tmpi = 0;
275  }
276  retval[i] = tmpi;
277  j = k - 1;
278  break;
279  }
280 
281  // Both dollar sign (for capture buffer) and backslash are
282  // passed through with their escape backslash. The processing
283  // for these must occur during the actual replacement operation
284  // in lo-regexp.cc.
285  case '$': // pass dollar sign through with escape
286  retval[i] = '\\'; retval[++i] = '$';
287  break;
288 
289  case '\\': // pass backslash through with escape
290  retval[i] = '\\'; retval[++i] = '\\';
291  break;
292 
293  default: // convert escaped character to unescaped char
294  retval[i] = s[j];
295  break;
296  }
297  }
298  else
299  {
300  retval[i] = s[j];
301  }
302 
303  i++;
304  j++;
305  }
306 
307  retval.resize (i);
308 
309  return retval;
310 }
311 
312 static void
314  const std::string& who, int skip, bool& extra_args)
315 {
316  extra_args = false;
317 
318  for (int i = skip; i < args.length (); i++)
319  {
320  std::string str;
321 
322  str = args(i).xstring_value ("%s: optional arguments must be strings", who.c_str ());
323 
324  std::transform (str.begin (), str.end (), str.begin (), tolower);
325 
326  if (str.find ("once", 0) == 0)
327  options.once (true);
328  else if (str.find ("matchcase", 0) == 0)
329  options.case_insensitive (false);
330  else if (str.find ("ignorecase", 0) == 0)
331  options.case_insensitive (true);
332  else if (str.find ("dotall", 0) == 0)
333  options.dotexceptnewline (false);
334  else if (str.find ("stringanchors", 0) == 0)
335  options.lineanchors (false);
336  else if (str.find ("literalspacing", 0) == 0)
337  options.freespacing (false);
338  else if (str.find ("noemptymatch", 0) == 0)
339  options.emptymatch (false);
340  else if (str.find ("dotexceptnewline", 0) == 0)
341  options.dotexceptnewline (true);
342  else if (str.find ("lineanchors", 0) == 0)
343  options.lineanchors (true);
344  else if (str.find ("freespacing", 0) == 0)
345  options.freespacing (true);
346  else if (str.find ("emptymatch", 0) == 0)
347  options.emptymatch (true);
348  else if (str.find ("start", 0) == 0
349  || str.find ("end", 0) == 0
350  || str.find ("tokenextents", 0) == 0
351  || str.find ("match", 0) == 0
352  || str.find ("tokens", 0) == 0
353  || str.find ("names", 0) == 0
354  || str.find ("split", 0) == 0)
355  extra_args = true;
356  else
357  error ("%s: unrecognized option", who.c_str ());
358  }
359 }
360 
361 static octave_value_list
362 octregexp (const octave_value_list& args, int nargout,
363  const std::string& who, bool case_insensitive = false)
364 {
365  octave_value_list retval;
366 
367  int nargin = args.length ();
368 
369  // Make sure we have string, pattern
370  const std::string buffer = args(0).string_value ();
371 
372  std::string pattern = args(1).string_value ();
373 
374  // Rewrite pattern for PCRE
375  pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());
376 
377  regexp::opts options;
378  options.case_insensitive (case_insensitive);
379  bool extra_options = false;
380  parse_options (options, args, who, 2, extra_options);
381 
382  const regexp::match_data rx_lst
383  = regexp::match (pattern, buffer, options, who);
384 
385  string_vector named_pats = rx_lst.named_patterns ();
386 
387  std::size_t sz = rx_lst.size ();
388 
389  // Converted the linked list in the correct form for the return values
390 
391  octave_map nmap (dim_vector ((sz == 0 ? 0 : 1), sz), named_pats);
392 
393  retval.resize (7);
394 
395  if (sz != 0)
396  {
397  for (int j = 0; j < named_pats.numel (); j++)
398  {
399  Cell ctmp (dim_vector (1, sz));
400  octave_idx_type i = 0;
401 
402  for (const auto& match_data : rx_lst)
403  {
404  string_vector named_tokens = match_data.named_tokens ();
405 
406  ctmp(i++) = named_tokens(j);
407  }
408 
409  nmap.assign (named_pats(j), ctmp);
410  }
411  }
412  retval(5) = nmap;
413 
414  if (options.once ())
415  {
416  auto p = rx_lst.begin ();
417 
418  retval(4) = (sz ? p->tokens () : Cell ());
419  retval(3) = (sz ? p->match_string () : "");
420  retval(2) = (sz ? p->token_extents () : Matrix ());
421 
422  if (sz)
423  {
424  double start = p->start ();
425  double end = p->end ();
426 
427  Cell split (dim_vector (1, 2));
428  split(0) = buffer.substr (0, start-1);
429  split(1) = buffer.substr (end);
430 
431  retval(6) = split;
432  retval(1) = end;
433  retval(0) = start;
434  }
435  else
436  {
437  retval(6) = buffer;
438  retval(1) = Matrix ();
439  retval(0) = Matrix ();
440  }
441  }
442  else
443  {
444  Cell tokens (dim_vector (1, sz));
445  Cell match_string (dim_vector (1, sz));
446  Cell token_extents (dim_vector (1, sz));
447  NDArray end (dim_vector (1, sz));
448  NDArray start (dim_vector (1, sz));
449  Cell split (dim_vector (1, sz+1));
450  std::size_t sp_start = 0;
451 
452  octave_idx_type i = 0;
453  for (const auto& match_data : rx_lst)
454  {
455  double s = match_data.start ();
456  double e = match_data.end ();
457 
458  string_vector tmp = match_data.tokens ();
459  tokens(i) = Cell (dim_vector (1, tmp.numel ()), tmp);
460  match_string(i) = match_data.match_string ();
461  token_extents(i) = match_data.token_extents ();
462  end(i) = e;
463  start(i) = s;
464  split(i) = buffer.substr (sp_start, s-sp_start-1);
465  sp_start = e;
466  i++;
467  }
468 
469  split(i) = buffer.substr (sp_start);
470 
471  retval(6) = split;
472  retval(4) = tokens;
473  retval(3) = match_string;
474  retval(2) = token_extents;
475  retval(1) = end;
476  retval(0) = start;
477  }
478 
479  // Alter the order of the output arguments
480 
481  if (extra_options)
482  {
483  int n = 0;
484  octave_value_list new_retval;
485  new_retval.resize (nargout);
486 
487  bool arg_used[7] {};
488 
489  for (int j = 2; j < nargin; j++)
490  {
491  int k = 0;
492  std::string str = args(j).string_value ();
493  std::transform (str.begin (), str.end (), str.begin (), tolower);
494 
495  if (str.find ("once", 0) == 0
496  || str.find ("stringanchors", 0) == 0
497  || str.find ("lineanchors", 0) == 0
498  || str.find ("matchcase", 0) == 0
499  || str.find ("ignorecase", 0) == 0
500  || str.find ("dotall", 0) == 0
501  || str.find ("dotexceptnewline", 0) == 0
502  || str.find ("literalspacing", 0) == 0
503  || str.find ("freespacing", 0) == 0
504  || str.find ("noemptymatch", 0) == 0
505  || str.find ("emptymatch", 0) == 0)
506  continue;
507  else if (str.find ("start", 0) == 0)
508  k = 0;
509  else if (str.find ("end", 0) == 0)
510  k = 1;
511  else if (str.find ("tokenextents", 0) == 0)
512  k = 2;
513  else if (str.find ("match", 0) == 0)
514  k = 3;
515  else if (str.find ("tokens", 0) == 0)
516  k = 4;
517  else if (str.find ("names", 0) == 0)
518  k = 5;
519  else if (str.find ("split", 0) == 0)
520  k = 6;
521 
522  new_retval(n++) = retval(k);
523  arg_used[k] = true;
524 
525  if (n == nargout)
526  break;
527  }
528 
529  // Fill in the rest of the arguments
530  if (n < nargout)
531  {
532  for (int j = 0; j < 7; j++)
533  {
534  if (! arg_used[j])
535  new_retval(n++) = retval(j);
536  }
537  }
538 
539  retval = new_retval;
540  }
541 
542  return retval;
543 }
544 
545 static octave_value_list
546 octcellregexp (const octave_value_list& args, int nargout,
547  const std::string& who, bool case_insensitive = false)
548 {
549  octave_value_list retval;
550 
551  if (args(0).iscell ())
552  {
553  OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
554  octave_value_list new_args = args;
555  Cell cellstr = args(0).cell_value ();
556  if (args(1).iscell ())
557  {
558  Cell cellpat = args(1).cell_value ();
559 
560  if (cellpat.numel () == 1)
561  {
562  for (int j = 0; j < nargout; j++)
563  newretval[j].resize (cellstr.dims ());
564 
565  new_args(1) = cellpat(0);
566 
567  for (octave_idx_type i = 0; i < cellstr.numel (); i++)
568  {
569  new_args(0) = cellstr(i);
570  octave_value_list tmp = octregexp (new_args, nargout, who,
571  case_insensitive);
572 
573  for (int j = 0; j < nargout; j++)
574  newretval[j](i) = tmp(j);
575  }
576  }
577  else if (cellstr.numel () == 1)
578  {
579  for (int j = 0; j < nargout; j++)
580  newretval[j].resize (cellpat.dims ());
581 
582  new_args(0) = cellstr(0);
583 
584  for (octave_idx_type i = 0; i < cellpat.numel (); i++)
585  {
586  new_args(1) = cellpat(i);
587  octave_value_list tmp = octregexp (new_args, nargout, who,
588  case_insensitive);
589 
590  for (int j = 0; j < nargout; j++)
591  newretval[j](i) = tmp(j);
592  }
593  }
594  else if (cellstr.numel () == cellpat.numel ())
595  {
596  if (cellstr.dims () != cellpat.dims ())
597  error ("%s: inconsistent cell array dimensions", who.c_str ());
598 
599  for (int j = 0; j < nargout; j++)
600  newretval[j].resize (cellstr.dims ());
601 
602  for (octave_idx_type i = 0; i < cellstr.numel (); i++)
603  {
604  new_args(0) = cellstr(i);
605  new_args(1) = cellpat(i);
606 
607  octave_value_list tmp = octregexp (new_args, nargout, who,
608  case_insensitive);
609 
610  for (int j = 0; j < nargout; j++)
611  newretval[j](i) = tmp(j);
612  }
613  }
614  else
615  error ("regexp: cell array arguments must be scalar or equal size");
616  }
617  else
618  {
619  for (int j = 0; j < nargout; j++)
620  newretval[j].resize (cellstr.dims ());
621 
622  for (octave_idx_type i = 0; i < cellstr.numel (); i++)
623  {
624  new_args(0) = cellstr(i);
625  octave_value_list tmp = octregexp (new_args, nargout, who,
626  case_insensitive);
627 
628  for (int j = 0; j < nargout; j++)
629  newretval[j](i) = tmp(j);
630  }
631  }
632 
633  for (int j = 0; j < nargout; j++)
634  retval(j) = octave_value (newretval[j]);
635  }
636  else if (args(1).iscell ())
637  {
638  OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
639  octave_value_list new_args = args;
640  Cell cellpat = args(1).cell_value ();
641 
642  for (int j = 0; j < nargout; j++)
643  newretval[j].resize (cellpat.dims ());
644 
645  for (octave_idx_type i = 0; i < cellpat.numel (); i++)
646  {
647  new_args(1) = cellpat(i);
648  octave_value_list tmp = octregexp (new_args, nargout, who,
649  case_insensitive);
650 
651  for (int j = 0; j < nargout; j++)
652  newretval[j](i) = tmp(j);
653  }
654 
655  for (int j = 0; j < nargout; j++)
656  retval(j) = octave_value (newretval[j]);
657  }
658  else
659  retval = octregexp (args, nargout, who, case_insensitive);
660 
661  return retval;
662 
663 }
664 
665 DEFUN (regexp, args, nargout,
666  doc: /* -*- texinfo -*-
667 @deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexp (@var{str}, @var{pat})
668 @deftypefnx {} {[@dots{}] =} regexp (@var{str}, @var{pat}, "@var{opt1}", @dots{})
669 Regular expression string matching.
670 
671 Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
672 substrings of any matches, or empty values if there are none.
673 
674 The matched pattern @var{pat} can include any of the standard regex
675 operators, including:
676 
677 @table @code
678 @item .
679 Match any character
680 
681 @item * + ? @{@}
682 Repetition operators, representing
683 
684 @table @code
685 @item *
686 Match zero or more times
687 
688 @item +
689 Match one or more times
690 
691 @item ?
692 Match zero or one times
693 
694 @item @{@var{n}@}
695 Match exactly @var{n} times
696 
697 @item @{@var{n},@}
698 Match @var{n} or more times
699 
700 @item @{@var{m},@var{n}@}
701 Match between @var{m} and @var{n} times
702 @end table
703 
704 @item [@dots{}] [^@dots{}]
705 
706 List operators. The pattern will match any character listed between
707 @qcode{"["} and @qcode{"]"}. If the first character is @qcode{"^"} then the
708 pattern is inverted and any character except those listed between brackets
709 will match.
710 
711 Escape sequences defined below can also be used inside list operators. For
712 example, a template for a floating point number might be @code{[-+.\d]+}.
713 
714 @item () (?:)
715 Grouping operator. The first form, parentheses only, also creates a token.
716 
717 @item |
718 Alternation operator. Match one of a choice of regular expressions. The
719 alternatives must be delimited by the grouping operator @code{()} above.
720 
721 @item ^ $
722 Anchoring operators. Requires pattern to occur at the start (@code{^}) or
723 end (@code{$}) of the string.
724 @end table
725 
726 In addition, the following escaped characters have special meaning.
727 
728 @table @code
729 
730 @item \d
731 Match any digit
732 
733 @item \D
734 Match any non-digit
735 
736 @item \s
737 Match any whitespace character
738 
739 @item \S
740 Match any non-whitespace character
741 
742 @item \w
743 Match any word character
744 
745 @item \W
746 Match any non-word character
747 
748 @item <
749 Match the beginning of a word
750 
751 @item >
752 Match the end of a word
753 
754 @item \B
755 Match within a word
756 @end table
757 
758 Implementation Note: For compatibility with @sc{matlab}, escape sequences
759 in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded
760 even when @var{pat} has been defined with single quotes. To disable
761 expansion use a second backslash before the escape sequence (e.g.,
762 "@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate}
763 function.
764 
765 The outputs of @code{regexp} default to the order given below
766 
767 @table @var
768 @item s
769 The start indices of each matching substring
770 
771 @item e
772 The end indices of each matching substring
773 
774 @item te
775 The extents of each matched token surrounded by @code{(@dots{})} in
776 @var{pat}
777 
778 @item m
779 A cell array of the text of each match
780 
781 @item t
782 A cell array of the text of each token matched
783 
784 @item nm
785 A structure containing the text of each matched named token, with the name
786 being used as the fieldname. A named token is denoted by
787 @code{(?<name>@dots{})}.
788 
789 @item sp
790 A cell array of the text not returned by match, i.e., what remains if you
791 split the string based on @var{pat}.
792 @end table
793 
794 Particular output arguments, or the order of the output arguments, can be
795 selected by additional @var{opt} arguments. These are strings and the
796 correspondence between the output arguments and the optional argument
797 are
798 
799 @multitable @columnfractions 0.2 0.3 0.3 0.2
800 @item @tab @qcode{'start'} @tab @var{s} @tab
801 @item @tab @qcode{'end'} @tab @var{e} @tab
802 @item @tab @qcode{'tokenExtents'} @tab @var{te} @tab
803 @item @tab @qcode{'match'} @tab @var{m} @tab
804 @item @tab @qcode{'tokens'} @tab @var{t} @tab
805 @item @tab @qcode{'names'} @tab @var{nm} @tab
806 @item @tab @qcode{'split'} @tab @var{sp} @tab
807 @end multitable
808 
809 Additional arguments are summarized below.
810 
811 @table @samp
812 @item once
813 Return only the first occurrence of the pattern.
814 
815 @item matchcase
816 Make the matching case sensitive. (default)
817 
818 Alternatively, use (?-i) in the pattern.
819 
820 @item ignorecase
821 Ignore case when matching the pattern to the string.
822 
823 Alternatively, use (?i) in the pattern.
824 
825 @item stringanchors
826 Match the anchor characters at the beginning and end of the string.
827 (default)
828 
829 Alternatively, use (?-m) in the pattern.
830 
831 @item lineanchors
832 Match the anchor characters at the beginning and end of the line.
833 
834 Alternatively, use (?m) in the pattern.
835 
836 @item dotall
837 The pattern @code{.} matches all characters including the newline character.
838  (default)
839 
840 Alternatively, use (?s) in the pattern.
841 
842 @item dotexceptnewline
843 The pattern @code{.} matches all characters except the newline character.
844 
845 Alternatively, use (?-s) in the pattern.
846 
847 @item literalspacing
848 All characters in the pattern, including whitespace, are significant and are
849 used in pattern matching. (default)
850 
851 Alternatively, use (?-x) in the pattern.
852 
853 @item freespacing
854 The pattern may include arbitrary whitespace and also comments beginning
855 with the character @samp{#}.
856 
857 Alternatively, use (?x) in the pattern.
858 
859 @item noemptymatch
860 Zero-length matches are not returned. (default)
861 
862 @item emptymatch
863 Return zero-length matches.
864 
865 @code{regexp ('a', 'b*', 'emptymatch')} returns @code{[1 2]} because there
866 are zero or more @qcode{'b'} characters at positions 1 and end-of-string.
867 
868 @end table
869 
870 Stack Limitation Note: Pattern searches are done with a recursive function
871 which can overflow the program stack when there are a high number of matches.
872 For example,
873 
874 @example
875 @code{regexp (repmat ('a', 1, 1e5), '(a)+')}
876 @end example
877 
878 @noindent
879 may lead to a segfault. As an alternative, consider constructing pattern
880 searches that reduce the number of matches (e.g., by creatively using set
881 complement), and then further processing the return variables (now reduced in
882 size) with successive @code{regexp} searches.
883 @seealso{regexpi, strfind, regexprep}
884 @end deftypefn */)
885 {
886  if (args.length () < 2)
887  print_usage ();
888 
889  octave_value_list retval;
890 
891  if (args(0).iscell () || args(1).iscell ())
892  retval = (octcellregexp (args, (nargout > 0 ? nargout : 1), "regexp"));
893  else
894  retval = octregexp (args, nargout, "regexp");
895 
896  return retval;
897 }
898 
899 /*
900 ## PCRE_ERROR_MATCHLIMIT test
901 %!test
902 %! s = sprintf ('\t4\n0000\t-0.00\t-0.0000\t4\t-0.00\t-0.0000\t4\n0000\t-0.00\t-0.0000\t0\t-0.00\t-');
903 %! ws = warning ("query");
904 %! unwind_protect
905 %! warning ("off");
906 %! regexp (s, '(\s*-*\d+[.]*\d*\s*)+\n');
907 %! unwind_protect_cleanup
908 %! warning (ws);
909 %! end_unwind_protect
910 
911 ## segfault test
912 %!assert (regexp ("abcde", "."), [1,2,3,4,5])
913 %!assert <*62704> (regexpi('(', '\(?'), 1)
914 ## Infinite loop test
915 %!assert (isempty (regexp ("abcde", "")))
916 
917 ## Check that anchoring of pattern works correctly
918 %!assert (regexp ('abcabc', '^abc'), 1)
919 %!assert (regexp ('abcabc', 'abc$'), 4)
920 %!assert (regexp ('abcabc', '^abc$'), zeros (1,0))
921 
922 ## UTF-8 test with character vector "âé🙂ïõù"
923 %!assert (regexp (char ([195, 162, 195, 169, 240, 159, 153, 130, 195, 175, ...
924 %! 195, 181, 195, 185]), "."), [1, 3, 5, 9, 11, 13])
925 
926 %!test
927 %! [s, e, te, m, t] = regexp (' No Match ', 'f(.*)uck');
928 %! assert (s, zeros (1,0));
929 %! assert (e, zeros (1,0));
930 %! assert (te, cell (1,0));
931 %! assert (m, cell (1,0));
932 %! assert (t, cell (1,0));
933 
934 %!test
935 %! [s, e, te, m, t] = regexp (' FiRetrUck ', 'f(.*)uck');
936 %! assert (s, zeros (1,0));
937 %! assert (e, zeros (1,0));
938 %! assert (te, cell (1,0));
939 %! assert (m, cell (1,0));
940 %! assert (t, cell (1,0));
941 
942 %!test
943 %! [s, e, te, m, t] = regexp (' firetruck ', 'f(.*)uck');
944 %! assert (s, 2);
945 %! assert (e, 10);
946 %! assert (te{1}, [3, 7]);
947 %! assert (m{1}, 'firetruck');
948 %! assert (t{1}{1}, 'iretr');
949 
950 %!test
951 %! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*');
952 %! assert (s, [1, 12]);
953 %! assert (e, [5, 17]);
954 %! assert (size (te), [1, 2]);
955 %! assert (isempty (te{1}));
956 %! assert (isempty (te{2}));
957 %! assert (m{1}, 'short');
958 %! assert (m{2}, 'string');
959 %! assert (size (t), [1, 2]);
960 %! assert (isempty (t{1}));
961 %! assert (isempty (t{2}));
962 
963 %!test
964 %! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*', 'once');
965 %! assert (s, 1);
966 %! assert (e, 5);
967 %! assert (isempty (te));
968 %! assert (m, 'short');
969 %! assert (isempty (t));
970 
971 %!test
972 %! [m, te, e, s, t] = regexp ('short test string', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
973 %! assert (s, 1);
974 %! assert (e, 5);
975 %! assert (isempty (te));
976 %! assert (m, 'short');
977 %! assert (isempty (t));
978 
979 %!test
980 %! [s, e, te, m, t, nm] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)');
981 %! assert (s, 1);
982 %! assert (e, 10);
983 %! assert (size (te), [1, 1]);
984 %! assert (te{1}, [1,5; 7,10]);
985 %! assert (m{1}, 'short test');
986 %! assert (size (t), [1, 1]);
987 %! assert (t{1}{1}, 'short');
988 %! assert (t{1}{2}, 'test');
989 %! assert (size (nm), [1, 1]);
990 %! assert (! isempty (fieldnames (nm)));
991 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
992 %! assert (nm.word1, 'short');
993 %! assert (nm.word2, 'test');
994 
995 %!test
996 %! [nm, m, te, e, s, t] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
997 %! assert (s, 1);
998 %! assert (e, 10);
999 %! assert (size (te), [1, 1]);
1000 %! assert (te{1}, [1,5; 7,10]);
1001 %! assert (m{1}, 'short test');
1002 %! assert (size (t), [1, 1]);
1003 %! assert (t{1}{1}, 'short');
1004 %! assert (t{1}{2}, 'test');
1005 %! assert (size (nm), [1, 1]);
1006 %! assert (! isempty (fieldnames (nm)));
1007 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
1008 %! assert (nm.word1, 'short');
1009 %! assert (nm.word2, 'test');
1010 
1011 %!test
1012 %! [t, nm] = regexp ("John Davis\nRogers, James", '(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)', 'tokens', 'names');
1013 %! assert (size (t), [1, 2]);
1014 %! assert (t{1}{1}, "John");
1015 %! assert (t{1}{2}, "Davis");
1016 %! assert (t{2}{1}, "Rogers");
1017 %! assert (t{2}{2}, "James");
1018 %! assert (size (nm), [1, 2]);
1019 %! assert (nm(1).first, "John");
1020 %! assert (nm(1).last, "Davis");
1021 %! assert (nm(2).first, "James");
1022 %! assert (nm(2).last, "Rogers");
1023 
1024 ## Tests for nulls in strings properly matching
1025 %!test
1026 %! str = "A\0B\0\0C";
1027 %! ptn = '(\0+)'; # also test null in single-quote pattern
1028 %! M = regexp (str, ptn, "match");
1029 %! assert (size (M), [1, 2]);
1030 %! assert (double (M{1}), [0]);
1031 %! assert (double (M{2}), [0, 0]);
1032 
1033 %!test
1034 %! str = "A\0B\0\0C";
1035 %! ptn = "(\0+)"; # also test null in double-quote pattern
1036 %! T = regexp (str, ptn, "tokens");
1037 %! assert (size (T), [1, 2]);
1038 %! assert (double (T{1}{1}), [0]);
1039 %! assert (double (T{2}{1}), [0, 0]);
1040 
1041 %!test
1042 %! str = "A\0B\0\0C";
1043 %! ptn = '(?<namedtoken>\0+)';
1044 %! NT = regexp (str, ptn, "names");
1045 %! assert (size (NT), [1, 2]);
1046 %! assert (double (NT(1).namedtoken), [0]);
1047 %! assert (double (NT(2).namedtoken), [0, 0]);
1048 
1049 ## Tests for named tokens
1050 %!test
1051 %! ## Parenthesis in named token (ie (int)) causes a problem
1052 %! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'),
1053 %! struct ('typestr', 'int'));
1054 
1055 %!test <*35683>
1056 %! ## Mix of named and unnamed tokens can cause segfault
1057 %! str = "abcde";
1058 %! ptn = '(?<T1>a)(\w+)(?<T2>d\w+)';
1059 %! tokens = regexp (str, ptn, "names");
1060 %! assert (isstruct (tokens) && numel (tokens) == 1);
1061 %! assert (tokens.T1, "a");
1062 %! assert (tokens.T2, "de");
1063 
1064 ## Test options to regexp
1065 %!assert (regexp ("abc\nabc", '.'), [1:7])
1066 %!assert (regexp ("abc\nabc", '.', 'dotall'), [1:7])
1067 %!test
1068 %! assert (regexp ("abc\nabc", '(?s).'), [1:7]);
1069 %! assert (regexp ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
1070 %! assert (regexp ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);
1071 
1072 %!assert (regexp ("caseCaSe", 'case'), 1)
1073 %!assert (regexp ("caseCaSe", 'case', "matchcase"), 1)
1074 %!assert (regexp ("caseCaSe", 'case', "ignorecase"), [1,5])
1075 %!test
1076 %! assert (regexp ("caseCaSe", '(?-i)case'), 1);
1077 %! assert (regexp ("caseCaSe", '(?i)case'), [1, 5]);
1078 
1079 %!assert (regexp ("abc\nabc", 'c$'), 7)
1080 %!assert (regexp ("abc\nabc", 'c$', "stringanchors"), 7)
1081 %!test
1082 %! assert (regexp ("abc\nabc", '(?-m)c$'), 7);
1083 %! assert (regexp ("abc\nabc", 'c$',"lineanchors"), [3, 7]);
1084 %! assert (regexp ("abc\nabc", '(?m)c$'), [3,7]);
1085 
1086 %!assert (regexp ("this word", 's w'), 4)
1087 %!assert (regexp ("this word", 's w', 'literalspacing'), 4)
1088 %!test
1089 %! assert (regexp ("this word", '(?-x)s w', 'literalspacing'), 4);
1090 %! assert (regexp ("this word", 's w', 'freespacing'), zeros (1,0));
1091 %! assert (regexp ("this word", '(?x)s w'), zeros (1,0));
1092 
1093 %!test
1094 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'noemptymatch');
1095 %! assert (s, [1 5]);
1096 %! assert (e, [3 5]);
1097 %! assert (te, { zeros(0,2), zeros(0,2) });
1098 %! assert (m, { "OCT", "V" });
1099 %! assert (t, { cell(1,0), cell(1,0) });
1100 %! assert (isempty (fieldnames (nm)));
1101 %! assert (sp, { "", "A", "E" });
1102 
1103 %!test
1104 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'noemptymatch');
1105 %! assert (s, [1 5]);
1106 %! assert (e, [3 5]);
1107 %! assert (te, { [1 3], [5 5] });
1108 %! assert (m, { "OCT", "V" });
1109 %! assert (t, { {"OCT"}, {"V"} });
1110 %! assert (isempty (fieldnames (nm)));
1111 %! assert (sp, { "", "A", "E" });
1112 
1113 %!test
1114 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'emptymatch');
1115 %! assert (s, [1 4 5 6 7]);
1116 %! assert (e, [3 3 5 5 6]);
1117 %! assert (te, repmat ({zeros(0,2)}, [1, 5]));
1118 %! assert (m, { "OCT", "", "V", "", "" });
1119 %! assert (t, repmat({cell(1,0)}, [1, 5]));
1120 %! assert (isempty (fieldnames (nm)));
1121 %! assert (sp, { "", "", "A", "", "E", "" });
1122 
1123 %!test
1124 %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'emptymatch');
1125 %! assert (s, [1 4 5 6 7]);
1126 %! assert (e, [3 3 5 5 6]);
1127 %! assert (te, { [1 3], [4 3], [5 5], [6 5], [7 6] });
1128 %! assert (m, { "OCT", "", "V", "", "" });
1129 %! assert (t, { {"OCT"}, {""}, {"V"}, {""}, {""} });
1130 %! assert (isempty (fieldnames (nm)));
1131 %! assert (sp, { "", "", "A", "", "E", "" });
1132 
1133 %!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'),
1134 %! {6;[1,5,9];zeros(1,0)})
1135 %!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}),
1136 %! {6;[3,7];[1,9]})
1137 %!assert (regexp ('Strings', {'t','s'}), {2, 7})
1138 
1139 ## Test case for lookaround operators
1140 %!test
1141 %! assert (regexp ('Iraq', 'q(?!u)'), 4);
1142 %! assert (regexp ('quit', 'q(?!u)'), zeros (1, 0));
1143 %! assert (regexp ('quit', 'q(?=u)' , 'match'), {'q'});
1144 %! assert (regexp ("quit", 'q(?=u+)', 'match'), {'q'});
1145 %! assert (regexp ("qit", 'q(?=u+)', 'match'), cell (1, 0));
1146 %! assert (regexp ("qit", 'q(?=u*)', 'match'), {'q'});
1147 %! assert (regexp ('thingamabob', '(?<=a)b'), 9);
1148 
1149 ## Tests for split option.
1150 %!shared str
1151 %! str = "foo bar foo";
1152 %!test
1153 %! [a, b] = regexp (str, "f..", "match", "split");
1154 %! assert (a, {"foo", "foo"});
1155 %! assert (b, {"", " bar ", ""});
1156 %!test
1157 %! [a, b] = regexp (str, "f..", "match", "split", "once");
1158 %! assert (a, "foo");
1159 %! assert (b, {"", " bar foo"});
1160 %!test
1161 %! [a, b] = regexp (str, "fx.", "match", "split");
1162 %! assert (a, cell (1, 0));
1163 %! assert (b, {"foo bar foo"});
1164 %!test
1165 %! [a, b] = regexp (str, "fx.", "match", "split", "once");
1166 %! assert (a, "");
1167 %! assert (b, "foo bar foo");
1168 
1169 %!shared str
1170 %! str = "foo bar";
1171 %!test
1172 %! [a, b] = regexp (str, "f..", "match", "split");
1173 %! assert (a, {"foo"});
1174 %! assert (b, {"", " bar"});
1175 %!test
1176 %! [a, b] = regexp (str, "b..", "match", "split");
1177 %! assert (a, {"bar"});
1178 %! assert (b, {"foo ", ""});
1179 %!test
1180 %! [a, b] = regexp (str, "x", "match", "split");
1181 %! assert (a, cell (1, 0));
1182 %! assert (b, {"foo bar"});
1183 %!test
1184 %! [a, b] = regexp (str, "[o]+", "match", "split");
1185 %! assert (a, {"oo"});
1186 %! assert (b, {"f", " bar"});
1187 
1188 ## Test escape sequences are expanded even in single-quoted strings
1189 %!assert (regexp ("\n", '\n'), 1)
1190 %!assert (regexp ("\n", "\n"), 1)
1191 
1192 ## Test escape sequences are silently converted
1193 %!test <*45407>
1194 %! assert (regexprep ('s', 's', 'x\.y'), 'x.y');
1195 %! assert (regexprep ('s', '(s)', 'x\$1y'), 'x$1y');
1196 %! assert (regexprep ('s', '(s)', 'x\\$1y'), 'x\sy');
1197 
1198 ## Test start-of-word / end-of-word patterns for Matlab compatibility
1199 %!test <*59992>
1200 %! assert (regexp ('foo!+bar', '<\w'), [1, 6]);
1201 %! assert (regexp ('foo!+bar', '.>'), [3, 4, 8]);
1202 %! assert (regexp ('foo!+bar\nbar!+foo', '.>'), [3, 4, 8, 13, 14, 18]);
1203 %! assert (regexp ('foo!+bar\nbar!+foo', '<\w'), [1, 6, 10, 16]);
1204 
1205 ## Test "incomplete" named patterns
1206 %!assert <*62705> (regexpi ('<', '\(?<'), 1)
1207 %!assert <*62705> (regexpi ('<n>', '\(?<n>'), 1)
1208 %!assert <*62705> (regexpi ('<n>', '\(?<n>\)?'), 1)
1209 %!assert <62705> (regexpi ('<n>a', '\(?<n>a\)?'), 1)
1210 
1211 ## Test input validation
1212 %!error regexp ('string', 'tri', 'BadArg')
1213 %!error regexp ('string')
1214 
1215 */
1216 
1217 DEFUN (regexpi, args, nargout,
1218  doc: /* -*- texinfo -*-
1219 @deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexpi (@var{str}, @var{pat})
1220 @deftypefnx {} {[@dots{}] =} regexpi (@var{str}, @var{pat}, "@var{opt1}", @dots{})
1221 
1222 Case insensitive regular expression string matching.
1223 
1224 Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
1225 substrings of any matches, or empty values if there are none.
1226 @xref{XREFregexp,,@code{regexp}}, for details on the syntax of the search
1227 pattern.
1228 @seealso{regexp}
1229 @end deftypefn */)
1230 {
1231  if (args.length () < 2)
1232  print_usage ();
1233 
1234  if (args(0).iscell () || args(1).iscell ())
1235  return octcellregexp (args, (nargout > 0 ? nargout : 1), "regexpi", true);
1236  else
1237  return octregexp (args, nargout, "regexpi", true);
1238 }
1239 
1240 /*
1241 ## segfault test
1242 %!assert (regexpi ("abcde", "."), [1,2,3,4,5])
1243 
1244 ## Check that anchoring of pattern works correctly
1245 %!assert (regexpi ('abcabc', '^ABC'), 1)
1246 %!assert (regexpi ('abcabc', 'ABC$'), 4)
1247 %!assert (regexpi ('abcabc', '^ABC$'), zeros (1,0))
1248 
1249 %!test
1250 %! [s, e, te, m, t] = regexpi (' No Match ', 'f(.*)uck');
1251 %! assert (s, zeros (1,0));
1252 %! assert (e, zeros (1,0));
1253 %! assert (te, cell (1,0));
1254 %! assert (m, cell (1,0));
1255 %! assert (t, cell (1,0));
1256 
1257 %!test
1258 %! [s, e, te, m, t] = regexpi (' FiRetrUck ', 'f(.*)uck');
1259 %! assert (s, 2);
1260 %! assert (e, 10);
1261 %! assert (te{1}, [3, 7]);
1262 %! assert (m{1}, 'FiRetrUck');
1263 %! assert (t{1}{1}, 'iRetr');
1264 
1265 %!test
1266 %! [s, e, te, m, t] = regexpi (' firetruck ', 'f(.*)uck');
1267 %! assert (s, 2);
1268 %! assert (e, 10);
1269 %! assert (te{1}, [3, 7]);
1270 %! assert (m{1}, 'firetruck');
1271 %! assert (t{1}{1}, 'iretr');
1272 
1273 %!test
1274 %! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*');
1275 %! assert (s, [1, 12]);
1276 %! assert (e, [5, 17]);
1277 %! assert (size (te), [1, 2]);
1278 %! assert (isempty (te{1}));
1279 %! assert (isempty (te{2}));
1280 %! assert (m{1}, 'ShoRt');
1281 %! assert (m{2}, 'String');
1282 %! assert (size (t), [1, 2]);
1283 %! assert (isempty (t{1}));
1284 %! assert (isempty (t{2}));
1285 
1286 %!test
1287 %! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once');
1288 %! assert (s, 1);
1289 %! assert (e, 5);
1290 %! assert (isempty (te));
1291 %! assert (m, 'ShoRt');
1292 %! assert (isempty (t));
1293 
1294 %!test
1295 %! [m, te, e, s, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1296 %! assert (s, 1);
1297 %! assert (e, 5);
1298 %! assert (isempty (te));
1299 %! assert (m, 'ShoRt');
1300 %! assert (isempty (t));
1301 
1302 %!test
1303 %! [s, e, te, m, t, nm] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)');
1304 %! assert (s, 1);
1305 %! assert (e, 10);
1306 %! assert (size (te), [1, 1]);
1307 %! assert (te{1}, [1,5; 7,10]);
1308 %! assert (m{1}, 'ShoRt Test');
1309 %! assert (size (t), [1, 1]);
1310 %! assert (t{1}{1}, 'ShoRt');
1311 %! assert (t{1}{2}, 'Test');
1312 %! assert (size (nm), [1, 1]);
1313 %! assert (! isempty (fieldnames (nm)));
1314 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
1315 %! assert (nm.word1, 'ShoRt');
1316 %! assert (nm.word2, 'Test');
1317 
1318 %!test
1319 %! [nm, m, te, e, s, t] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1320 %! assert (s, 1);
1321 %! assert (e, 10);
1322 %! assert (size (te), [1, 1]);
1323 %! assert (te{1}, [1,5; 7,10]);
1324 %! assert (m{1}, 'ShoRt Test');
1325 %! assert (size (t), [1, 1]);
1326 %! assert (t{1}{1}, 'ShoRt');
1327 %! assert (t{1}{2}, 'Test');
1328 %! assert (size (nm), [1, 1]);
1329 %! assert (! isempty (fieldnames (nm)));
1330 %! assert (sort (fieldnames (nm)), {'word1';'word2'});
1331 %! assert (nm.word1, 'ShoRt');
1332 %! assert (nm.word2, 'Test');
1333 
1334 %!assert (regexpi ("abc\nabc", '.'), [1:7])
1335 %!assert (regexpi ("abc\nabc", '.', 'dotall'), [1:7])
1336 %!test
1337 %! assert (regexpi ("abc\nabc", '(?s).'), [1:7]);
1338 %! assert (regexpi ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
1339 %! assert (regexpi ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);
1340 
1341 %!assert (regexpi ("caseCaSe", 'case'), [1, 5])
1342 %!assert (regexpi ("caseCaSe", 'case', "matchcase"), 1)
1343 %!assert (regexpi ("caseCaSe", 'case', "ignorecase"), [1, 5])
1344 %!test
1345 %! assert (regexpi ("caseCaSe", '(?-i)case'), 1);
1346 %! assert (regexpi ("caseCaSe", '(?i)case'), [1, 5]);
1347 
1348 %!assert (regexpi ("abc\nabc", 'C$'), 7)
1349 %!assert (regexpi ("abc\nabc", 'C$', "stringanchors"), 7)
1350 %!test
1351 %! assert (regexpi ("abc\nabc", '(?-m)C$'), 7);
1352 %! assert (regexpi ("abc\nabc", 'C$', "lineanchors"), [3, 7]);
1353 %! assert (regexpi ("abc\nabc", '(?m)C$'), [3, 7]);
1354 
1355 %!assert (regexpi ("this word", 'S w'), 4)
1356 %!assert (regexpi ("this word", 'S w', 'literalspacing'), 4)
1357 %!test
1358 %! assert (regexpi ("this word", '(?-x)S w', 'literalspacing'), 4);
1359 %! assert (regexpi ("this word", 'S w', 'freespacing'), zeros (1,0));
1360 %! assert (regexpi ("this word", '(?x)S w'), zeros (1,0));
1361 
1362 %!error regexpi ('string', 'tri', 'BadArg')
1363 %!error regexpi ('string')
1364 
1365 %!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'),
1366 %! {6;[1,5,9];zeros(1, 0)})
1367 %!assert (regexpi ({'asdfg-dfd', '-dfd-dfd-', 'qasfdfdaq'}, '-'),
1368 %! {6, [1,5,9], zeros(1,0)})
1369 %!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}),
1370 %! {6;[3,7];[1,9]})
1371 %!assert (regexpi ('Strings', {'t', 's'}), {2, [1, 7]})
1372 
1373 %!assert (regexpi ("\n", '\n'), 1)
1374 %!assert (regexpi ("\n", "\n"), 1)
1375 */
1376 
1377 static octave_value
1378 octregexprep (const octave_value_list& args, const std::string& who)
1379 {
1380  int nargin = args.length ();
1381 
1382  // Make sure we have string, pattern, replacement
1383  const std::string buffer = args(0).string_value ();
1384 
1385  std::string pattern = args(1).string_value ();
1386 
1387  // Rewrite pattern for PCRE
1388  pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());
1389 
1390  std::string replacement = args(2).string_value ();
1391 
1392  // Matlab compatibility.
1393  if (args(2).is_sq_string ())
1394  replacement = do_regexp_rep_string_escapes (replacement);
1395 
1396  // Pack options excluding 'tokenize' and various output
1397  // reordering strings into regexp arg list
1398  octave_value_list regexpargs (nargin-3, octave_value ());
1399 
1400  int len = 0;
1401  for (int i = 3; i < nargin; i++)
1402  {
1403  const std::string opt = args(i).string_value ();
1404  if (opt != "tokenize" && opt != "start" && opt != "end"
1405  && opt != "tokenextents" && opt != "match" && opt != "tokens"
1406  && opt != "names" && opt != "split" && opt != "warnings")
1407  {
1408  regexpargs(len++) = args(i);
1409  }
1410  }
1411  regexpargs.resize (len);
1412 
1413  regexp::opts options;
1414  bool extra_args = false;
1415  parse_options (options, regexpargs, who, 0, extra_args);
1416 
1417  return regexp::replace (pattern, buffer, replacement, options, who);
1418 }
1419 
1420 DEFUN (regexprep, args, ,
1421  doc: /* -*- texinfo -*-
1422 @deftypefn {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr})
1423 @deftypefnx {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr}, "@var{opt1}", @dots{})
1424 Replace occurrences of pattern @var{pat} in @var{string} with @var{repstr}.
1425 
1426 The pattern is a regular expression as documented for @code{regexp}.
1427 @xref{XREFregexp,,@code{regexp}}.
1428 
1429 All strings must be UTF-8 encoded.
1430 
1431 The replacement string may contain @code{$i}, which substitutes for the ith
1432 set of parentheses in the match string. For example,
1433 
1434 @example
1435 regexprep ("Bill Dunn", '(\w+) (\w+)', '$2, $1')
1436 @end example
1437 
1438 @noindent
1439 returns @qcode{"Dunn, Bill"}
1440 
1441 Options in addition to those of @code{regexp} are
1442 
1443 @table @samp
1444 
1445 @item once
1446 Replace only the first occurrence of @var{pat} in the result.
1447 
1448 @item warnings
1449 This option is present for compatibility but is ignored.
1450 
1451 @end table
1452 
1453 Implementation Note: For compatibility with @sc{matlab}, escape sequences
1454 in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded
1455 even when @var{pat} has been defined with single quotes. To disable
1456 expansion use a second backslash before the escape sequence (e.g.,
1457 "@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate}
1458 function.
1459 @seealso{regexp, regexpi, strrep}
1460 @end deftypefn */)
1461 {
1462  if (args.length () < 3)
1463  print_usage ();
1464 
1465  octave_value_list retval;
1466 
1467  if (args(0).iscell () || args(1).iscell () || args(2).iscell ())
1468  {
1469  Cell str, pat, rep;
1470  dim_vector dv0;
1471  dim_vector dv1 (1, 1);
1472 
1473  if (args(0).iscell ())
1474  str = args(0).cell_value ();
1475  else
1476  str = Cell (args(0));
1477 
1478  if (args(1).iscell ())
1479  pat = args(1).cell_value ();
1480  else
1481  pat = Cell (args(1));
1482 
1483  if (args(2).iscell ())
1484  rep = args(2).cell_value ();
1485  else
1486  rep = Cell (args(2));
1487 
1488  dv0 = str.dims ();
1489  if (pat.numel () != 1)
1490  {
1491  dv1 = pat.dims ();
1492  if (rep.numel () != 1 && dv1 != rep.dims ())
1493  error ("regexprep: inconsistent cell array dimensions");
1494  }
1495  else if (rep.numel () != 1)
1496  dv1 = rep.dims ();
1497 
1498  Cell ret (dv0);
1499  octave_value_list new_args = args;
1500 
1501  for (octave_idx_type i = 0; i < dv0.numel (); i++)
1502  {
1503  new_args(0) = str(i);
1504  if (pat.numel () == 1)
1505  new_args(1) = pat(0);
1506  if (rep.numel () == 1)
1507  new_args(2) = rep(0);
1508 
1509  for (octave_idx_type j = 0; j < dv1.numel (); j++)
1510  {
1511  if (pat.numel () != 1)
1512  new_args(1) = pat(j);
1513  if (rep.numel () != 1)
1514  new_args(2) = rep(j);
1515  new_args(0) = octregexprep (new_args, "regexprep");
1516  }
1517 
1518  ret(i) = new_args(0);
1519  }
1520 
1521  retval = (args(0).iscell () ? ovl (ret) : ovl (ret(0)));
1522  }
1523  else
1524  retval = octregexprep (args, "regexprep");
1525 
1526  return retval;
1527 }
1528 
1529 /*
1530 %!test # Replace with empty
1531 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1532 %! t = regexprep (xml, '<[!?][^>]*>', '');
1533 %! assert (t, ' <tag v="hello">some stuff</tag>');
1534 
1535 %!test # Replace with non-empty
1536 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1537 %! t = regexprep (xml, '<[!?][^>]*>', '?');
1538 %! assert (t, '? <tag v="hello">some stuff?</tag>');
1539 
1540 %!test # Check that 'tokenize' is ignored
1541 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
1542 %! t = regexprep (xml, '<[!?][^>]*>', '', 'tokenize');
1543 %! assert (t, ' <tag v="hello">some stuff</tag>');
1544 
1545 ## Test capture replacement
1546 %!test
1547 %! data = "Bob Smith\nDavid Hollerith\nSam Jenkins";
1548 %! result = "Smith, Bob\nHollerith, David\nJenkins, Sam";
1549 %! t = regexprep (data, '(?m)^(\w+)\s+(\w+)$', '$2, $1');
1550 %! assert (t, result);
1551 
1552 ## Return the original if no match
1553 %!assert (regexprep ('hello', 'world', 'earth'), 'hello')
1554 
1555 ## Test emptymatch option
1556 %!assert (regexprep ('World', '^', 'Hello '), 'World')
1557 %!assert (regexprep ('World', '^', 'Hello ', 'emptymatch'), 'Hello World')
1558 
1559 ## Test a general replacement
1560 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g")
1561 
1562 ## Make sure replacements work at the beginning and end of string
1563 %!assert (regexprep ("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g")
1564 %!assert (regexprep ("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_")
1565 
1566 ## Test options "once" and "ignorecase"
1567 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"),
1568 %! "a_b]c{d}e-f=g")
1569 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"),
1570 %! "a_b_c_d_e_f_g")
1571 
1572 ## Option combinations
1573 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"),
1574 %! "a_b]c{d}e-f=g")
1575 
1576 ## End conditions on replacement
1577 %!assert (regexprep ("abc", "(b)", ".$1"), "a.bc")
1578 %!assert (regexprep ("abc", "(b)", "$1"), "abc")
1579 %!assert (regexprep ("abc", "(b)", "$1."), "ab.c")
1580 %!assert (regexprep ("abc", "(b)", "$1.."), "ab..c")
1581 
1582 ## Test cell array arguments
1583 %!assert (regexprep ("abc", {"b","a"}, "?"), "??c")
1584 %!assert (regexprep ({"abc","cba"}, "b", "?"), {"a?c","c?a"})
1585 %!assert (regexprep ({"abc","cba"}, {"b","a"}, {"?","!"}), {"!?c","c?!"})
1586 
1587 ## Nasty lookbehind expression
1588 %!test
1589 %! warning ("off", "Octave:regexp-lookbehind-limit", "local");
1590 %! assert (regexprep ('x^(-1)+y(-1)+z(-1)=0', '(?<=[a-z]+)\(\-[1-9]*\)',
1591 %! '_minus1'),'x^(-1)+y_minus1+z_minus1=0');
1592 
1593 ## Verify escape sequences in pattern
1594 %!assert (regexprep ("\n", '\n', "X"), "X")
1595 %!assert (regexprep ("\n", "\n", "X"), "X")
1596 
1597 ## Verify NULLs in pattern and replacement string
1598 %!assert (regexprep ("A\0A", "\0", ","), "A,A")
1599 %!assert (regexprep ("A\0A", '\0', ","), "A,A")
1600 %!assert (regexprep ("A,A", "A", "B\0B"), "B\0B,B\0B")
1601 %!assert (regexprep ("A,A", "A", 'B\0B'), "B\0B,B\0B")
1602 
1603 ## Empty matches were broken on ARM architecture
1604 %!test <*52810>
1605 %! assert (strcmp (regexprep ("\nabc", "^(\t*)(abc)$", "$1$2", "lineanchors"),
1606 %! "\nabc"));
1607 */
1608 
OCTAVE_END_NAMESPACE(octave)
charNDArray min(char d, const charNDArray &m)
Definition: chNDArray.cc:207
OCTARRAY_OVERRIDABLE_FUNC_API octave_idx_type numel(void) const
Number of elements in the array.
Definition: Array.h:414
OCTARRAY_OVERRIDABLE_FUNC_API const dim_vector & dims(void) const
Return a const-reference so that dims ()(i) works efficiently.
Definition: Array.h:503
Definition: Cell.h:43
std::size_t size(void) const
Definition: base-list.h:52
iterator begin(void)
Definition: base-list.h:65
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:94
octave_idx_type numel(int n=0) const
Number of elements that a matrix with this dimensions would have.
Definition: dim-vector.h:335
void assign(const std::string &k, const Cell &val)
Definition: oct-map.h:365
Cell cell_value(void) const
Definition: ovl.h:105
void resize(octave_idx_type n, const octave_value &rfv=octave_value())
Definition: ovl.h:117
octave_idx_type length(void) const
Definition: ovl.h:113
string_vector named_patterns(void) const
Definition: lo-regexp.h:217
void dotexceptnewline(bool val)
Definition: lo-regexp.h:140
void lineanchors(bool val)
Definition: lo-regexp.h:143
void case_insensitive(bool val)
Definition: lo-regexp.h:139
void freespacing(bool val)
Definition: lo-regexp.h:142
void emptymatch(bool val)
Definition: lo-regexp.h:141
void once(bool val)
Definition: lo-regexp.h:144
std::string replace(const std::string &buffer, const std::string &replacement) const
Definition: lo-regexp.cc:608
match_data match(const std::string &buffer) const
Definition: lo-regexp.cc:328
octave_idx_type numel(void) const
Definition: str-vec.h:100
OCTAVE_BEGIN_NAMESPACE(octave) static octave_value daspk_fcn
OCTINTERP_API void print_usage(void)
Definition: defun-int.h:72
#define DEFUN(name, args_name, nargout_name, doc)
Macro to define a builtin function.
Definition: defun.h:56
void warning(const char *fmt,...)
Definition: error.cc:1054
void error(const char *fmt,...)
Definition: error.cc:979
ColumnVector transform(const Matrix &m, double x, double y, double z)
Definition: graphics.cc:5819
class OCTAVE_API Matrix
Definition: mx-fwd.h:31
octave_idx_type n
Definition: mx-inlines.cc:753
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:44
return octave_value(v1.char_array_value() . concat(v2.char_array_value(), ra_idx),((a1.is_sq_string()||a2.is_sq_string()) ? '\'' :'"'))
octave_value_list ovl(const OV_Args &... args)
Construct an octave_value_list with less typing.
Definition: ovl.h:211
static octave_value_list octregexp(const octave_value_list &args, int nargout, const std::string &who, bool case_insensitive=false)
Definition: regexp.cc:362
static void parse_options(regexp::opts &options, const octave_value_list &args, const std::string &who, int skip, bool &extra_args)
Definition: regexp.cc:313
static octave_value octregexprep(const octave_value_list &args, const std::string &who)
Definition: regexp.cc:1378
static std::string do_regexp_rep_string_escapes(const std::string &s)
Definition: regexp.cc:145
static std::string do_regexp_ptn_string_escapes(const std::string &s, bool is_sq_str)
Definition: regexp.cc:55
static octave_value_list octcellregexp(const octave_value_list &args, int nargout, const std::string &who, bool case_insensitive=false)
Definition: regexp.cc:546
F77_RET_T len
Definition: xerbla.cc:61