GNU Octave  9.1.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
strfind.cc
Go to the documentation of this file.
1 ////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (C) 2009-2024 The Octave Project Developers
4 //
5 // See the file COPYRIGHT.md in the top-level directory of this
6 // distribution or <https://octave.org/copyright/>.
7 //
8 // This file is part of Octave.
9 //
10 // Octave is free software: you can redistribute it and/or modify it
11 // under the terms of the GNU General Public License as published by
12 // the Free Software Foundation, either version 3 of the License, or
13 // (at your option) any later version.
14 //
15 // Octave is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 // GNU General Public License for more details.
19 //
20 // You should have received a copy of the GNU General Public License
21 // along with Octave; see the file COPYING. If not, see
22 // <https://www.gnu.org/licenses/>.
23 //
24 ////////////////////////////////////////////////////////////////////////
25 
26 #if defined (HAVE_CONFIG_H)
27 # include "config.h"
28 #endif
29 
30 #include <algorithm>
31 #include <deque>
32 #include <limits>
33 #include <string>
34 
35 #include "oct-locbuf.h"
36 
37 #include "Cell.h"
38 #include "builtin-defun-decls.h"
39 #include "defun.h"
40 #include "errwarn.h"
41 #include "error.h"
42 #include "ov.h"
43 #include "unwind-prot.h"
44 #include "utils.h"
45 
47 
48 // This allows safe indexing with char.
49 // In C++, char may be (and often is) signed!
50 #define ORD(ch) static_cast<unsigned char> (ch)
51 #define TABSIZE (std::numeric_limits<unsigned char>::max () + 1)
52 
53 // This is the quick search algorithm, as described at
54 // http://www-igm.univ-mlv.fr/~lecroq/string/node19.html
55 static void
56 qs_preprocess (const Array<char>& needle,
57  octave_idx_type *table)
58 {
59  const char *x = needle.data ();
60  octave_idx_type m = needle.numel ();
61 
62  for (octave_idx_type i = 0; i < TABSIZE; i++)
63  table[i] = m + 1;
64  for (octave_idx_type i = 0; i < m; i++)
65  table[ORD(x[i])] = m - i;
66 }
67 
69 qs_search (const Array<char>& needle,
70  const Array<char>& haystack,
71  const octave_idx_type *table,
72  bool overlaps = true)
73 {
74  const char *x = needle.data ();
75  octave_idx_type m = needle.numel ();
76  const char *y = haystack.data ();
77  octave_idx_type n = haystack.numel ();
78 
79  // We'll use deque because it typically has the most favorable properties for
80  // the operation we need.
81  std::deque<octave_idx_type> accum;
82  if (m == 1)
83  {
84  // Looking for a single character.
85  for (octave_idx_type i = 0; i < n; i++)
86  {
87  if (y[i] == x[0])
88  accum.push_back (i);
89  }
90  }
91  else if (m == 2)
92  {
93  // Two characters.
94  if (overlaps)
95  {
96  for (octave_idx_type i = 0; i < n-1; i++)
97  {
98  if (y[i] == x[0] && y[i+1] == x[1])
99  accum.push_back (i);
100  }
101  }
102  else
103  {
104  for (octave_idx_type i = 0; i < n-1; i++)
105  {
106  if (y[i] == x[0] && y[i+1] == x[1])
107  accum.push_back (i++);
108  }
109  }
110  }
111  else if (n >= m)
112  {
113  // General case.
114  octave_idx_type j = 0;
115 
116  if (overlaps)
117  {
118  while (j < n - m)
119  {
120  if (std::equal (x, x + m, y + j))
121  accum.push_back (j);
122  j += table[ORD(y[j + m])];
123  }
124  }
125  else
126  {
127  while (j < n - m)
128  {
129  if (std::equal (x, x + m, y + j))
130  {
131  accum.push_back (j);
132  j += m;
133  }
134  else
135  j += table[ORD(y[j + m])];
136  }
137  }
138 
139  if (j == n - m && std::equal (x, x + m, y + j))
140  accum.push_back (j);
141  }
142 
143  octave_idx_type nmatch = accum.size ();
144  octave_idx_type one = 1;
145  Array<octave_idx_type> result (dim_vector (std::min (one, nmatch), nmatch));
146  octave_idx_type k = 0;
147  for (const auto& idx : accum)
148  result.xelem (k++) = idx;
149 
150  return result;
151 }
152 
153 DEFUN (strfind, args, ,
154  doc: /* -*- texinfo -*-
155 @deftypefn {} {@var{idx} =} strfind (@var{str}, @var{pattern})
156 @deftypefnx {} {@var{idx} =} strfind (@var{cellstr}, @var{pattern})
157 @deftypefnx {} {@var{idx} =} strfind (@dots{}, "overlaps", @var{val})
158 @deftypefnx {} {@var{idx} =} strfind (@dots{}, "forcecelloutput", @var{val})
159 Search for @var{pattern} in the string @var{str} and return the starting
160 index of every such occurrence in the vector @var{idx}.
161 
162 If there is no such occurrence, or if @var{pattern} is longer than
163 @var{str}, or if @var{pattern} itself is empty, then @var{idx} is the empty
164 array @code{[]}.
165 
166 The optional argument @qcode{"overlaps"} determines whether the pattern
167 can match at every position in @var{str} (true), or only for unique
168 occurrences of the complete pattern (false). The default is true.
169 
170 If a cell array of strings @var{cellstr} is specified then @var{idx} is a
171 cell array of vectors, as specified above.
172 
173 The optional argument @qcode{"forcecelloutput"} forces @var{idx} to be
174 returned as a cell array of vectors. The default is false.
175 
176 Examples:
177 
178 @example
179 @group
180 strfind ("abababa", "aba")
181  @result{} [1, 3, 5]
182 @end group
183 
184 @group
185 strfind ("abababa", "aba", "overlaps", false)
186  @result{} [1, 5]
187 @end group
188 
189 @group
190 strfind (@{"abababa", "bebebe", "ab"@}, "aba")
191  @result{}
192  @{
193  [1,1] =
194 
195  1 3 5
196 
197  [1,2] = [](1x0)
198  [1,3] = [](1x0)
199  @}
200 @end group
201 
202 @group
203 strfind ("abababa", "aba", "forcecelloutput", true)
204  @result{}
205  @{
206  [1,1] =
207 
208  1 3 5
209  @}
210 @end group
211 @end example
212 @seealso{regexp, regexpi, find}
213 @end deftypefn */)
214 {
215  int nargin = args.length ();
216 
217  if (nargin != 4 && nargin != 2)
218  print_usage ();
219 
220  bool overlaps = true;
221  bool forcecelloutput = false;
222  if (nargin == 4)
223  {
224  if (! args(2).is_string () || ! args(3).is_scalar_type ())
225  error ("strfind: invalid optional arguments");
226 
227  std::string opt = args(2).string_value ();
228  std::transform (opt.begin (), opt.end (), opt.begin (), tolower);
229 
230  if (opt == "overlaps")
231  overlaps = args(3).bool_value ();
232  else if (opt == "forcecelloutput")
233  forcecelloutput = args(3).bool_value ();
234  else
235  error ("strfind: unknown option: %s", opt.c_str ());
236  }
237 
238  octave_value retval;
239 
240  octave_value argstr = args(0);
241  octave_value argpat = args(1);
242 
243  if (argpat.is_string ())
244  {
245  Array<char> needle = argpat.char_array_value ();
247  qs_preprocess (needle, table);
248 
249  if (argstr.is_string ())
250  {
251  if (argpat.isempty ())
252  // Return a null matrix for null pattern for MW compatibility
253  retval = Matrix ();
254  else
255  retval = octave_value (qs_search (needle,
256  argstr.char_array_value (),
257  table, overlaps),
258  true, true);
259  if (forcecelloutput)
260  retval = Cell (retval);
261  }
262  else if (argstr.iscell ())
263  {
264  const Cell argsc = argstr.cell_value ();
265  Cell retc (argsc.dims ());
266  octave_idx_type ns = argsc.numel ();
267 
268  for (octave_idx_type i = 0; i < ns; i++)
269  {
270  octave_value argse = argsc(i);
271  if (! argse.is_string ())
272  error ("strfind: each element of CELLSTR must be a string");
273 
274  if (argpat.isempty ())
275  retc(i) = Matrix ();
276  else
277  retc(i) = octave_value (qs_search (needle,
278  argse.char_array_value (),
279  table, overlaps),
280  true, true);
281  }
282 
283  retval = retc;
284  }
285  else
286  error ("strfind: first argument must be a string or cell array of strings");
287  }
288  else if (argpat.iscell ())
289  retval = do_simple_cellfun (Fstrfind, "strfind", args);
290  else
291  error ("strfind: PATTERN must be a string or cell array of strings");
292 
293  return retval;
294 }
295 
296 /*
297 %!assert (strfind ("abababa", "aba"), [1, 3, 5])
298 %!assert (strfind ("abababa", "aba", "overlaps", false), [1, 5])
299 %!assert (strfind ("abababa", "aba", "forcecelloutput", false), [1, 3, 5])
300 %!assert (strfind ("abababa", "aba", "forcecelloutput", true), {[1, 3, 5]})
301 %!assert (strfind ({"abababa", "bla", "bla"}, "a"), {[1, 3, 5, 7], 3, 3})
302 %!assert (strfind ({"abababa", "bla", "bla"}, "a", "forcecelloutput", false),
303 %! {[1, 3, 5, 7], 3, 3})
304 %!assert (strfind ({"abababa", "bla", "bla"}, "a", "forcecelloutput", true),
305 %! {[1, 3, 5, 7], 3, 3})
306 %!assert (strfind ("Linux _is_ user-friendly. It just isn't ignorant-friendly or idiot-friendly.", "friendly"), [17, 50, 68])
307 %!assert (strfind ("abc", ""), [])
308 %!assert (strfind ("abc", {"", "b", ""}), {[], 2, []})
309 %!assert (strfind ({"abc", "def"}, ""), {[], []})
310 
311 %!error strfind ()
312 %!error strfind ("foo", "bar", 1)
313 %!error <unknown option: foobar> strfind ("foo", 100, "foobar", 1)
314 %!error <each element of CELLSTR must be a string> strfind ({"A", 1}, "foo")
315 %!error <first argument must be a string> strfind (100, "foo")
316 %!error <PATTERN must be a string> strfind ("foo", 100)
317 */
318 
319 static Array<char>
320 qs_replace (const Array<char>& str, const Array<char>& pat,
321  const Array<char>& rep,
322  const octave_idx_type *table,
323  bool overlaps = true)
324 {
325  Array<char> ret = str;
326 
327  octave_idx_type siz = str.numel ();
328  octave_idx_type psiz = pat.numel ();
329  octave_idx_type rsiz = rep.numel ();
330 
331  if (psiz != 0)
332  {
333  // Look up matches, without overlaps.
334  const Array<octave_idx_type> idx = qs_search (pat, str, table, overlaps);
335  octave_idx_type nidx = idx.numel ();
336 
337  if (nidx)
338  {
339  // Compute result size.
340  octave_idx_type retsiz;
341  if (overlaps)
342  {
343  retsiz = 0;
344  // OMG. Is this the "right answer" MW always looks for, or
345  // someone was just lazy?
346  octave_idx_type k = 0;
347  for (octave_idx_type i = 0; i < nidx; i++)
348  {
349  octave_idx_type j = idx(i);
350  if (j >= k)
351  retsiz += j - k;
352  retsiz += rsiz;
353  k = j + psiz;
354  }
355 
356  retsiz += siz - k;
357  }
358  else
359  retsiz = siz + nidx * (rsiz - psiz);
360 
361  if (retsiz == 0)
362  ret.clear (dim_vector (0, 0));
363  else
364  {
365  ret.clear (dim_vector (1, retsiz));
366  const char *src = str.data ();
367  const char *reps = rep.data ();
368  char *dest = ret.fortran_vec ();
369 
370  octave_idx_type k = 0;
371  for (octave_idx_type i = 0; i < nidx; i++)
372  {
373  octave_idx_type j = idx(i);
374  if (j >= k)
375  dest = std::copy (src + k, src + j, dest);
376  dest = std::copy (reps, reps + rsiz, dest);
377  k = j + psiz;
378  }
379 
380  std::copy (src + k, src + siz, dest);
381  }
382  }
383  }
384 
385  return ret;
386 }
387 
388 DEFUN (strrep, args, ,
389  doc: /* -*- texinfo -*-
390 @deftypefn {} {@var{newstr} =} strrep (@var{str}, @var{ptn}, @var{rep})
391 @deftypefnx {} {@var{newstr} =} strrep (@var{cellstr}, @var{ptn}, @var{rep})
392 @deftypefnx {} {@var{newstr} =} strrep (@dots{}, "overlaps", @var{val})
393 Replace all occurrences of the pattern @var{ptn} in the string @var{str}
394 with the string @var{rep} and return the result.
395 
396 The optional argument @qcode{"overlaps"} determines whether the pattern
397 can match at every position in @var{str} (true), or only for unique
398 occurrences of the complete pattern (false). The default is true.
399 
400 @var{s} may also be a cell array of strings, in which case the replacement
401 is done for each element and a cell array is returned.
402 
403 Example:
404 
405 @example
406 @group
407 strrep ("This is a test string", "is", "&%$")
408  @result{} "Th&%$ &%$ a test string"
409 @end group
410 @end example
411 
412 @seealso{regexprep, strfind}
413 @end deftypefn */)
414 {
415  int nargin = args.length ();
416 
417  if (nargin != 3 && nargin != 5)
418  print_usage ();
419 
420  bool overlaps = true;
421 
422  if (nargin == 5)
423  {
424  if (! args(3).is_string () || ! args(4).is_scalar_type ())
425  error ("strrep: invalid optional argument");
426 
427  std::string opt = args(3).string_value ();
428  if (opt != "overlaps")
429  error ("strrep: unknown option: %s", opt.c_str ());
430 
431  overlaps = args(4).bool_value ();
432  }
433 
434  octave_value retval;
435 
436  // Aliasing for better code readability
437  octave_value argstr = args(0);
438  octave_value argpat = args(1);
439  octave_value argrep = args(2);
440 
441  if (argpat.is_string () && argrep.is_string ())
442  {
443  const Array<char> pat = argpat.char_array_value ();
444  const Array<char> rep = argrep.char_array_value ();
445 
447  qs_preprocess (pat, table);
448 
449  if (argstr.is_string ())
450  if (argstr.rows () == 1) // most common case of a single string
451  retval = qs_replace (argstr.char_array_value (), pat, rep,
452  table, overlaps);
453  else
454  {
455  const charMatrix argchm = argstr.char_matrix_value ();
456  octave_idx_type nel = argchm.rows ();
457  octave_idx_type nc = argchm.columns ();
458  charMatrix retchm (nel, 0);
459 
460  for (octave_idx_type i = 0; i < nel; i++)
461  {
462  charMatrix rowchm;
463  rowchm = qs_replace (argchm.extract (i, 0, i, nc-1),
464  pat, rep, table, overlaps);
465  retchm.insert (rowchm, i, 0);
466  }
467 
468  retval = retchm;
469  }
470  else if (argstr.iscell ())
471  {
472  const Cell argcell = argstr.cell_value ();
473  if (! argcell.iscellstr ())
474  error ("strrep: each element of S must be a string");
475 
476  Cell retcell (argcell.dims ());
477  octave_idx_type nel = argcell.numel ();
478 
479  for (octave_idx_type i = 0; i < nel; i++)
480  {
481  retcell(i) = qs_replace (argcell(i).char_array_value (),
482  pat, rep, table, overlaps);
483  }
484 
485  retval = retcell;
486  }
487  else
488  error ("strrep: S must be a string or cell array of strings");
489  }
490  else if (argpat.iscell () || argrep.iscell ())
491  retval = do_simple_cellfun (Fstrrep, "strrep", args);
492  else
493  error ("strrep: PTN and REP arguments must be strings or cell arrays of strings");
494 
495  return retval;
496 }
497 
498 /*
499 %!assert (strrep ("This is a test string", "is", "&%$"),
500 %! "Th&%$ &%$ a test string")
501 %!assert (strrep ("abababc", "abab", "xyz"), "xyzxyzc")
502 %!assert (strrep ("abababc", "abab", "xyz", "overlaps", false), "xyzabc")
503 %!assert (strrep ({"Hello World"; "Goodbye World"}, "World", "Jane"),
504 %! {"Hello Jane"; "Goodbye Jane"})
505 %!assert (strrep (char ("Hello World", "Goodbye World"), "World", "Jane"),
506 %! char ("Hello Jane", "Goodbye Jane"))
507 
508 %!assert (size (strrep ("a", "a", "")), [0 0])
509 
510 %!error strrep ()
511 %!error strrep ("A")
512 %!error strrep ("A", "B")
513 %!error strrep ("A", "B", "C", "D")
514 %!error strrep ("A", "B", "C", "D", "E", "F")
515 %!error <invalid optional argument> strrep ("A", "B", "C", 3, true)
516 %!error <invalid optional argument> strrep ("A", "B", "C", "str", ones (2,2))
517 %!error <unknown option: foobar> strrep ("A", "B", "C", "foobar", true)
518 %!error <each element of S must be a string> strrep ({"A", 1.0}, "B", "C")
519 %!error <S must be a string or cell array of strings> strrep (1.0, "B", "C")
520 %!error <PTN and REP arguments must be strings> strrep ("A", 1.0, "C")
521 %!error <PTN and REP arguments must be strings> strrep ("A", "B", 1.0)
522 */
523 
524 OCTAVE_END_NAMESPACE(octave)
charNDArray min(char d, const charNDArray &m)
Definition: chNDArray.cc:207
T * fortran_vec()
Size of the specified dimension.
Definition: Array-base.cc:1764
void clear()
Definition: Array-base.cc:109
octave_idx_type rows() const
Definition: Array.h:459
const T * data() const
Size of the specified dimension.
Definition: Array.h:663
octave_idx_type columns() const
Definition: Array.h:471
const dim_vector & dims() const
Return a const-reference so that dims ()(i) works efficiently.
Definition: Array.h:503
octave_idx_type numel() const
Number of elements in the array.
Definition: Array.h:414
Definition: Cell.h:43
bool iscellstr() const
Definition: Cell.cc:126
Definition: dMatrix.h:42
charMatrix extract(octave_idx_type r1, octave_idx_type c1, octave_idx_type r2, octave_idx_type c2) const
Definition: chMatrix.cc:116
charMatrix & insert(const char *s, octave_idx_type r, octave_idx_type c)
Definition: chMatrix.cc:59
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:94
charMatrix char_matrix_value(bool frc_str_conv=false) const
Definition: ov.h:894
Cell cell_value() const
octave_idx_type rows() const
Definition: ov.h:545
bool is_string() const
Definition: ov.h:637
charNDArray char_array_value(bool frc_str_conv=false) const
Definition: ov.h:897
bool isempty() const
Definition: ov.h:601
bool iscell() const
Definition: ov.h:604
OCTAVE_BEGIN_NAMESPACE(octave) static octave_value daspk_fcn
void print_usage(void)
Definition: defun-int.h:72
#define DEFUN(name, args_name, nargout_name, doc)
Macro to define a builtin function.
Definition: defun.h:56
void() error(const char *fmt,...)
Definition: error.cc:988
ColumnVector transform(const Matrix &m, double x, double y, double z)
Definition: graphics.cc:5468
F77_RET_T const F77_DBLE * x
T octave_idx_type m
Definition: mx-inlines.cc:781
octave_idx_type n
Definition: mx-inlines.cc:761
octave_value_list do_simple_cellfun(octave_value_list(*fcn)(const octave_value_list &, int), const char *fcn_name, const octave_value_list &args, int nargout)
Definition: utils.cc:1679
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:44
return octave_value(v1.char_array_value() . concat(v2.char_array_value(), ra_idx),((a1.is_sq_string()||a2.is_sq_string()) ? '\'' :'"'))
octave_value_list Fstrfind(const octave_value_list &args, int)
Definition: strfind.cc:213
#define TABSIZE
Definition: strfind.cc:51
octave_value_list Fstrrep(const octave_value_list &args, int)
Definition: strfind.cc:413
#define ORD(ch)
Definition: strfind.cc:50