GNU Octave  9.1.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
strfind.cc
Go to the documentation of this file.
1 ////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (C) 2009-2024 The Octave Project Developers
4 //
5 // See the file COPYRIGHT.md in the top-level directory of this
6 // distribution or <https://octave.org/copyright/>.
7 //
8 // This file is part of Octave.
9 //
10 // Octave is free software: you can redistribute it and/or modify it
11 // under the terms of the GNU General Public License as published by
12 // the Free Software Foundation, either version 3 of the License, or
13 // (at your option) any later version.
14 //
15 // Octave is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 // GNU General Public License for more details.
19 //
20 // You should have received a copy of the GNU General Public License
21 // along with Octave; see the file COPYING. If not, see
22 // <https://www.gnu.org/licenses/>.
23 //
24 ////////////////////////////////////////////////////////////////////////
25 
26 #if defined (HAVE_CONFIG_H)
27 # include "config.h"
28 #endif
29 
30 #include <algorithm>
31 #include <deque>
32 #include <limits>
33 #include <string>
34 
35 #include "oct-locbuf.h"
36 
37 #include "Cell.h"
38 #include "builtin-defun-decls.h"
39 #include "defun.h"
40 #include "errwarn.h"
41 #include "error.h"
42 #include "ov.h"
43 #include "unwind-prot.h"
44 #include "utils.h"
45 
47 
48 // This allows safe indexing with char.
49 // In C++, char may be (and often is) signed!
50 #define ORD(ch) static_cast<unsigned char> (ch)
51 #define TABSIZE (std::numeric_limits<unsigned char>::max () + 1)
52 
53 // This is the quick search algorithm, as described at
54 // http://www-igm.univ-mlv.fr/~lecroq/string/node19.html
55 static void
56 qs_preprocess (const Array<char>& needle,
57  octave_idx_type *table)
58 {
59  const char *x = needle.data ();
60  octave_idx_type m = needle.numel ();
61 
62  for (octave_idx_type i = 0; i < TABSIZE; i++)
63  table[i] = m + 1;
64  for (octave_idx_type i = 0; i < m; i++)
65  table[ORD(x[i])] = m - i;
66 }
67 
69 qs_search (const Array<char>& needle,
70  const Array<char>& haystack,
71  const octave_idx_type *table,
72  bool overlaps = true)
73 {
74  const char *x = needle.data ();
75  octave_idx_type m = needle.numel ();
76  const char *y = haystack.data ();
77  octave_idx_type n = haystack.numel ();
78 
79  // We'll use deque because it typically has the most favorable properties for
80  // the operation we need.
81  std::deque<octave_idx_type> accum;
82  if (m == 1)
83  {
84  // Looking for a single character.
85  for (octave_idx_type i = 0; i < n; i++)
86  {
87  if (y[i] == x[0])
88  accum.push_back (i);
89  }
90  }
91  else if (m == 2)
92  {
93  // Two characters.
94  if (overlaps)
95  {
96  for (octave_idx_type i = 0; i < n-1; i++)
97  {
98  if (y[i] == x[0] && y[i+1] == x[1])
99  accum.push_back (i);
100  }
101  }
102  else
103  {
104  for (octave_idx_type i = 0; i < n-1; i++)
105  {
106  if (y[i] == x[0] && y[i+1] == x[1])
107  accum.push_back (i++);
108  }
109  }
110  }
111  else if (n >= m)
112  {
113  // General case.
114  octave_idx_type j = 0;
115 
116  if (overlaps)
117  {
118  while (j < n - m)
119  {
120  if (std::equal (x, x + m, y + j))
121  accum.push_back (j);
122  j += table[ORD(y[j + m])];
123  }
124  }
125  else
126  {
127  while (j < n - m)
128  {
129  if (std::equal (x, x + m, y + j))
130  {
131  accum.push_back (j);
132  j += m;
133  }
134  else
135  j += table[ORD(y[j + m])];
136  }
137  }
138 
139  if (j == n - m && std::equal (x, x + m, y + j))
140  accum.push_back (j);
141  }
142 
143  octave_idx_type nmatch = accum.size ();
144  octave_idx_type one = 1;
145  Array<octave_idx_type> result (dim_vector (std::min (one, nmatch), nmatch));
146  octave_idx_type k = 0;
147  for (const auto& idx : accum)
148  result.xelem (k++) = idx;
149 
150  return result;
151 }
152 
153 DEFUN (strfind, args, ,
154  doc: /* -*- texinfo -*-
155 @deftypefn {} {@var{idx} =} strfind (@var{str}, @var{pattern})
156 @deftypefnx {} {@var{idx} =} strfind (@var{cellstr}, @var{pattern})
157 @deftypefnx {} {@var{idx} =} strfind (@dots{}, "overlaps", @var{val})
158 @deftypefnx {} {@var{idx} =} strfind (@dots{}, "forcecelloutput", @var{val})
159 Search for @var{pattern} in the string @var{str} and return the starting
160 index of every such occurrence in the vector @var{idx}.
161 
162 If there is no such occurrence, or if @var{pattern} is longer than
163 @var{str}, or if @var{pattern} itself is empty, then @var{idx} is the empty
164 array @code{[]}.
165 
166 The optional argument @qcode{"overlaps"} determines whether the pattern
167 can match at every position in @var{str} (true), or only for unique
168 occurrences of the complete pattern (false). The default is true.
169 
170 If a cell array of strings @var{cellstr} is specified then @var{idx} is a
171 cell array of vectors, as specified above.
172 
173 The optional argument @qcode{"forcecelloutput"} forces @var{idx} to be
174 returned as a cell array of vectors. The default is false.
175 
176 Examples:
177 
178 @example
179 @group
180 strfind ("abababa", "aba")
181  @result{} [1, 3, 5]
182 @end group
183 
184 @group
185 strfind ("abababa", "aba", "overlaps", false)
186  @result{} [1, 5]
187 @end group
188 
189 @group
190 strfind (@{"abababa", "bebebe", "ab"@}, "aba")
191  @result{}
192  @{
193  [1,1] =
194 
195  1 3 5
196 
197  [1,2] = [](1x0)
198  [1,3] = [](1x0)
199  @}
200 @end group
201 
202 @group
203 strfind ("abababa", "aba", "forcecelloutput", true)
204  @result{}
205  @{
206  [1,1] =
207 
208  1 3 5
209  @}
210 @end group
211 @end example
212 @seealso{regexp, regexpi, find}
213 @end deftypefn */)
214 {
215  int nargin = args.length ();
216 
217  if (nargin != 4 && nargin != 2)
218  print_usage ();
219 
220  bool overlaps = true;
221  bool forcecelloutput = false;
222  if (nargin == 4)
223  {
224  if (! args(2).is_string () || ! args(3).is_scalar_type ())
225  error ("strfind: invalid optional arguments");
226 
227  std::string opt = args(2).string_value ();
228  std::transform (opt.begin (), opt.end (), opt.begin (), tolower);
229 
230  if (opt == "overlaps")
231  overlaps = args(3).bool_value ();
232  else if (opt == "forcecelloutput")
233  forcecelloutput = args(3).bool_value ();
234  else
235  error ("strfind: unknown option: %s", opt.c_str ());
236  }
237 
238  octave_value retval;
239 
240  octave_value argstr = args(0);
241  octave_value argpat = args(1);
242 
243  if (argpat.is_string ())
244  {
245  Array<char> needle = argpat.char_array_value ();
247  qs_preprocess (needle, table);
248 
249  if (argstr.is_string ())
250  {
251  if (argpat.isempty ())
252  // Return a null matrix for null pattern for MW compatibility
253  retval = Matrix ();
254  else
255  retval = octave_value (qs_search (needle,
256  argstr.char_array_value (),
257  table, overlaps),
258  true, true);
259  if (forcecelloutput)
260  retval = Cell (retval);
261  }
262  else if (argstr.iscell ())
263  {
264  const Cell argsc = argstr.cell_value ();
265  Cell retc (argsc.dims ());
266  octave_idx_type ns = argsc.numel ();
267 
268  for (octave_idx_type i = 0; i < ns; i++)
269  {
270  octave_value argse = argsc(i);
271  if (! argse.is_string ())
272  error ("strfind: each element of CELLSTR must be a string");
273 
274  if (argpat.isempty ())
275  retc(i) = Matrix ();
276  else
277  retc(i) = octave_value (qs_search (needle,
278  argse.char_array_value (),
279  table, overlaps),
280  true, true);
281  }
282 
283  retval = retc;
284  }
285  else
286  error ("strfind: first argument must be a string or cell array of strings");
287  }
288  else if (argpat.iscell ())
289  retval = do_simple_cellfun (Fstrfind, "strfind", args);
290  else
291  error ("strfind: PATTERN must be a string or cell array of strings");
292 
293  return retval;
294 }
295 
296 /*
297 %!assert (strfind ("abababa", "aba"), [1, 3, 5])
298 %!assert (strfind ("abababa", "aba", "overlaps", false), [1, 5])
299 %!assert (strfind ("abababa", "aba", "forcecelloutput", false), [1, 3, 5])
300 %!assert (strfind ("abababa", "aba", "forcecelloutput", true), {[1, 3, 5]})
301 %!assert (strfind ({"abababa", "bla", "bla"}, "a"), {[1, 3, 5, 7], 3, 3})
302 %!assert (strfind ({"abababa", "bla", "bla"}, "a", "forcecelloutput", false),
303 %! {[1, 3, 5, 7], 3, 3})
304 %!assert (strfind ({"abababa", "bla", "bla"}, "a", "forcecelloutput", true),
305 %! {[1, 3, 5, 7], 3, 3})
306 %!assert (strfind ("Linux _is_ user-friendly. It just isn't ignorant-friendly or idiot-friendly.", "friendly"), [17, 50, 68])
307 %!assert (strfind ("abc", ""), [])
308 %!assert (strfind ("abc", {"", "b", ""}), {[], 2, []})
309 %!assert (strfind ({"abc", "def"}, ""), {[], []})
310 
311 %!error strfind ()
312 %!error strfind ("foo", "bar", 1)
313 %!error <unknown option: foobar> strfind ("foo", 100, "foobar", 1)
314 %!error <each element of CELLSTR must be a string> strfind ({"A", 1}, "foo")
315 %!error <first argument must be a string> strfind (100, "foo")
316 %!error <PATTERN must be a string> strfind ("foo", 100)
317 */
318 
319 static Array<char>
320 qs_replace (const Array<char>& str, const Array<char>& pat,
321  const Array<char>& rep,
322  const octave_idx_type *table,
323  bool overlaps = true)
324 {
325  Array<char> ret = str;
326 
327  octave_idx_type siz = str.numel ();
328  octave_idx_type psiz = pat.numel ();
329  octave_idx_type rsiz = rep.numel ();
330 
331  if (psiz != 0)
332  {
333  // Look up matches, without overlaps.
334  const Array<octave_idx_type> idx = qs_search (pat, str, table, overlaps);
335  octave_idx_type nidx = idx.numel ();
336 
337  if (nidx)
338  {
339  // Compute result size.
340  octave_idx_type retsiz;
341  if (overlaps)
342  {
343  retsiz = 0;
344  // OMG. Is this the "right answer" MW always looks for, or
345  // someone was just lazy?
346  octave_idx_type k = 0;
347  for (octave_idx_type i = 0; i < nidx; i++)
348  {
349  octave_idx_type j = idx(i);
350  if (j >= k)
351  retsiz += j - k;
352  retsiz += rsiz;
353  k = j + psiz;
354  }
355 
356  retsiz += siz - k;
357  }
358  else
359  retsiz = siz + nidx * (rsiz - psiz);
360 
361  if (retsiz == 0)
362  ret.clear (dim_vector (0, 0));
363  else
364  {
365  ret.clear (dim_vector (1, retsiz));
366  const char *src = str.data ();
367  const char *reps = rep.data ();
368  char *dest = ret.fortran_vec ();
369 
370  octave_idx_type k = 0;
371  for (octave_idx_type i = 0; i < nidx; i++)
372  {
373  octave_idx_type j = idx(i);
374  if (j >= k)
375  dest = std::copy (src + k, src + j, dest);
376  dest = std::copy (reps, reps + rsiz, dest);
377  k = j + psiz;
378  }
379 
380  std::copy (src + k, src + siz, dest);
381  }
382  }
383  }
384 
385  return ret;
386 }
387 
388 DEFUN (strrep, args, ,
389  doc: /* -*- texinfo -*-
390 @deftypefn {} {@var{newstr} =} strrep (@var{str}, @var{ptn}, @var{rep})
391 @deftypefnx {} {@var{newstr} =} strrep (@var{cellstr}, @var{ptn}, @var{rep})
392 @deftypefnx {} {@var{newstr} =} strrep (@dots{}, "overlaps", @var{val})
393 Replace all occurrences of the pattern @var{ptn} in the string @var{str}
394 with the string @var{rep} and return the result.
395 
396 The optional argument @qcode{"overlaps"} determines whether the pattern
397 can match at every position in @var{str} (true), or only for unique
398 occurrences of the complete pattern (false). The default is true.
399 
400 @var{s} may also be a cell array of strings, in which case the replacement
401 is done for each element and a cell array is returned.
402 
403 Example:
404 
405 @example
406 @group
407 strrep ("This is a test string", "is", "&%$")
408  @result{} "Th&%$ &%$ a test string"
409 @end group
410 @end example
411 
412 @seealso{regexprep, strfind}
413 @end deftypefn */)
414 {
415  int nargin = args.length ();
416 
417  if (nargin != 3 && nargin != 5)
418  print_usage ();
419 
420  bool overlaps = true;
421 
422  if (nargin == 5)
423  {
424  if (! args(3).is_string () || ! args(4).is_scalar_type ())
425  error ("strrep: invalid optional argument");
426 
427  std::string opt = args(3).string_value ();
428  if (opt != "overlaps")
429  error ("strrep: unknown option: %s", opt.c_str ());
430 
431  overlaps = args(4).bool_value ();
432  }
433 
434  octave_value retval;
435 
436  // Aliasing for better code readability
437  octave_value argstr = args(0);
438  octave_value argpat = args(1);
439  octave_value argrep = args(2);
440 
441  if (argpat.is_string () && argrep.is_string ())
442  {
443  const Array<char> pat = argpat.char_array_value ();
444  const Array<char> rep = argrep.char_array_value ();
445 
447  qs_preprocess (pat, table);
448 
449  if (argstr.is_string ())
450  if (argstr.rows () == 1) // most common case of a single string
451  retval = qs_replace (argstr.char_array_value (), pat, rep,
452  table, overlaps);
453  else
454  {
455  const charMatrix argchm = argstr.char_matrix_value ();
456  octave_idx_type nel = argchm.rows ();
457  octave_idx_type nc = argchm.columns ();
458  charMatrix retchm (nel, 0);
459 
460  for (octave_idx_type i = 0; i < nel; i++)
461  {
462  charMatrix rowchm;
463  rowchm = qs_replace (argchm.extract (i, 0, i, nc-1),
464  pat, rep, table, overlaps);
465  retchm.insert (rowchm, i, 0);
466  }
467 
468  retval = retchm;
469  }
470  else if (argstr.iscell ())
471  {
472  const Cell argcell = argstr.cell_value ();
473  if (! argcell.iscellstr ())
474  error ("strrep: each element of S must be a string");
475 
476  Cell retcell (argcell.dims ());
477  octave_idx_type nel = argcell.numel ();
478 
479  for (octave_idx_type i = 0; i < nel; i++)
480  {
481  retcell(i) = qs_replace (argcell(i).char_array_value (),
482  pat, rep, table, overlaps);
483  }
484 
485  retval = retcell;
486  }
487  else
488  error ("strrep: S must be a string or cell array of strings");
489  }
490  else if (argpat.iscell () || argrep.iscell ())
491  retval = do_simple_cellfun (Fstrrep, "strrep", args);
492  else
493  error ("strrep: PTN and REP arguments must be strings or cell arrays of strings");
494 
495  return retval;
496 }
497 
498 /*
499 %!assert (strrep ("This is a test string", "is", "&%$"),
500 %! "Th&%$ &%$ a test string")
501 %!assert (strrep ("abababc", "abab", "xyz"), "xyzxyzc")
502 %!assert (strrep ("abababc", "abab", "xyz", "overlaps", false), "xyzabc")
503 %!assert (strrep ({"Hello World"; "Goodbye World"}, "World", "Jane"),
504 %! {"Hello Jane"; "Goodbye Jane"})
505 %!assert (strrep (char ("Hello World", "Goodbye World"), "World", "Jane"),
506 %! char ("Hello Jane", "Goodbye Jane"))
507 
508 %!assert (size (strrep ("a", "a", "")), [0 0])
509 
510 %!error strrep ()
511 %!error strrep ("A")
512 %!error strrep ("A", "B")
513 %!error strrep ("A", "B", "C", "D")
514 %!error strrep ("A", "B", "C", "D", "E", "F")
515 %!error <invalid optional argument> strrep ("A", "B", "C", 3, true)
516 %!error <invalid optional argument> strrep ("A", "B", "C", "str", ones (2,2))
517 %!error <unknown option: foobar> strrep ("A", "B", "C", "foobar", true)
518 %!error <each element of S must be a string> strrep ({"A", 1.0}, "B", "C")
519 %!error <S must be a string or cell array of strings> strrep (1.0, "B", "C")
520 %!error <PTN and REP arguments must be strings> strrep ("A", 1.0, "C")
521 %!error <PTN and REP arguments must be strings> strrep ("A", "B", 1.0)
522 */
523 
524 OCTAVE_END_NAMESPACE(octave)
charNDArray min(char d, const charNDArray &m)
Definition: chNDArray.cc:207
T * fortran_vec()
Size of the specified dimension.
Definition: Array-base.cc:1764
void clear()
Definition: Array-base.cc:109
octave_idx_type rows() const
Definition: Array.h:459
const T * data() const
Size of the specified dimension.
Definition: Array.h:663
octave_idx_type columns() const
Definition: Array.h:471
const dim_vector & dims() const
Return a const-reference so that dims ()(i) works efficiently.
Definition: Array.h:503
octave_idx_type numel() const
Number of elements in the array.
Definition: Array.h:414
Definition: Cell.h:43
bool iscellstr() const
Definition: Cell.cc:126
Definition: dMatrix.h:42
charMatrix extract(octave_idx_type r1, octave_idx_type c1, octave_idx_type r2, octave_idx_type c2) const
Definition: chMatrix.cc:116
charMatrix & insert(const char *s, octave_idx_type r, octave_idx_type c)
Definition: chMatrix.cc:59
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:94
charMatrix char_matrix_value(bool frc_str_conv=false) const
Definition: ov.h:894
Cell cell_value() const
octave_idx_type rows() const
Definition: ov.h:545
bool is_string() const
Definition: ov.h:637
charNDArray char_array_value(bool frc_str_conv=false) const
Definition: ov.h:897
bool isempty() const
Definition: ov.h:601
bool iscell() const
Definition: ov.h:604
OCTAVE_BEGIN_NAMESPACE(octave) static octave_value daspk_fcn
void print_usage(void)
Definition: defun-int.h:72
#define DEFUN(name, args_name, nargout_name, doc)
Macro to define a builtin function.
Definition: defun.h:56
void() error(const char *fmt,...)
Definition: error.cc:988
ColumnVector transform(const Matrix &m, double x, double y, double z)
Definition: graphics.cc:5468
F77_RET_T const F77_DBLE * x
T octave_idx_type m
Definition: mx-inlines.cc:781
octave_idx_type n
Definition: mx-inlines.cc:761
octave_value_list do_simple_cellfun(octave_value_list(*fcn)(const octave_value_list &, int), const char *fcn_name, const octave_value_list &args, int nargout)
Definition: utils.cc:1679
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:44
return octave_value(v1.char_array_value() . concat(v2.char_array_value(), ra_idx),((a1.is_sq_string()||a2.is_sq_string()) ? '\'' :'"'))
octave_value_list Fstrfind(const octave_value_list &args, int)
Definition: strfind.cc:213
#define TABSIZE
Definition: strfind.cc:51
octave_value_list Fstrrep(const octave_value_list &args, int)
Definition: strfind.cc:413
#define ORD(ch)
Definition: strfind.cc:50