GNU Octave  6.2.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
strfind.cc
Go to the documentation of this file.
1 ////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (C) 2009-2021 The Octave Project Developers
4 //
5 // See the file COPYRIGHT.md in the top-level directory of this
6 // distribution or <https://octave.org/copyright/>.
7 //
8 // This file is part of Octave.
9 //
10 // Octave is free software: you can redistribute it and/or modify it
11 // under the terms of the GNU General Public License as published by
12 // the Free Software Foundation, either version 3 of the License, or
13 // (at your option) any later version.
14 //
15 // Octave is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 // GNU General Public License for more details.
19 //
20 // You should have received a copy of the GNU General Public License
21 // along with Octave; see the file COPYING. If not, see
22 // <https://www.gnu.org/licenses/>.
23 //
24 ////////////////////////////////////////////////////////////////////////
25 
26 #if defined (HAVE_CONFIG_H)
27 # include "config.h"
28 #endif
29 
30 #include <algorithm>
31 #include <deque>
32 #include <limits>
33 #include <string>
34 
35 #include "oct-locbuf.h"
36 
37 #include "Cell.h"
38 #include "builtin-defun-decls.h"
39 #include "defun.h"
40 #include "errwarn.h"
41 #include "error.h"
42 #include "ov.h"
43 #include "unwind-prot.h"
44 #include "utils.h"
45 
46 // This allows safe indexing with char.
47 // In C++, char may be (and often is) signed!
48 #define ORD(ch) static_cast<unsigned char>(ch)
49 #define TABSIZE (std::numeric_limits<unsigned char>::max () + 1)
50 
51 // This is the quick search algorithm, as described at
52 // http://www-igm.univ-mlv.fr/~lecroq/string/node19.html
53 static void
54 qs_preprocess (const Array<char>& needle,
55  octave_idx_type *table)
56 {
57  const char *x = needle.data ();
58  octave_idx_type m = needle.numel ();
59 
60  for (octave_idx_type i = 0; i < TABSIZE; i++)
61  table[i] = m + 1;
62  for (octave_idx_type i = 0; i < m; i++)
63  table[ORD(x[i])] = m - i;
64 }
65 
67 qs_search (const Array<char>& needle,
68  const Array<char>& haystack,
69  const octave_idx_type *table,
70  bool overlaps = true)
71 {
72  const char *x = needle.data ();
73  octave_idx_type m = needle.numel ();
74  const char *y = haystack.data ();
75  octave_idx_type n = haystack.numel ();
76 
77  // We'll use deque because it typically has the most favorable properties for
78  // the operation we need.
79  std::deque<octave_idx_type> accum;
80  if (m == 1)
81  {
82  // Looking for a single character.
83  for (octave_idx_type i = 0; i < n; i++)
84  {
85  if (y[i] == x[0])
86  accum.push_back (i);
87  }
88  }
89  else if (m == 2)
90  {
91  // Two characters.
92  if (overlaps)
93  {
94  for (octave_idx_type i = 0; i < n-1; i++)
95  {
96  if (y[i] == x[0] && y[i+1] == x[1])
97  accum.push_back (i);
98  }
99  }
100  else
101  {
102  for (octave_idx_type i = 0; i < n-1; i++)
103  {
104  if (y[i] == x[0] && y[i+1] == x[1])
105  accum.push_back (i++);
106  }
107  }
108  }
109  else if (n >= m)
110  {
111  // General case.
112  octave_idx_type j = 0;
113 
114  if (overlaps)
115  {
116  while (j < n - m)
117  {
118  if (std::equal (x, x + m, y + j))
119  accum.push_back (j);
120  j += table[ORD(y[j + m])];
121  }
122  }
123  else
124  {
125  while (j < n - m)
126  {
127  if (std::equal (x, x + m, y + j))
128  {
129  accum.push_back (j);
130  j += m;
131  }
132  else
133  j += table[ORD(y[j + m])];
134  }
135  }
136 
137  if (j == n - m && std::equal (x, x + m, y + j))
138  accum.push_back (j);
139  }
140 
141  octave_idx_type nmatch = accum.size ();
142  octave_idx_type one = 1;
143  Array<octave_idx_type> result (dim_vector (std::min (one, nmatch), nmatch));
144  octave_idx_type k = 0;
145  for (const auto& idx : accum)
146  result.xelem (k++) = idx;
147 
148  return result;
149 }
150 
151 DEFUN (strfind, args, ,
152  doc: /* -*- texinfo -*-
153 @deftypefn {} {@var{idx} =} strfind (@var{str}, @var{pattern})
154 @deftypefnx {} {@var{idx} =} strfind (@var{cellstr}, @var{pattern})
155 @deftypefnx {} {@var{idx} =} strfind (@dots{}, "overlaps", @var{val})
156 @deftypefnx {} {@var{idx} =} strfind (@dots{}, "forcecelloutput", @var{val})
157 Search for @var{pattern} in the string @var{str} and return the starting
158 index of every such occurrence in the vector @var{idx}.
159 
160 If there is no such occurrence, or if @var{pattern} is longer than
161 @var{str}, or if @var{pattern} itself is empty, then @var{idx} is the empty
162 array @code{[]}.
163 
164 The optional argument @qcode{"overlaps"} determines whether the pattern
165 can match at every position in @var{str} (true), or only for unique
166 occurrences of the complete pattern (false). The default is true.
167 
168 If a cell array of strings @var{cellstr} is specified then @var{idx} is a
169 cell array of vectors, as specified above.
170 
171 The optional argument @qcode{"forcecelloutput"} forces @var{idx} to be
172 returned as a cell array of vectors. The default is false.
173 
174 Examples:
175 
176 @example
177 @group
178 strfind ("abababa", "aba")
179  @result{} [1, 3, 5]
180 @end group
181 
182 @group
183 strfind ("abababa", "aba", "overlaps", false)
184  @result{} [1, 5]
185 @end group
186 
187 @group
188 strfind (@{"abababa", "bebebe", "ab"@}, "aba")
189  @result{}
190  @{
191  [1,1] =
192 
193  1 3 5
194 
195  [1,2] = [](1x0)
196  [1,3] = [](1x0)
197  @}
198 @end group
199 
200 @group
201 strfind ("abababa", "aba", "forcecelloutput", true)
202  @result{}
203  @{
204  [1,1] =
205 
206  1 3 5
207  @}
208 @end group
209 @end example
210 @seealso{regexp, regexpi, find}
211 @end deftypefn */)
212 {
213  int nargin = args.length ();
214 
215  if (nargin != 4 && nargin != 2)
216  print_usage ();
217 
218  bool overlaps = true;
219  bool forcecelloutput = false;
220  if (nargin == 4)
221  {
222  if (! args(2).is_string () || ! args(3).is_scalar_type ())
223  error ("strfind: invalid optional arguments");
224 
225  std::string opt = args(2).string_value ();
226  std::transform (opt.begin (), opt.end (), opt.begin (), tolower);
227 
228  if (opt == "overlaps")
229  overlaps = args(3).bool_value ();
230  else if (opt == "forcecelloutput")
231  forcecelloutput = args(3).bool_value ();
232  else
233  error ("strfind: unknown option: %s", opt.c_str ());
234  }
235 
237 
238  octave_value argstr = args(0);
239  octave_value argpat = args(1);
240 
241  if (argpat.is_string ())
242  {
243  Array<char> needle = argpat.char_array_value ();
245  qs_preprocess (needle, table);
246 
247  if (argstr.is_string ())
248  {
249  if (argpat.isempty ())
250  // Return a null matrix for null pattern for MW compatibility
251  retval = Matrix ();
252  else
253  retval = octave_value (qs_search (needle,
254  argstr.char_array_value (),
255  table, overlaps),
256  true, true);
257  if (forcecelloutput)
258  retval = Cell (retval);
259  }
260  else if (argstr.iscell ())
261  {
262  const Cell argsc = argstr.cell_value ();
263  Cell retc (argsc.dims ());
264  octave_idx_type ns = argsc.numel ();
265 
266  for (octave_idx_type i = 0; i < ns; i++)
267  {
268  octave_value argse = argsc(i);
269  if (! argse.is_string ())
270  error ("strfind: each element of CELLSTR must be a string");
271 
272  if (argpat.isempty ())
273  retc(i) = Matrix ();
274  else
275  retc(i) = octave_value (qs_search (needle,
276  argse.char_array_value (),
277  table, overlaps),
278  true, true);
279  }
280 
281  retval = retc;
282  }
283  else
284  error ("strfind: first argument must be a string or cell array of strings");
285  }
286  else if (argpat.iscell ())
287  retval = octave::do_simple_cellfun (Fstrfind, "strfind", args);
288  else
289  error ("strfind: PATTERN must be a string or cell array of strings");
290 
291  return retval;
292 }
293 
294 /*
295 %!assert (strfind ("abababa", "aba"), [1, 3, 5])
296 %!assert (strfind ("abababa", "aba", "overlaps", false), [1, 5])
297 %!assert (strfind ("abababa", "aba", "forcecelloutput", false), [1, 3, 5])
298 %!assert (strfind ("abababa", "aba", "forcecelloutput", true), {[1, 3, 5]})
299 %!assert (strfind ({"abababa", "bla", "bla"}, "a"), {[1, 3, 5, 7], 3, 3})
300 %!assert (strfind ({"abababa", "bla", "bla"}, "a", "forcecelloutput", false), {[1, 3, 5, 7], 3, 3})
301 %!assert (strfind ({"abababa", "bla", "bla"}, "a", "forcecelloutput", true), {[1, 3, 5, 7], 3, 3})
302 %!assert (strfind ("Linux _is_ user-friendly. It just isn't ignorant-friendly or idiot-friendly.", "friendly"), [17, 50, 68])
303 %!assert (strfind ("abc", ""), [])
304 %!assert (strfind ("abc", {"", "b", ""}), {[], 2, []})
305 %!assert (strfind ({"abc", "def"}, ""), {[], []})
306 
307 %!error strfind ()
308 %!error strfind ("foo", "bar", 1)
309 %!error <unknown option: foobar> strfind ("foo", 100, "foobar", 1)
310 %!error <each element of CELLSTR must be a string> strfind ({"A", 1}, "foo")
311 %!error <first argument must be a string> strfind (100, "foo")
312 %!error <PATTERN must be a string> strfind ("foo", 100)
313 */
314 
315 static Array<char>
316 qs_replace (const Array<char>& str, const Array<char>& pat,
317  const Array<char>& rep,
318  const octave_idx_type *table,
319  bool overlaps = true)
320 {
321  Array<char> ret = str;
322 
323  octave_idx_type siz = str.numel ();
324  octave_idx_type psiz = pat.numel ();
325  octave_idx_type rsiz = rep.numel ();
326 
327  if (psiz != 0)
328  {
329  // Look up matches, without overlaps.
330  const Array<octave_idx_type> idx = qs_search (pat, str, table, overlaps);
331  octave_idx_type nidx = idx.numel ();
332 
333  if (nidx)
334  {
335  // Compute result size.
336  octave_idx_type retsiz;
337  if (overlaps)
338  {
339  retsiz = 0;
340  // OMG. Is this the "right answer" MW always looks for, or
341  // someone was just lazy?
342  octave_idx_type k = 0;
343  for (octave_idx_type i = 0; i < nidx; i++)
344  {
345  octave_idx_type j = idx(i);
346  if (j >= k)
347  retsiz += j - k;
348  retsiz += rsiz;
349  k = j + psiz;
350  }
351 
352  retsiz += siz - k;
353  }
354  else
355  retsiz = siz + nidx * (rsiz - psiz);
356 
357  if (retsiz == 0)
358  ret.clear (dim_vector (0, 0));
359  else
360  {
361  ret.clear (dim_vector (1, retsiz));
362  const char *src = str.data ();
363  const char *reps = rep.data ();
364  char *dest = ret.fortran_vec ();
365 
366  octave_idx_type k = 0;
367  for (octave_idx_type i = 0; i < nidx; i++)
368  {
369  octave_idx_type j = idx(i);
370  if (j >= k)
371  dest = std::copy (src + k, src + j, dest);
372  dest = std::copy (reps, reps + rsiz, dest);
373  k = j + psiz;
374  }
375 
376  std::copy (src + k, src + siz, dest);
377  }
378  }
379  }
380 
381  return ret;
382 }
383 
384 DEFUN (strrep, args, ,
385  doc: /* -*- texinfo -*-
386 @deftypefn {} {@var{newstr} =} strrep (@var{str}, @var{ptn}, @var{rep})
387 @deftypefnx {} {@var{newstr} =} strrep (@var{cellstr}, @var{ptn}, @var{rep})
388 @deftypefnx {} {@var{newstr} =} strrep (@dots{}, "overlaps", @var{val})
389 Replace all occurrences of the pattern @var{ptn} in the string @var{str}
390 with the string @var{rep} and return the result.
391 
392 The optional argument @qcode{"overlaps"} determines whether the pattern
393 can match at every position in @var{str} (true), or only for unique
394 occurrences of the complete pattern (false). The default is true.
395 
396 @var{s} may also be a cell array of strings, in which case the replacement
397 is done for each element and a cell array is returned.
398 
399 Example:
400 
401 @example
402 @group
403 strrep ("This is a test string", "is", "&%$")
404  @result{} "Th&%$ &%$ a test string"
405 @end group
406 @end example
407 
408 @seealso{regexprep, strfind}
409 @end deftypefn */)
410 {
411  int nargin = args.length ();
412 
413  if (nargin != 3 && nargin != 5)
414  print_usage ();
415 
416  bool overlaps = true;
417 
418  if (nargin == 5)
419  {
420  if (! args(3).is_string () || ! args(4).is_scalar_type ())
421  error ("strrep: invalid optional arguments");
422 
423  std::string opt = args(3).string_value ();
424  if (opt == "overlaps")
425  overlaps = args(4).bool_value ();
426  else
427  error ("strrep: unknown option: %s", opt.c_str ());
428  }
429 
431 
432  octave_value argstr = args(0);
433  octave_value argpat = args(1);
434  octave_value argrep = args(2);
435 
436  if (argpat.is_string () && argrep.is_string ())
437  {
438  const Array<char> pat = argpat.char_array_value ();
439  const Array<char> rep = argrep.char_array_value ();
440 
442  qs_preprocess (pat, table);
443 
444  if (argstr.is_string ())
445  retval = qs_replace (argstr.char_array_value (), pat, rep,
446  table, overlaps);
447  else if (argstr.iscell ())
448  {
449  const Cell argsc = argstr.cell_value ();
450  Cell retc (argsc.dims ());
451  octave_idx_type ns = argsc.numel ();
452 
453  for (octave_idx_type i = 0; i < ns; i++)
454  {
455  octave_value argse = argsc(i);
456  if (argse.is_string ())
457  retc(i) = qs_replace (argse.char_array_value (), pat, rep,
458  table, overlaps);
459  else
460  error ("strrep: each element of S must be a string");
461  }
462 
463  retval = retc;
464  }
465  else
466  error ("strrep: S must be a string or cell array of strings");
467  }
468  else if (argpat.iscell () || argrep.iscell ())
469  retval = octave::do_simple_cellfun (Fstrrep, "strrep", args);
470  else
471  error ("strrep: PTN and REP arguments must be strings or cell arrays of strings");
472 
473  return retval;
474 }
475 
476 /*
477 %!assert (strrep ("This is a test string", "is", "&%$"),
478 %! "Th&%$ &%$ a test string")
479 %!assert (strrep ("abababc", "abab", "xyz"), "xyzxyzc")
480 %!assert (strrep ("abababc", "abab", "xyz", "overlaps", false), "xyzabc")
481 
482 %!assert (size (strrep ("a", "a", "")), [0 0])
483 
484 %!error strrep ()
485 %!error strrep ("foo", "bar", 3, 4)
486 */
charNDArray min(char d, const charNDArray &m)
Definition: chNDArray.cc:207
T & xelem(octave_idx_type n)
Size of the specified dimension.
Definition: Array.h:469
octave_idx_type numel(void) const
Number of elements in the array.
Definition: Array.h:377
const T * data(void) const
Size of the specified dimension.
Definition: Array.h:581
void clear(void)
Definition: Array.cc:87
const dim_vector & dims(void) const
Return a const-reference so that dims ()(i) works efficiently.
Definition: Array.h:453
const T * fortran_vec(void) const
Size of the specified dimension.
Definition: Array.h:583
Definition: Cell.h:43
Definition: dMatrix.h:42
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:95
bool iscell(void) const
Definition: ov.h:560
bool is_string(void) const
Definition: ov.h:593
charNDArray char_array_value(bool frc_str_conv=false) const
Definition: ov.h:850
Cell cell_value(void) const
bool isempty(void) const
Definition: ov.h:557
OCTINTERP_API void print_usage(void)
Definition: defun.cc:53
#define DEFUN(name, args_name, nargout_name, doc)
Macro to define a builtin function.
Definition: defun.h:56
void error(const char *fmt,...)
Definition: error.cc:968
ColumnVector transform(const Matrix &m, double x, double y, double z)
Definition: graphics.cc:5814
F77_RET_T const F77_DBLE * x
T octave_idx_type m
Definition: mx-inlines.cc:773
octave_idx_type n
Definition: mx-inlines.cc:753
octave_value_list do_simple_cellfun(octave_value_list(*fun)(const octave_value_list &, int), const char *fun_name, const octave_value_list &args, int nargout)
Definition: utils.cc:1549
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:44
return octave_value(v1.char_array_value() . concat(v2.char_array_value(), ra_idx),((a1.is_sq_string()||a2.is_sq_string()) ? '\'' :'"'))
octave_value::octave_value(const Array< char > &chm, char type) return retval
Definition: ov.cc:811
OCTAVE_EXPORT octave_value_list Fstrfind(const octave_value_list &args, int)
Definition: strfind.cc:211
#define TABSIZE
Definition: strfind.cc:49
#define ORD(ch)
Definition: strfind.cc:48
OCTAVE_EXPORT octave_value_list Fstrrep(const octave_value_list &args, int)
Definition: strfind.cc:409
static Array< octave_idx_type > qs_search(const Array< char > &needle, const Array< char > &haystack, const octave_idx_type *table, bool overlaps=true)
Definition: strfind.cc:67
static Array< char > qs_replace(const Array< char > &str, const Array< char > &pat, const Array< char > &rep, const octave_idx_type *table, bool overlaps=true)
Definition: strfind.cc:316
static void qs_preprocess(const Array< char > &needle, octave_idx_type *table)
Definition: strfind.cc:54