GNU Octave 7.1.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
strfind.cc
Go to the documentation of this file.
1////////////////////////////////////////////////////////////////////////
2//
3// Copyright (C) 2009-2022 The Octave Project Developers
4//
5// See the file COPYRIGHT.md in the top-level directory of this
6// distribution or <https://octave.org/copyright/>.
7//
8// This file is part of Octave.
9//
10// Octave is free software: you can redistribute it and/or modify it
11// under the terms of the GNU General Public License as published by
12// the Free Software Foundation, either version 3 of the License, or
13// (at your option) any later version.
14//
15// Octave is distributed in the hope that it will be useful, but
16// WITHOUT ANY WARRANTY; without even the implied warranty of
17// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18// GNU General Public License for more details.
19//
20// You should have received a copy of the GNU General Public License
21// along with Octave; see the file COPYING. If not, see
22// <https://www.gnu.org/licenses/>.
23//
24////////////////////////////////////////////////////////////////////////
25
26#if defined (HAVE_CONFIG_H)
27# include "config.h"
28#endif
29
30#include <algorithm>
31#include <deque>
32#include <limits>
33#include <string>
34
35#include "oct-locbuf.h"
36
37#include "Cell.h"
38#include "builtin-defun-decls.h"
39#include "defun.h"
40#include "errwarn.h"
41#include "error.h"
42#include "ov.h"
43#include "unwind-prot.h"
44#include "utils.h"
45
46OCTAVE_NAMESPACE_BEGIN
47
48// This allows safe indexing with char.
49// In C++, char may be (and often is) signed!
50#define ORD(ch) static_cast<unsigned char>(ch)
51#define TABSIZE (std::numeric_limits<unsigned char>::max () + 1)
52
53// This is the quick search algorithm, as described at
54// http://www-igm.univ-mlv.fr/~lecroq/string/node19.html
55static void
57 octave_idx_type *table)
58{
59 const char *x = needle.data ();
60 octave_idx_type m = needle.numel ();
61
62 for (octave_idx_type i = 0; i < TABSIZE; i++)
63 table[i] = m + 1;
64 for (octave_idx_type i = 0; i < m; i++)
65 table[ORD(x[i])] = m - i;
66}
67
69qs_search (const Array<char>& needle,
70 const Array<char>& haystack,
71 const octave_idx_type *table,
72 bool overlaps = true)
73{
74 const char *x = needle.data ();
75 octave_idx_type m = needle.numel ();
76 const char *y = haystack.data ();
77 octave_idx_type n = haystack.numel ();
78
79 // We'll use deque because it typically has the most favorable properties for
80 // the operation we need.
81 std::deque<octave_idx_type> accum;
82 if (m == 1)
83 {
84 // Looking for a single character.
85 for (octave_idx_type i = 0; i < n; i++)
86 {
87 if (y[i] == x[0])
88 accum.push_back (i);
89 }
90 }
91 else if (m == 2)
92 {
93 // Two characters.
94 if (overlaps)
95 {
96 for (octave_idx_type i = 0; i < n-1; i++)
97 {
98 if (y[i] == x[0] && y[i+1] == x[1])
99 accum.push_back (i);
100 }
101 }
102 else
103 {
104 for (octave_idx_type i = 0; i < n-1; i++)
105 {
106 if (y[i] == x[0] && y[i+1] == x[1])
107 accum.push_back (i++);
108 }
109 }
110 }
111 else if (n >= m)
112 {
113 // General case.
114 octave_idx_type j = 0;
115
116 if (overlaps)
117 {
118 while (j < n - m)
119 {
120 if (std::equal (x, x + m, y + j))
121 accum.push_back (j);
122 j += table[ORD(y[j + m])];
123 }
124 }
125 else
126 {
127 while (j < n - m)
128 {
129 if (std::equal (x, x + m, y + j))
130 {
131 accum.push_back (j);
132 j += m;
133 }
134 else
135 j += table[ORD(y[j + m])];
136 }
137 }
138
139 if (j == n - m && std::equal (x, x + m, y + j))
140 accum.push_back (j);
141 }
142
143 octave_idx_type nmatch = accum.size ();
144 octave_idx_type one = 1;
145 Array<octave_idx_type> result (dim_vector (std::min (one, nmatch), nmatch));
146 octave_idx_type k = 0;
147 for (const auto& idx : accum)
148 result.xelem (k++) = idx;
149
150 return result;
151}
152
153DEFUN (strfind, args, ,
154 doc: /* -*- texinfo -*-
155@deftypefn {} {@var{idx} =} strfind (@var{str}, @var{pattern})
156@deftypefnx {} {@var{idx} =} strfind (@var{cellstr}, @var{pattern})
157@deftypefnx {} {@var{idx} =} strfind (@dots{}, "overlaps", @var{val})
158@deftypefnx {} {@var{idx} =} strfind (@dots{}, "forcecelloutput", @var{val})
159Search for @var{pattern} in the string @var{str} and return the starting
160index of every such occurrence in the vector @var{idx}.
161
162If there is no such occurrence, or if @var{pattern} is longer than
163@var{str}, or if @var{pattern} itself is empty, then @var{idx} is the empty
164array @code{[]}.
165
166The optional argument @qcode{"overlaps"} determines whether the pattern
167can match at every position in @var{str} (true), or only for unique
168occurrences of the complete pattern (false). The default is true.
169
170If a cell array of strings @var{cellstr} is specified then @var{idx} is a
171cell array of vectors, as specified above.
172
173The optional argument @qcode{"forcecelloutput"} forces @var{idx} to be
174returned as a cell array of vectors. The default is false.
175
176Examples:
177
178@example
179@group
180strfind ("abababa", "aba")
181 @result{} [1, 3, 5]
182@end group
183
184@group
185strfind ("abababa", "aba", "overlaps", false)
186 @result{} [1, 5]
187@end group
188
189@group
190strfind (@{"abababa", "bebebe", "ab"@}, "aba")
191 @result{}
192 @{
193 [1,1] =
194
195 1 3 5
196
197 [1,2] = [](1x0)
198 [1,3] = [](1x0)
199 @}
200@end group
201
202@group
203strfind ("abababa", "aba", "forcecelloutput", true)
204 @result{}
205 @{
206 [1,1] =
207
208 1 3 5
209 @}
210@end group
211@end example
212@seealso{regexp, regexpi, find}
213@end deftypefn */)
214{
215 int nargin = args.length ();
216
217 if (nargin != 4 && nargin != 2)
218 print_usage ();
219
220 bool overlaps = true;
221 bool forcecelloutput = false;
222 if (nargin == 4)
223 {
224 if (! args(2).is_string () || ! args(3).is_scalar_type ())
225 error ("strfind: invalid optional arguments");
226
227 std::string opt = args(2).string_value ();
228 std::transform (opt.begin (), opt.end (), opt.begin (), tolower);
229
230 if (opt == "overlaps")
231 overlaps = args(3).bool_value ();
232 else if (opt == "forcecelloutput")
233 forcecelloutput = args(3).bool_value ();
234 else
235 error ("strfind: unknown option: %s", opt.c_str ());
236 }
237
238 octave_value retval;
239
240 octave_value argstr = args(0);
241 octave_value argpat = args(1);
242
243 if (argpat.is_string ())
244 {
245 Array<char> needle = argpat.char_array_value ();
247 qs_preprocess (needle, table);
248
249 if (argstr.is_string ())
250 {
251 if (argpat.isempty ())
252 // Return a null matrix for null pattern for MW compatibility
253 retval = Matrix ();
254 else
255 retval = octave_value (qs_search (needle,
256 argstr.char_array_value (),
257 table, overlaps),
258 true, true);
259 if (forcecelloutput)
260 retval = Cell (retval);
261 }
262 else if (argstr.iscell ())
263 {
264 const Cell argsc = argstr.cell_value ();
265 Cell retc (argsc.dims ());
266 octave_idx_type ns = argsc.numel ();
267
268 for (octave_idx_type i = 0; i < ns; i++)
269 {
270 octave_value argse = argsc(i);
271 if (! argse.is_string ())
272 error ("strfind: each element of CELLSTR must be a string");
273
274 if (argpat.isempty ())
275 retc(i) = Matrix ();
276 else
277 retc(i) = octave_value (qs_search (needle,
278 argse.char_array_value (),
279 table, overlaps),
280 true, true);
281 }
282
283 retval = retc;
284 }
285 else
286 error ("strfind: first argument must be a string or cell array of strings");
287 }
288 else if (argpat.iscell ())
289 retval = do_simple_cellfun (Fstrfind, "strfind", args);
290 else
291 error ("strfind: PATTERN must be a string or cell array of strings");
292
293 return retval;
294}
295
296/*
297%!assert (strfind ("abababa", "aba"), [1, 3, 5])
298%!assert (strfind ("abababa", "aba", "overlaps", false), [1, 5])
299%!assert (strfind ("abababa", "aba", "forcecelloutput", false), [1, 3, 5])
300%!assert (strfind ("abababa", "aba", "forcecelloutput", true), {[1, 3, 5]})
301%!assert (strfind ({"abababa", "bla", "bla"}, "a"), {[1, 3, 5, 7], 3, 3})
302%!assert (strfind ({"abababa", "bla", "bla"}, "a", "forcecelloutput", false), {[1, 3, 5, 7], 3, 3})
303%!assert (strfind ({"abababa", "bla", "bla"}, "a", "forcecelloutput", true), {[1, 3, 5, 7], 3, 3})
304%!assert (strfind ("Linux _is_ user-friendly. It just isn't ignorant-friendly or idiot-friendly.", "friendly"), [17, 50, 68])
305%!assert (strfind ("abc", ""), [])
306%!assert (strfind ("abc", {"", "b", ""}), {[], 2, []})
307%!assert (strfind ({"abc", "def"}, ""), {[], []})
308
309%!error strfind ()
310%!error strfind ("foo", "bar", 1)
311%!error <unknown option: foobar> strfind ("foo", 100, "foobar", 1)
312%!error <each element of CELLSTR must be a string> strfind ({"A", 1}, "foo")
313%!error <first argument must be a string> strfind (100, "foo")
314%!error <PATTERN must be a string> strfind ("foo", 100)
315*/
316
317static Array<char>
318qs_replace (const Array<char>& str, const Array<char>& pat,
319 const Array<char>& rep,
320 const octave_idx_type *table,
321 bool overlaps = true)
322{
323 Array<char> ret = str;
324
325 octave_idx_type siz = str.numel ();
326 octave_idx_type psiz = pat.numel ();
327 octave_idx_type rsiz = rep.numel ();
328
329 if (psiz != 0)
330 {
331 // Look up matches, without overlaps.
332 const Array<octave_idx_type> idx = qs_search (pat, str, table, overlaps);
333 octave_idx_type nidx = idx.numel ();
334
335 if (nidx)
336 {
337 // Compute result size.
338 octave_idx_type retsiz;
339 if (overlaps)
340 {
341 retsiz = 0;
342 // OMG. Is this the "right answer" MW always looks for, or
343 // someone was just lazy?
344 octave_idx_type k = 0;
345 for (octave_idx_type i = 0; i < nidx; i++)
346 {
347 octave_idx_type j = idx(i);
348 if (j >= k)
349 retsiz += j - k;
350 retsiz += rsiz;
351 k = j + psiz;
352 }
353
354 retsiz += siz - k;
355 }
356 else
357 retsiz = siz + nidx * (rsiz - psiz);
358
359 if (retsiz == 0)
360 ret.clear (dim_vector (0, 0));
361 else
362 {
363 ret.clear (dim_vector (1, retsiz));
364 const char *src = str.data ();
365 const char *reps = rep.data ();
366 char *dest = ret.fortran_vec ();
367
368 octave_idx_type k = 0;
369 for (octave_idx_type i = 0; i < nidx; i++)
370 {
371 octave_idx_type j = idx(i);
372 if (j >= k)
373 dest = std::copy (src + k, src + j, dest);
374 dest = std::copy (reps, reps + rsiz, dest);
375 k = j + psiz;
376 }
377
378 std::copy (src + k, src + siz, dest);
379 }
380 }
381 }
382
383 return ret;
384}
385
386DEFUN (strrep, args, ,
387 doc: /* -*- texinfo -*-
388@deftypefn {} {@var{newstr} =} strrep (@var{str}, @var{ptn}, @var{rep})
389@deftypefnx {} {@var{newstr} =} strrep (@var{cellstr}, @var{ptn}, @var{rep})
390@deftypefnx {} {@var{newstr} =} strrep (@dots{}, "overlaps", @var{val})
391Replace all occurrences of the pattern @var{ptn} in the string @var{str}
392with the string @var{rep} and return the result.
393
394The optional argument @qcode{"overlaps"} determines whether the pattern
395can match at every position in @var{str} (true), or only for unique
396occurrences of the complete pattern (false). The default is true.
397
398@var{s} may also be a cell array of strings, in which case the replacement
399is done for each element and a cell array is returned.
400
401Example:
402
403@example
404@group
405strrep ("This is a test string", "is", "&%$")
406 @result{} "Th&%$ &%$ a test string"
407@end group
408@end example
409
410@seealso{regexprep, strfind}
411@end deftypefn */)
412{
413 int nargin = args.length ();
414
415 if (nargin != 3 && nargin != 5)
416 print_usage ();
417
418 bool overlaps = true;
419
420 if (nargin == 5)
421 {
422 if (! args(3).is_string () || ! args(4).is_scalar_type ())
423 error ("strrep: invalid optional argument");
424
425 std::string opt = args(3).string_value ();
426 if (opt != "overlaps")
427 error ("strrep: unknown option: %s", opt.c_str ());
428
429 overlaps = args(4).bool_value ();
430 }
431
432 octave_value retval;
433
434 // Aliasing for better code readability
435 octave_value argstr = args(0);
436 octave_value argpat = args(1);
437 octave_value argrep = args(2);
438
439 if (argpat.is_string () && argrep.is_string ())
440 {
441 const Array<char> pat = argpat.char_array_value ();
442 const Array<char> rep = argrep.char_array_value ();
443
445 qs_preprocess (pat, table);
446
447 if (argstr.is_string ())
448 if (argstr.rows () == 1) // most common case of a single string
449 retval = qs_replace (argstr.char_array_value (), pat, rep,
450 table, overlaps);
451 else
452 {
453 const charMatrix argchm = argstr.char_matrix_value ();
454 octave_idx_type nel = argchm.rows ();
455 octave_idx_type nc = argchm.columns ();
456 charMatrix retchm (nel, 0);
457
458 for (octave_idx_type i = 0; i < nel; i++)
459 {
460 charMatrix rowchm;
461 rowchm = qs_replace (argchm.extract (i, 0, i, nc-1),
462 pat, rep, table, overlaps);
463 retchm.insert (rowchm, i, 0);
464 }
465
466 retval = retchm;
467 }
468 else if (argstr.iscell ())
469 {
470 const Cell argcell = argstr.cell_value ();
471 if (! argcell.iscellstr ())
472 error ("strrep: each element of S must be a string");
473
474 Cell retcell (argcell.dims ());
475 octave_idx_type nel = argcell.numel ();
476
477 for (octave_idx_type i = 0; i < nel; i++)
478 {
479 retcell(i) = qs_replace (argcell(i).char_array_value (),
480 pat, rep, table, overlaps);
481 }
482
483 retval = retcell;
484 }
485 else
486 error ("strrep: S must be a string or cell array of strings");
487 }
488 else if (argpat.iscell () || argrep.iscell ())
489 retval = do_simple_cellfun (Fstrrep, "strrep", args);
490 else
491 error ("strrep: PTN and REP arguments must be strings or cell arrays of strings");
492
493 return retval;
494}
495
496/*
497%!assert (strrep ("This is a test string", "is", "&%$"),
498%! "Th&%$ &%$ a test string")
499%!assert (strrep ("abababc", "abab", "xyz"), "xyzxyzc")
500%!assert (strrep ("abababc", "abab", "xyz", "overlaps", false), "xyzabc")
501%!assert (strrep ({"Hello World"; "Goodbye World"}, "World", "Jane"),
502%! {"Hello Jane"; "Goodbye Jane"})
503%!assert (strrep (char ("Hello World", "Goodbye World"), "World", "Jane"),
504%! char ("Hello Jane", "Goodbye Jane"))
505
506%!assert (size (strrep ("a", "a", "")), [0 0])
507
508%!error strrep ()
509%!error strrep ("A")
510%!error strrep ("A", "B")
511%!error strrep ("A", "B", "C", "D")
512%!error strrep ("A", "B", "C", "D", "E", "F")
513%!error <invalid optional argument> strrep ("A", "B", "C", 3, true)
514%!error <invalid optional argument> strrep ("A", "B", "C", "str", ones (2,2))
515%!error <unknown option: foobar> strrep ("A", "B", "C", "foobar", true)
516%!error <each element of S must be a string> strrep ({"A", 1.0}, "B", "C")
517%!error <S must be a string or cell array of strings> strrep (1.0, "B", "C")
518%!error <PTN and REP arguments must be strings> strrep ("A", 1.0, "C")
519%!error <PTN and REP arguments must be strings> strrep ("A", "B", 1.0)
520*/
521
522OCTAVE_NAMESPACE_END
charNDArray min(char d, const charNDArray &m)
Definition: chNDArray.cc:207
T & xelem(octave_idx_type n)
Size of the specified dimension.
Definition: Array.h:504
OCTARRAY_API void clear(void)
Definition: Array.cc:87
octave_idx_type numel(void) const
Number of elements in the array.
Definition: Array.h:411
const dim_vector & dims(void) const
Return a const-reference so that dims ()(i) works efficiently.
Definition: Array.h:487
octave_idx_type rows(void) const
Definition: Array.h:449
octave_idx_type columns(void) const
Definition: Array.h:458
const T * data(void) const
Size of the specified dimension.
Definition: Array.h:616
OCTARRAY_API T * fortran_vec(void)
Size of the specified dimension.
Definition: Array.cc:1744
Definition: Cell.h:43
bool iscellstr(void) const
Definition: Cell.cc:126
OCTAVE_API charMatrix extract(octave_idx_type r1, octave_idx_type c1, octave_idx_type r2, octave_idx_type c2) const
Definition: chMatrix.cc:116
OCTAVE_API charMatrix & insert(const char *s, octave_idx_type r, octave_idx_type c)
Definition: chMatrix.cc:59
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:94
bool iscell(void) const
Definition: ov.h:649
charMatrix char_matrix_value(bool frc_str_conv=false) const
Definition: ov.h:939
octave_idx_type rows(void) const
Definition: ov.h:590
bool is_string(void) const
Definition: ov.h:682
charNDArray char_array_value(bool frc_str_conv=false) const
Definition: ov.h:942
Cell cell_value(void) const
bool isempty(void) const
Definition: ov.h:646
OCTINTERP_API void print_usage(void)
Definition: defun-int.h:72
#define DEFUN(name, args_name, nargout_name, doc)
Macro to define a builtin function.
Definition: defun.h:56
void error(const char *fmt,...)
Definition: error.cc:980
ColumnVector transform(const Matrix &m, double x, double y, double z)
Definition: graphics.cc:5861
F77_RET_T const F77_DBLE * x
class OCTAVE_API Matrix
Definition: mx-fwd.h:31
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:44
return octave_value(v1.char_array_value() . concat(v2.char_array_value(), ra_idx),((a1.is_sq_string()||a2.is_sq_string()) ? '\'' :'"'))
static Array< char > qs_replace(const Array< char > &str, const Array< char > &pat, const Array< char > &rep, const octave_idx_type *table, bool overlaps=true)
Definition: strfind.cc:318
OCTAVE_EXPORT octave_value_list Fstrfind(const octave_value_list &args, int)
Definition: strfind.cc:213
#define TABSIZE
Definition: strfind.cc:51
static Array< octave_idx_type > qs_search(const Array< char > &needle, const Array< char > &haystack, const octave_idx_type *table, bool overlaps=true)
Definition: strfind.cc:69
#define ORD(ch)
Definition: strfind.cc:50
OCTAVE_EXPORT octave_value_list Fstrrep(const octave_value_list &args, int)
Definition: strfind.cc:411
static void qs_preprocess(const Array< char > &needle, octave_idx_type *table)
Definition: strfind.cc:56
octave_value_list do_simple_cellfun(octave_value_list(*fun)(const octave_value_list &, int), const char *fun_name, const octave_value_list &args, int nargout)
Definition: utils.cc:1688