strfind.cc

Go to the documentation of this file.
00001 /*
00002 
00003 Copyright (C) 2009-2012 Jaroslav Hajek
00004 Copyright (C) 2009-2010 VZLU Prague
00005 
00006 This file is part of Octave.
00007 
00008 Octave is free software; you can redistribute it and/or modify it
00009 under the terms of the GNU General Public License as published by the
00010 Free Software Foundation; either version 3 of the License, or (at your
00011 option) any later version.
00012 
00013 Octave is distributed in the hope that it will be useful, but WITHOUT
00014 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
00015 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
00016 for more details.
00017 
00018 You should have received a copy of the GNU General Public License
00019 along with Octave; see the file COPYING.  If not, see
00020 <http://www.gnu.org/licenses/>.
00021 
00022 */
00023 
00024 #ifdef HAVE_CONFIG_H
00025 #include <config.h>
00026 #endif
00027 
00028 #include <string>
00029 #include <climits>
00030 #include <algorithm>
00031 #include <deque>
00032 
00033 #include "Cell.h"
00034 #include "ov.h"
00035 #include "defun-dld.h"
00036 #include "unwind-prot.h"
00037 #include "gripes.h"
00038 #include "utils.h"
00039 
00040 // This allows safe indexing with char. In C++, char may be (and often is) signed!
00041 #define ORD(ch) static_cast<unsigned char>(ch)
00042 #define TABSIZE (UCHAR_MAX + 1)
00043 
00044 // This is the quick search algorithm, as described at
00045 // http://www-igm.univ-mlv.fr/~lecroq/string/node19.html
00046 static void
00047 qs_preprocess (const Array<char>& needle,
00048                octave_idx_type table[TABSIZE])
00049 {
00050   const char *x = needle.data ();
00051   octave_idx_type m = needle.numel ();
00052 
00053    for (octave_idx_type i = 0; i < TABSIZE; i++)
00054       table[i] = m + 1;
00055    for (octave_idx_type i = 0; i < m; i++)
00056       table[ORD(x[i])] = m - i;
00057 }
00058 
00059 
00060 static Array<octave_idx_type>
00061 qs_search (const Array<char>& needle,
00062            const Array<char>& haystack,
00063            const octave_idx_type table[TABSIZE],
00064            bool overlaps = true)
00065 {
00066   const char *x = needle.data ();
00067   octave_idx_type m = needle.numel ();
00068   const char *y = haystack.data ();
00069   octave_idx_type n = haystack.numel ();
00070 
00071   // We'll use deque because it typically has the most favorable properties for
00072   // the operation we need.
00073   std::deque<octave_idx_type> accum;
00074   if (m == 1)
00075     {
00076       // Looking for a single character.
00077       for (octave_idx_type i = 0; i < n; i++)
00078         {
00079           if (y[i] == x[0])
00080             accum.push_back (i);
00081         }
00082     }
00083   else if (m == 2)
00084     {
00085       // Two characters.
00086       if (overlaps)
00087         {
00088           for (octave_idx_type i = 0; i < n-1; i++)
00089             {
00090               if (y[i] == x[0] && y[i+1] == x[1])
00091                 accum.push_back (i);
00092             }
00093         }
00094       else
00095         {
00096           for (octave_idx_type i = 0; i < n-1; i++)
00097             {
00098               if (y[i] == x[0] && y[i+1] == x[1])
00099                 accum.push_back (i++);
00100             }
00101         }
00102     }
00103   else if (n >= m)
00104     {
00105       // General case.
00106       octave_idx_type j = 0;
00107 
00108       if (overlaps)
00109         {
00110           while (j < n - m)
00111             {
00112               if (std::equal (x, x + m, y + j))
00113                 accum.push_back (j);
00114               j += table[ORD(y[j + m])];
00115             }
00116         }
00117       else
00118         {
00119           while (j < n - m)
00120             {
00121               if (std::equal (x, x + m, y + j))
00122                 {
00123                   accum.push_back (j);
00124                   j += m;
00125                 }
00126               else
00127                 j += table[ORD(y[j + m])];
00128             }
00129         }
00130 
00131       if (j == n - m && std::equal (x, x + m, y + j))
00132         accum.push_back (j);
00133     }
00134 
00135   octave_idx_type nmatch = accum.size ();
00136   octave_idx_type one = 1;
00137   Array<octave_idx_type> result (dim_vector (std::min (one, nmatch), nmatch));
00138   octave_idx_type k = 0;
00139   for (std::deque<octave_idx_type>::const_iterator iter = accum.begin ();
00140        iter != accum.end (); iter++)
00141     {
00142       result.xelem (k++) = *iter;
00143     }
00144 
00145   return result;
00146 }
00147 
00148 DEFUN_DLD (strfind, args, ,
00149   "-*- texinfo -*-\n\
00150 @deftypefn  {Loadable Function} {@var{idx} =} strfind (@var{str}, @var{pattern})\n\
00151 @deftypefnx {Loadable Function} {@var{idx} =} strfind (@var{cellstr}, @var{pattern})\n\
00152 Search for @var{pattern} in the string @var{str} and return the\n\
00153 starting index of every such occurrence in the vector @var{idx}.\n\
00154 If there is no such occurrence, or if @var{pattern} is longer\n\
00155 than @var{str}, then @var{idx} is the empty array @code{[]}.\n\
00156 \n\
00157 If a cell array of strings @var{cellstr} is specified\n\
00158 then @var{idx} is a cell array of vectors, as specified\n\
00159 above.  Examples:\n\
00160 \n\
00161 @example\n\
00162 @group\n\
00163 strfind (\"abababa\", \"aba\")\n\
00164      @result{} [1, 3, 5]\n\
00165 \n\
00166 strfind (@{\"abababa\", \"bebebe\", \"ab\"@}, \"aba\")\n\
00167      @result{} ans =\n\
00168         @{\n\
00169           [1,1] =\n\
00170 \n\
00171              1   3   5\n\
00172 \n\
00173           [1,2] = [](1x0)\n\
00174           [1,3] = [](1x0)\n\
00175         @}\n\
00176 @end group\n\
00177 @end example\n\
00178 @seealso{findstr, strmatch, regexp, regexpi, find}\n\
00179 @end deftypefn")
00180 {
00181   octave_value retval;
00182   int nargin = args.length ();
00183   bool overlaps = true;
00184 
00185   if (nargin == 4 && args(2).is_string () && args(3).is_scalar_type ())
00186     {
00187       std::string opt = args(2).string_value ();
00188       if (opt == "overlaps")
00189         {
00190           overlaps = args(3).bool_value ();
00191           nargin = 2;
00192         }
00193       else
00194         {
00195           error ("strfind: unknown option: %s", opt.c_str ());
00196           return retval;
00197         }
00198     }
00199 
00200   if (nargin == 2)
00201     {
00202       octave_value argstr = args(0), argpat = args(1);
00203       if (argpat.is_string ())
00204         {
00205           Array<char> needle = argpat.char_array_value ();
00206           OCTAVE_LOCAL_BUFFER (octave_idx_type, table, TABSIZE);
00207           qs_preprocess (needle, table);
00208 
00209           if (argstr.is_string ())
00210             retval = octave_value (qs_search (needle, argstr.char_array_value (),
00211                                               table, overlaps),
00212                                    true, true);
00213           else if (argstr.is_cell ())
00214             {
00215               const Cell argsc = argstr.cell_value ();
00216               Cell retc (argsc.dims ());
00217               octave_idx_type ns = argsc.numel ();
00218 
00219               for (octave_idx_type i = 0; i < ns; i++)
00220                 {
00221                   octave_value argse = argsc(i);
00222                   if (argse.is_string ())
00223                     retc(i) = octave_value (qs_search (needle, argse.char_array_value (),
00224                                                        table, overlaps),
00225                                             true, true);
00226                   else
00227                     {
00228                       error ("strfind: each element of CELLSTR must be a string");
00229                       break;
00230                     }
00231                 }
00232 
00233               retval = retc;
00234             }
00235           else
00236             error ("strfind: first argument must be a string or cell array of strings");
00237         }
00238       else if (argpat.is_cell ())
00239         retval = do_simple_cellfun (Fstrfind, "strfind", args);
00240       else
00241         error ("strfind: PATTERN must be a string or cell array of strings");
00242     }
00243   else
00244     print_usage ();
00245 
00246   return retval;
00247 }
00248 
00249 /*
00250 
00251 %!error strfind ();
00252 %!error strfind ("foo", "bar", 1);
00253 %!error strfind ("foo", 100);
00254 %!error strfind (100, "foo");
00255 
00256 %!assert (strfind ("abababa", "aba"), [1, 3, 5]);
00257 %!assert (strfind ("abababa", "aba", "overlaps", false), [1, 5]);
00258 %!assert (strfind ({"abababa", "bla", "bla"}, "a"), {[1, 3, 5, 7], 3, 3});
00259 %!assert (strfind ("Linux _is_ user-friendly. It just isn't ignorant-friendly or idiot-friendly.", "friendly"), [17, 50, 68]);
00260 
00261 */
00262 
00263 static Array<char>
00264 qs_replace (const Array<char>& str, const Array<char>& pat,
00265             const Array<char>& rep,
00266             const octave_idx_type table[TABSIZE],
00267             bool overlaps = true)
00268 {
00269   Array<char> ret = str;
00270 
00271   octave_idx_type siz = str.numel (), psiz = pat.numel (), rsiz = rep.numel ();
00272 
00273   if (psiz != 0)
00274     {
00275       // Look up matches, without overlaps.
00276       const Array<octave_idx_type> idx = qs_search (pat, str, table, overlaps);
00277       octave_idx_type nidx = idx.numel ();
00278 
00279       if (nidx)
00280         {
00281           // Compute result size.
00282           octave_idx_type retsiz;
00283           if (overlaps)
00284             {
00285               retsiz = 0;
00286               // OMG. Is this the "right answer" MW always looks for, or
00287               // someone was just lazy?
00288               octave_idx_type k = 0;
00289               for (octave_idx_type i = 0; i < nidx; i++)
00290                 {
00291                   octave_idx_type j = idx(i);
00292                   if (j >= k)
00293                     retsiz += j - k;
00294                   retsiz += rsiz;
00295                   k = j + psiz;
00296                 }
00297 
00298               retsiz += siz - k;
00299             }
00300           else
00301             retsiz = siz + nidx * (rsiz - psiz);
00302 
00303           ret.clear (dim_vector (1, retsiz));
00304           const char *src = str.data (), *reps = rep.data ();
00305           char *dest = ret.fortran_vec ();
00306 
00307           octave_idx_type k = 0;
00308           for (octave_idx_type i = 0; i < nidx; i++)
00309             {
00310               octave_idx_type j = idx(i);
00311               if (j >= k)
00312                 dest = std::copy (src + k, src + j, dest);
00313               dest = std::copy (reps, reps + rsiz, dest);
00314               k = j + psiz;
00315             }
00316 
00317           std::copy (src + k, src + siz, dest);
00318         }
00319     }
00320 
00321   return ret;
00322 }
00323 
00324 DEFUN_DLD (strrep, args, ,
00325   "-*- texinfo -*-\n\
00326 @deftypefn  {Loadable Function} {} strrep (@var{s}, @var{ptn}, @var{rep})\n\
00327 @deftypefnx {Loadable Function} {} strrep (@var{s}, @var{ptn}, @var{rep}, \"overlaps\", @var{o})\n\
00328 Replace all occurrences of the substring @var{ptn} in the string @var{s}\n\
00329 with the string @var{rep} and return the result.  For example:\n\
00330 \n\
00331 @example\n\
00332 @group\n\
00333 strrep (\"This is a test string\", \"is\", \"&%$\")\n\
00334      @result{} \"Th&%$ &%$ a test string\"\n\
00335 @end group\n\
00336 @end example\n\
00337 \n\
00338 @var{s} may also be a cell array of strings, in which case the replacement is\n\
00339 done for each element and a cell array is returned.\n\
00340 @seealso{regexprep, strfind, findstr}\n\
00341 @end deftypefn")
00342 {
00343   octave_value retval;
00344   int nargin = args.length ();
00345   bool overlaps = true;
00346 
00347   if (nargin == 5 && args(3).is_string () && args(4).is_scalar_type ())
00348     {
00349       std::string opt = args(3).string_value ();
00350       if (opt == "overlaps")
00351         {
00352           overlaps = args(4).bool_value ();
00353           nargin = 3;
00354         }
00355       else
00356         {
00357           error ("strrep: unknown option: %s", opt.c_str ());
00358           return retval;
00359         }
00360     }
00361 
00362   if (nargin == 3)
00363     {
00364       octave_value argstr = args(0), argpat = args(1), argrep = args(2);
00365       if (argpat.is_string () && argrep.is_string ())
00366         {
00367           const Array<char> pat = argpat.char_array_value ();
00368           const Array<char> rep = argrep.char_array_value ();
00369 
00370           OCTAVE_LOCAL_BUFFER (octave_idx_type, table, TABSIZE);
00371           qs_preprocess (pat, table);
00372 
00373           if (argstr.is_string ())
00374             retval = qs_replace (argstr.char_array_value (), pat, rep, table, overlaps);
00375           else if (argstr.is_cell ())
00376             {
00377               const Cell argsc = argstr.cell_value ();
00378               Cell retc (argsc.dims ());
00379               octave_idx_type ns = argsc.numel ();
00380 
00381               for (octave_idx_type i = 0; i < ns; i++)
00382                 {
00383                   octave_value argse = argsc(i);
00384                   if (argse.is_string ())
00385                     retc(i) = qs_replace (argse.char_array_value (), pat, rep, table, overlaps);
00386                   else
00387                     {
00388                       error ("strrep: each element of S must be a string");
00389                       break;
00390                     }
00391                 }
00392 
00393               retval = retc;
00394             }
00395           else
00396             error ("strrep: S must be a string or cell array of strings");
00397         }
00398       else if (argpat.is_cell () || argrep.is_cell ())
00399         retval = do_simple_cellfun (Fstrrep, "strrep", args);
00400       else
00401         error ("strrep: PTN and REP arguments must be strings or cell arrays of strings");
00402     }
00403   else
00404     print_usage ();
00405 
00406   return retval;
00407 }
00408 
00409 /*
00410 
00411 %!assert(strcmp (strrep ("This is a test string", "is", "&%$"),
00412 %! "Th&%$ &%$ a test string"));
00413 %!assert(strrep ("abababc", "abab", "xyz"), "xyzxyzc");
00414 %!assert(strrep ("abababc", "abab", "xyz", "overlaps", false), "xyzabc");
00415 
00416 %!error strrep ();
00417 
00418 %!error strrep ("foo", "bar", 3, 4);
00419 
00420 */
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Defines