00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifdef HAVE_CONFIG_H
00026 #include <config.h>
00027 #endif
00028
00029 #include <list>
00030 #include <sstream>
00031 #include <string>
00032 #include <vector>
00033
00034 #if defined (HAVE_PCRE_H)
00035 #include <pcre.h>
00036 #elif defined (HAVE_PCRE_PCRE_H)
00037 #include <pcre/pcre.h>
00038 #endif
00039
00040 #include "Matrix.h"
00041 #include "base-list.h"
00042 #include "lo-error.h"
00043 #include "oct-locbuf.h"
00044 #include "quit.h"
00045 #include "regexp.h"
00046 #include "str-vec.h"
00047
00048
00049
00050 #define PCRE_MATCHLIMIT_MAX 10
00051
00052
00053 #define MAXLOOKBEHIND 10
00054
00055 static bool lookbehind_warned = false;
00056
00057
00058
00059
00060 void
00061 regexp::free (void)
00062 {
00063 if (data)
00064 pcre_free (static_cast<pcre *> (data));
00065 }
00066
00067 void
00068 regexp::compile_internal (void)
00069 {
00070
00071 free ();
00072
00073 size_t max_length = MAXLOOKBEHIND;
00074
00075 size_t pos = 0;
00076 size_t new_pos;
00077 int inames = 0;
00078 std::ostringstream buf;
00079
00080 while ((new_pos = pattern.find ("(?", pos)) != std::string::npos)
00081 {
00082 if (pattern.at (new_pos + 2) == '<'
00083 && !(pattern.at (new_pos + 3) == '='
00084 || pattern.at (new_pos + 3) == '!'))
00085 {
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095 size_t tmp_pos = pattern.find_first_of ('>', new_pos);
00096
00097 if (tmp_pos == std::string::npos)
00098 {
00099 (*current_liboctave_error_handler)
00100 ("regexp: syntax error in pattern");
00101 return;
00102 }
00103
00104 std::string tmp_name =
00105 pattern.substr (new_pos+3, tmp_pos-new_pos-3);
00106
00107 bool found = false;
00108
00109 for (int i = 0; i < nnames; i++)
00110 {
00111 if (named_pats(i) == tmp_name)
00112 {
00113 named_idx.resize (dim_vector (inames+1, 1));
00114 named_idx(inames) = i;
00115 found = true;
00116 break;
00117 }
00118 }
00119
00120 if (! found)
00121 {
00122 named_idx.resize (dim_vector (inames+1, 1));
00123 named_idx(inames) = nnames;
00124 named_pats.append (tmp_name);
00125 nnames++;
00126 }
00127
00128 if (new_pos - pos > 0)
00129 buf << pattern.substr (pos, new_pos-pos);
00130 if (inames < 10)
00131 buf << "(?P<n00" << inames++;
00132 else if (inames < 100)
00133 buf << "(?P<n0" << inames++;
00134 else
00135 buf << "(?P<n" << inames++;
00136
00137 pos = tmp_pos;
00138 }
00139 else if (pattern.at (new_pos + 2) == '<')
00140 {
00141
00142
00143
00144
00145
00146
00147 int brackets = 1;
00148 size_t tmp_pos1 = new_pos + 2;
00149 size_t tmp_pos2 = tmp_pos1;
00150
00151 while (tmp_pos1 < pattern.length () && brackets > 0)
00152 {
00153 char ch = pattern.at (tmp_pos1);
00154
00155 if (ch == '(')
00156 brackets++;
00157 else if (ch == ')')
00158 {
00159 if (brackets > 1)
00160 tmp_pos2 = tmp_pos1;
00161
00162 brackets--;
00163 }
00164
00165 tmp_pos1++;
00166 }
00167
00168 if (brackets != 0)
00169 {
00170 buf << pattern.substr (pos, new_pos - pos) << "(?";
00171 pos = new_pos + 2;
00172 }
00173 else
00174 {
00175 size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2);
00176
00177 if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1)
00178 {
00179 if (!lookbehind_warned)
00180 {
00181 lookbehind_warned = true;
00182 (*current_liboctave_warning_handler)
00183 ("%s: arbitrary length lookbehind patterns are only supported up to length %d",
00184 who.c_str (), MAXLOOKBEHIND);
00185 }
00186
00187 buf << pattern.substr (pos, new_pos - pos) << "(";
00188
00189 size_t i;
00190
00191 if (pattern.at (tmp_pos3) == '*')
00192 i = 0;
00193 else
00194 i = 1;
00195
00196 for (; i < max_length + 1; i++)
00197 {
00198 buf << pattern.substr (new_pos, tmp_pos3 - new_pos)
00199 << "{" << i << "}";
00200 buf << pattern.substr (tmp_pos3 + 1,
00201 tmp_pos1 - tmp_pos3 - 1);
00202 if (i != max_length)
00203 buf << "|";
00204 }
00205 buf << ")";
00206 }
00207 else
00208 buf << pattern.substr (pos, tmp_pos1 - pos);
00209
00210 pos = tmp_pos1;
00211 }
00212 }
00213 else
00214 {
00215 buf << pattern.substr (pos, new_pos - pos) << "(?";
00216 pos = new_pos + 2;
00217 }
00218
00219 }
00220
00221 buf << pattern.substr (pos);
00222
00223 const char *err;
00224 int erroffset;
00225 std::string buf_str = buf.str ();
00226
00227 int pcre_options
00228 = ((options.case_insensitive () ? PCRE_CASELESS : 0)
00229 | (options.dotexceptnewline () ? 0 : PCRE_DOTALL)
00230 | (options.lineanchors () ? PCRE_MULTILINE : 0)
00231 | (options.freespacing () ? PCRE_EXTENDED : 0));
00232
00233 data = pcre_compile (buf_str.c_str (), pcre_options, &err, &erroffset, 0);
00234
00235 if (! data)
00236 (*current_liboctave_error_handler)
00237 ("%s: %s at position %d of expression", who.c_str (),
00238 err, erroffset);
00239 }
00240
00241 regexp::match_data
00242 regexp::match (const std::string& buffer)
00243 {
00244 regexp::match_data retval;
00245
00246 std::list<regexp::match_element> lst;
00247
00248 int subpatterns;
00249 int namecount;
00250 int nameentrysize;
00251 char *nametable;
00252 size_t idx = 0;
00253
00254 pcre *re = static_cast <pcre *> (data);
00255
00256 pcre_fullinfo (re, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns);
00257 pcre_fullinfo (re, 0, PCRE_INFO_NAMECOUNT, &namecount);
00258 pcre_fullinfo (re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
00259 pcre_fullinfo (re, 0, PCRE_INFO_NAMETABLE, &nametable);
00260
00261 OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3);
00262 OCTAVE_LOCAL_BUFFER (int, nidx, namecount);
00263
00264 for (int i = 0; i < namecount; i++)
00265 {
00266
00267
00268 nidx[i] = (static_cast<int> (nametable[i*nameentrysize])) << 8
00269 | static_cast<int> (nametable[i*nameentrysize+1]);
00270 }
00271
00272 while (true)
00273 {
00274 OCTAVE_QUIT;
00275
00276 int matches = pcre_exec (re, 0, buffer.c_str (),
00277 buffer.length (), idx,
00278 (idx ? PCRE_NOTBOL : 0),
00279 ovector, (subpatterns+1)*3);
00280
00281 if (matches == PCRE_ERROR_MATCHLIMIT)
00282 {
00283
00284
00285 (*current_liboctave_warning_handler)
00286 ("your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow");
00287
00288 pcre_extra pe;
00289
00290 pcre_config (PCRE_CONFIG_MATCH_LIMIT,
00291 static_cast <void *> (&pe.match_limit));
00292
00293 pe.flags = PCRE_EXTRA_MATCH_LIMIT;
00294
00295 int i = 0;
00296 while (matches == PCRE_ERROR_MATCHLIMIT
00297 && i++ < PCRE_MATCHLIMIT_MAX)
00298 {
00299 OCTAVE_QUIT;
00300
00301 pe.match_limit *= 10;
00302 matches = pcre_exec (re, &pe, buffer.c_str (),
00303 buffer.length (), idx,
00304 (idx ? PCRE_NOTBOL : 0),
00305 ovector, (subpatterns+1)*3);
00306 }
00307 }
00308
00309 if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
00310 {
00311 (*current_liboctave_error_handler)
00312 ("%s: internal error calling pcre_exec; error code from pcre_exec is %i",
00313 who.c_str (), matches);
00314 return retval;
00315 }
00316 else if (matches == PCRE_ERROR_NOMATCH)
00317 break;
00318 else if (ovector[1] <= ovector[0])
00319 {
00320
00321 idx = ovector[0] + 1;
00322 if (idx < buffer.length ())
00323 continue;
00324 else
00325 break;
00326 }
00327 else
00328 {
00329 int pos_match = 0;
00330 Matrix token_extents (matches-1, 2);
00331
00332 for (int i = 1; i < matches; i++)
00333 {
00334 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0
00335 && (i == 1 || ovector[2*i] != ovector[2*i-2]
00336 || ovector[2*i-1] != ovector[2*i+1])
00337 && ovector[2*i] >= 0 && ovector[2*i+1] > 0)
00338 {
00339 token_extents(pos_match,0) = double (ovector[2*i]+1);
00340 token_extents(pos_match++,1) = double (ovector[2*i+1]);
00341 }
00342 }
00343
00344 token_extents.resize (pos_match, 2);
00345
00346 double start = double (ovector[0]+1);
00347 double end = double (ovector[1]);
00348
00349 const char **listptr;
00350 int status = pcre_get_substring_list (buffer.c_str (), ovector,
00351 matches, &listptr);
00352
00353 if (status == PCRE_ERROR_NOMEMORY)
00354 {
00355 (*current_liboctave_error_handler)
00356 ("%s: cannot allocate memory in pcre_get_substring_list",
00357 who.c_str ());
00358 return retval;
00359 }
00360
00361 string_vector tokens (pos_match);
00362 string_vector named_tokens (nnames);
00363 int pos_offset = 0;
00364 pos_match = 0;
00365
00366 for (int i = 1; i < matches; i++)
00367 {
00368 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
00369 {
00370 if (i == 1 || ovector[2*i] != ovector[2*i-2]
00371 || ovector[2*i-1] != ovector[2*i+1])
00372 {
00373 if (namecount > 0)
00374 {
00375
00376
00377
00378
00379 for (int j = 0; j < namecount; j++)
00380 {
00381 if (nidx[j] == i)
00382 {
00383 named_tokens(named_idx(j)) =
00384 std::string (*(listptr+i-pos_offset));
00385 break;
00386 }
00387 }
00388 }
00389
00390 tokens(pos_match++) = std::string (*(listptr+i));
00391 }
00392 else
00393 pos_offset++;
00394 }
00395 }
00396
00397 std::string match_string = std::string (*listptr);
00398
00399 pcre_free_substring_list (listptr);
00400
00401 regexp::match_element new_elem (named_tokens, tokens, match_string,
00402 token_extents, start, end);
00403 lst.push_back (new_elem);
00404 idx = ovector[1];
00405
00406 if (options.once () || idx >= buffer.length ())
00407 break;
00408 }
00409 }
00410
00411 retval = regexp::match_data (lst, named_pats);
00412
00413 return retval;
00414 }
00415
00416 bool
00417 regexp::is_match (const std::string& buffer)
00418 {
00419 regexp::match_data rx_lst = match (buffer);
00420
00421 regexp::match_data::const_iterator p = rx_lst.begin ();
00422
00423 std::string match_string = p->match_string ();
00424
00425 return ! match_string.empty ();
00426 }
00427
00428 Array<bool>
00429 regexp::is_match (const string_vector& buffer)
00430 {
00431 octave_idx_type len = buffer.length ();
00432
00433 Array<bool> retval (dim_vector (len, 1));
00434
00435 for (octave_idx_type i = 0; i < buffer.length (); i++)
00436 retval(i) = is_match (buffer(i));
00437
00438 return retval;
00439 }
00440
00441 std::string
00442 regexp::replace (const std::string& buffer, const std::string& replacement)
00443 {
00444 std::string retval;
00445
00446
00447
00448
00449
00450 int tokens = 0;
00451 for (size_t i=1; i < replacement.size (); i++)
00452 {
00453 if (replacement[i-1]=='$' && isdigit (replacement[i]))
00454 {
00455 tokens++;
00456 i++;
00457 }
00458 }
00459 std::vector<int> token (tokens);
00460
00461 int kk = 0;
00462 for (size_t i = 1; i < replacement.size (); i++)
00463 {
00464 if (replacement[i-1]=='$' && isdigit (replacement[i]))
00465 {
00466 token[kk++] = replacement[i]-'0';
00467 i++;
00468 }
00469 }
00470
00471 regexp::match_data rx_lst = match (buffer);
00472
00473 size_t sz = rx_lst.size ();
00474
00475 if (sz == 0)
00476 {
00477 retval = buffer;
00478 return retval;
00479 }
00480
00481 std::string rep;
00482
00483 if (tokens > 0)
00484 {
00485
00486 const size_t replen = replacement.size () - 2*tokens;
00487 int delta = 0;
00488 regexp::match_data::const_iterator p = rx_lst.begin ();
00489 for (size_t i = 0; i < sz; i++)
00490 {
00491 OCTAVE_QUIT;
00492
00493 double start = p->start ();
00494 double end = p->end ();
00495
00496 const Matrix pairs (p->token_extents ());
00497 size_t pairlen = 0;
00498 for (int j = 0; j < tokens; j++)
00499 {
00500 if (token[j] == 0)
00501 pairlen += static_cast<size_t> (end - start) + 1;
00502 else if (token[j] <= pairs.rows ())
00503 pairlen += static_cast<size_t> (pairs(token[j]-1,1)
00504 - pairs(token[j]-1,0)) + 1;
00505 }
00506 delta += (static_cast<int> (replen + pairlen)
00507 - static_cast<int> (end - start + 1));
00508 p++;
00509 }
00510
00511
00512 rep.reserve (buffer.size () + delta);
00513 size_t from = 0;
00514 p = rx_lst.begin ();
00515 for (size_t i = 0; i < sz; i++)
00516 {
00517 OCTAVE_QUIT;
00518
00519 double start = p->start ();
00520 double end = p->end ();
00521
00522 const Matrix pairs (p->token_extents ());
00523 rep.append (&buffer[from], static_cast<size_t> (start - 1) - from);
00524 from = static_cast<size_t> (end - 1) + 1;
00525
00526 for (size_t j = 1; j < replacement.size (); j++)
00527 {
00528 if (replacement[j-1]=='$' && isdigit (replacement[j]))
00529 {
00530 int k = replacement[j]-'0';
00531 if (k == 0)
00532 {
00533
00534 rep.append (&buffer[static_cast<size_t> (end - 1)],
00535 static_cast<size_t> (end - start) + 1);
00536 }
00537 else if (k <= pairs.rows ())
00538 {
00539
00540 rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
00541 static_cast<size_t> (pairs(k-1,1)
00542 - pairs(k-1,0)) + 1);
00543 }
00544 else
00545 {
00546
00547 }
00548 j++;
00549 }
00550 else
00551 rep.append (1, replacement[j-1]);
00552
00553 if (j+1 == replacement.size ())
00554 rep.append (1, replacement[j]);
00555 }
00556 p++;
00557 }
00558 rep.append (&buffer[from], buffer.size () - from);
00559 }
00560 else
00561 {
00562
00563 const size_t replen = replacement.size ();
00564 int delta = 0;
00565 regexp::match_data::const_iterator p = rx_lst.begin ();
00566 for (size_t i = 0; i < sz; i++)
00567 {
00568 OCTAVE_QUIT;
00569 delta += static_cast<int> (replen)
00570 - static_cast<int> (p->end () - p->start () + 1);
00571 p++;
00572 }
00573
00574
00575 rep.reserve (buffer.size () + delta);
00576 size_t from = 0;
00577 p = rx_lst.begin ();
00578 for (size_t i = 0; i < sz; i++)
00579 {
00580 OCTAVE_QUIT;
00581 rep.append (&buffer[from],
00582 static_cast<size_t> (p->start () - 1) - from);
00583 from = static_cast<size_t> (p->end () - 1) + 1;
00584 rep.append (replacement);
00585 p++;
00586 }
00587 rep.append (&buffer[from], buffer.size () - from);
00588 }
00589
00590 retval = rep;
00591 return retval;
00592 }