GNU Octave 7.1.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
lo-regexp.cc
Go to the documentation of this file.
1////////////////////////////////////////////////////////////////////////
2//
3// Copyright (C) 2002-2022 The Octave Project Developers
4//
5// See the file COPYRIGHT.md in the top-level directory of this
6// distribution or <https://octave.org/copyright/>.
7//
8// This file is part of Octave.
9//
10// Octave is free software: you can redistribute it and/or modify it
11// under the terms of the GNU General Public License as published by
12// the Free Software Foundation, either version 3 of the License, or
13// (at your option) any later version.
14//
15// Octave is distributed in the hope that it will be useful, but
16// WITHOUT ANY WARRANTY; without even the implied warranty of
17// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18// GNU General Public License for more details.
19//
20// You should have received a copy of the GNU General Public License
21// along with Octave; see the file COPYING. If not, see
22// <https://www.gnu.org/licenses/>.
23//
24////////////////////////////////////////////////////////////////////////
25
26#if defined (HAVE_CONFIG_H)
27# include "config.h"
28#endif
29
30#include <list>
31#include <sstream>
32#include <string>
33#include <vector>
34
35#if defined (HAVE_PCRE_H)
36# include <pcre.h>
37#elif defined (HAVE_PCRE_PCRE_H)
38# include <pcre/pcre.h>
39#endif
40
41#include "Matrix.h"
42#include "base-list.h"
43#include "lo-error.h"
44#include "oct-locbuf.h"
45#include "quit.h"
46#include "lo-regexp.h"
47#include "str-vec.h"
48#include "unistr-wrappers.h"
49
50namespace octave
51{
52 // Define the maximum number of retries for a pattern
53 // that possibly results in an infinite recursion.
54#define PCRE_MATCHLIMIT_MAX 10
55
56 // FIXME: should this be configurable?
57#define MAXLOOKBEHIND 10
58
59 static bool lookbehind_warned = false;
60
61 // FIXME: don't bother collecting and composing return values
62 // the user doesn't want.
63
64 void
66 {
67 if (m_data)
68 pcre_free (static_cast<pcre *> (m_data));
69 }
70
71 void
73 {
74 // If we had a previously compiled pattern, release it.
75 free ();
76
77 std::size_t max_length = MAXLOOKBEHIND;
78
79 std::size_t pos = 0;
80 std::size_t new_pos;
81 int inames = 0;
82 std::ostringstream buf;
83
84 while ((new_pos = m_pattern.find ("(?", pos)) != std::string::npos)
85 {
86 if (m_pattern.at (new_pos + 2) == '<'
87 && !(m_pattern.at (new_pos + 3) == '='
88 || m_pattern.at (new_pos + 3) == '!'))
89 {
90 // The syntax of named tokens in pcre is "(?P<name>...)" while
91 // we need a syntax "(?<name>...)", so fix that here. Also an
92 // expression like
93 // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)"
94 // should be perfectly legal, while pcre does not allow the same
95 // named token name on both sides of the alternative. Also fix
96 // that here by replacing name tokens by dummy names, and dealing
97 // with the dummy names later.
98
99 std::size_t tmp_pos = m_pattern.find_first_of ('>', new_pos);
100
101 if (tmp_pos == std::string::npos)
102 (*current_liboctave_error_handler)
103 ("regexp: syntax error in pattern");
104
105 std::string tmp_name
106 = m_pattern.substr (new_pos+3, tmp_pos-new_pos-3);
107
108 bool found = false;
109
110 for (int i = 0; i < m_names; i++)
111 {
112 if (m_named_pats(i) == tmp_name)
113 {
114 m_named_idx.resize (dim_vector (inames+1, 1));
115 m_named_idx(inames) = i;
116 found = true;
117 break;
118 }
119 }
120
121 if (! found)
122 {
123 m_named_idx.resize (dim_vector (inames+1, 1));
124 m_named_idx(inames) = m_names;
125 m_named_pats.append (tmp_name);
126 m_names++;
127 }
128
129 if (new_pos - pos > 0)
130 buf << m_pattern.substr (pos, new_pos-pos);
131 if (inames < 10)
132 buf << "(?P<n00" << inames++;
133 else if (inames < 100)
134 buf << "(?P<n0" << inames++;
135 else
136 buf << "(?P<n" << inames++;
137
138 pos = tmp_pos;
139 }
140 else if (m_pattern.at (new_pos + 2) == '<')
141 {
142 // Find lookbehind operators of arbitrary length (ie like
143 // "(?<=[a-z]*)") and replace with a maximum length operator
144 // as PCRE can not yet handle arbitrary length lookahead
145 // operators. Use the string length as the maximum length to
146 // avoid issues.
147
148 int brackets = 1;
149 std::size_t tmp_pos1 = new_pos + 2;
150 std::size_t tmp_pos2 = tmp_pos1;
151
152 while (tmp_pos1 < m_pattern.length () && brackets > 0)
153 {
154 char ch = m_pattern.at (tmp_pos1);
155
156 if (ch == '(')
157 brackets++;
158 else if (ch == ')')
159 {
160 if (brackets > 1)
161 tmp_pos2 = tmp_pos1;
162
163 brackets--;
164 }
165
166 tmp_pos1++;
167 }
168
169 if (brackets != 0)
170 {
171 buf << m_pattern.substr (pos, new_pos - pos) << "(?";
172 pos = new_pos + 2;
173 }
174 else
175 {
176 std::size_t tmp_pos3 = m_pattern.find_first_of ("*+", tmp_pos2);
177
178 if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1)
179 {
180 if (! lookbehind_warned)
181 {
182 lookbehind_warned = true;
183 (*current_liboctave_warning_with_id_handler)
184 ("Octave:regexp-lookbehind-limit",
185 "%s: arbitrary length lookbehind patterns are only supported up to length %d",
186 m_who.c_str (), MAXLOOKBEHIND);
187 }
188
189 buf << m_pattern.substr (pos, new_pos - pos) << '(';
190
191 std::size_t i;
192
193 if (m_pattern.at (tmp_pos3) == '*')
194 i = 0;
195 else
196 i = 1;
197
198 for (; i < max_length + 1; i++)
199 {
200 buf << m_pattern.substr (new_pos, tmp_pos3 - new_pos)
201 << '{' << i << '}';
202 buf << m_pattern.substr (tmp_pos3 + 1,
203 tmp_pos1 - tmp_pos3 - 1);
204 if (i != max_length)
205 buf << '|';
206 }
207 buf << ')';
208 }
209 else
210 buf << m_pattern.substr (pos, tmp_pos1 - pos);
211
212 pos = tmp_pos1;
213 }
214 }
215 else
216 {
217 buf << m_pattern.substr (pos, new_pos - pos) << "(?";
218 pos = new_pos + 2;
219 }
220
221 }
222
223 buf << m_pattern.substr (pos);
224
225 // Replace NULLs with escape sequence because conversion function c_str()
226 // will terminate string early at embedded NULLs.
227 std::string buf_str = buf.str ();
228 while ((pos = buf_str.find ('\0')) != std::string::npos)
229 buf_str.replace (pos, 1, "\\000");
230
231 const char *err;
232 int erroffset;
233
234 int pcre_options
235 = ( (m_options.case_insensitive () ? PCRE_CASELESS : 0)
236 | (m_options.dotexceptnewline () ? 0 : PCRE_DOTALL)
237 | (m_options.lineanchors () ? PCRE_MULTILINE : 0)
238 | (m_options.freespacing () ? PCRE_EXTENDED : 0)
239 | PCRE_UTF8);
240
241 m_data = pcre_compile (buf_str.c_str (), pcre_options,
242 &err, &erroffset, nullptr);
243
244 if (! m_data)
245 (*current_liboctave_error_handler)
246 ("%s: %s at position %d of expression", m_who.c_str (), err, erroffset);
247 }
248
250 regexp::match (const std::string& buffer) const
251 {
252 // check if input is valid utf-8
253 const uint8_t *buf_str = reinterpret_cast<const uint8_t *> (buffer.c_str ());
254 if (octave_u8_check_wrapper (buf_str, buffer.length ()))
256 ("%s: the input string is invalid UTF-8", m_who.c_str ());
257
258 regexp::match_data retval;
259
260 std::list<regexp::match_element> lst;
261
262 int subpatterns;
263 int namecount;
264 int nameentrysize;
265 char *nametable;
266 std::size_t idx = 0;
267
268 pcre *re = static_cast<pcre *> (m_data);
269
270 pcre_fullinfo (re, nullptr, PCRE_INFO_CAPTURECOUNT, &subpatterns);
271 pcre_fullinfo (re, nullptr, PCRE_INFO_NAMECOUNT, &namecount);
272 pcre_fullinfo (re, nullptr, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
273 pcre_fullinfo (re, nullptr, PCRE_INFO_NAMETABLE, &nametable);
274
275 OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3);
276 OCTAVE_LOCAL_BUFFER (int, nidx, namecount);
277
278 for (int i = 0; i < namecount; i++)
279 {
280 // Index of subpattern in first two bytes of name (MSB first).
281 // Extract index.
282 nidx[i] = (static_cast<int> (nametable[i*nameentrysize])) << 8
283 | static_cast<int> (nametable[i*nameentrysize+1]);
284 }
285
286 while (true)
287 {
288 octave_quit ();
289
290 int matches = pcre_exec (re, nullptr, buffer.c_str (),
291 buffer.length (), idx,
292 PCRE_NO_UTF8_CHECK | (idx ? PCRE_NOTBOL : 0),
293 ovector, (subpatterns+1)*3);
294
295 if (matches == PCRE_ERROR_MATCHLIMIT)
296 {
297 // Try harder; start with default value for MATCH_LIMIT
298 // and increase it.
299 (*current_liboctave_warning_with_id_handler)
300 ("Octave:regexp-match-limit",
301 "your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow");
302
303 pcre_extra pe;
304
305 pcre_config (PCRE_CONFIG_MATCH_LIMIT,
306 static_cast<void *> (&pe.match_limit));
307
308 pe.flags = PCRE_EXTRA_MATCH_LIMIT;
309
310 int i = 0;
311 while (matches == PCRE_ERROR_MATCHLIMIT
312 && i++ < PCRE_MATCHLIMIT_MAX)
313 {
314 octave_quit ();
315
316 pe.match_limit *= 10;
317 matches = pcre_exec (re, &pe, buffer.c_str (),
318 buffer.length (), idx,
319 PCRE_NO_UTF8_CHECK
320 | (idx ? PCRE_NOTBOL : 0),
321 ovector, (subpatterns+1)*3);
322 }
323 }
324
325 if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
326 (*current_liboctave_error_handler)
327 ("%s: internal error calling pcre_exec; "
328 "error code from pcre_exec is %i", m_who.c_str (), matches);
329
330 if (matches == PCRE_ERROR_NOMATCH)
331 break;
332 else if (ovector[0] >= ovector[1] && ! m_options.emptymatch ())
333 {
334 // Zero length match. Skip to next char.
335 idx = ovector[0] + 1;
336 if (idx < buffer.length ())
337 continue;
338 else
339 break;
340 }
341 else
342 {
343 int pos_match = 0;
344 Matrix token_extents (matches-1, 2);
345
346 for (int i = 1; i < matches; i++)
347 {
348 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0
349 && (i == 1 || ovector[2*i] != ovector[2*i-2]
350 || ovector[2*i-1] != ovector[2*i+1]))
351 {
352 token_extents(pos_match, 0) = double (ovector[2*i]+1);
353 token_extents(pos_match++, 1) = double (ovector[2*i+1]);
354 }
355 }
356
357 token_extents.resize (pos_match, 2);
358
359 double start = double (ovector[0]+1);
360 double end = double (ovector[1]);
361
362 const char **listptr;
363 int status = pcre_get_substring_list (buffer.c_str (), ovector,
364 matches, &listptr);
365
366 if (status == PCRE_ERROR_NOMEMORY)
367 (*current_liboctave_error_handler)
368 ("%s: cannot allocate memory in pcre_get_substring_list",
369 m_who.c_str ());
370
371 // Must use explicit length constructor as match can contain '\0'.
372 std::string match_string = std::string (*listptr, end - start + 1);
373
374 string_vector tokens (pos_match);
375 string_vector named_tokens (m_names);
376 int pos_offset = 0;
377 pos_match = 0;
378
379 for (int i = 1; i < matches; i++)
380 {
381 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
382 {
383 if (i == 1 || ovector[2*i] != ovector[2*i-2]
384 || ovector[2*i-1] != ovector[2*i+1])
385 {
386 if (namecount > 0)
387 {
388 // FIXME: Should probably do this with a map()
389 // rather than a linear search. However,
390 // the number of captured, named expressions
391 // is usually pretty small (< 4)
392 for (int j = 0; j < namecount; j++)
393 {
394 if (nidx[j] == i)
395 {
396 std::size_t len = ovector[2*i+1] - ovector[2*i];
397 named_tokens(m_named_idx(j))
398 = std::string (*(listptr+i-pos_offset),
399 len);
400 break;
401 }
402 }
403 }
404
405 std::size_t len = ovector[2*i+1] - ovector[2*i];
406 tokens(pos_match++) = std::string (*(listptr+i), len);
407 }
408 else
409 pos_offset++;
410 }
411 }
412
413 pcre_free_substring_list (listptr);
414
415 regexp::match_element new_elem (named_tokens, tokens, match_string,
416 token_extents, start, end);
417 lst.push_back (new_elem);
418
419 if (ovector[1] <= ovector[0])
420 {
421 // Zero length match. Skip to next char.
422 idx = ovector[0] + 1;
423 if (idx <= buffer.length ())
424 continue;
425 }
426 else
427 idx = ovector[1];
428
429 if (m_options.once () || idx >= buffer.length ())
430 break;
431 }
432 }
433
434 retval = regexp::match_data (lst, m_named_pats);
435
436 return retval;
437 }
438
439 bool
440 regexp::is_match (const std::string& buffer) const
441 {
442 regexp::match_data rx_lst = match (buffer);
443
444 return rx_lst.size () > 0;
445 }
446
448 regexp::is_match (const string_vector& buffer) const
449 {
450 octave_idx_type len = buffer.numel ();
451
452 Array<bool> retval (dim_vector (len, 1));
453
454 for (octave_idx_type i = 0; i < buffer.numel (); i++)
455 retval(i) = is_match (buffer(i));
456
457 return retval;
458 }
459
460 // Declare rep_token_t used in processing replacement string
462 {
463 std::size_t pos;
464 int num;
465 };
466
467 std::string
468 regexp::replace (const std::string& buffer,
469 const std::string& replacement) const
470 {
471 std::string retval;
472
473 const regexp::match_data rx_lst = match (buffer);
474
475 std::size_t num_matches = rx_lst.size ();
476
477 if (num_matches == 0)
478 {
479 retval = buffer;
480 return retval;
481 }
482
483 // Identify replacement tokens; build a vector of group numbers in
484 // the replacement string so that we can quickly calculate the size
485 // of the replacement.
486
487 // FIXME: All code assumes that only 10 tokens ($0-$9) exist.
488 // $11 represents $1 followed by the character '1' rather than
489 // the eleventh capture buffer.
490
491 std::string repstr = replacement;
492 std::vector<rep_token_t> tokens;
493 tokens.reserve (5); // Reserve memory for 5 pattern replacements
494
495 for (std::size_t i=0; i < repstr.size (); i++)
496 {
497 if (repstr[i] == '\\')
498 {
499 if (i < repstr.size () - 1 && repstr[i+1] == '$')
500 {
501 repstr.erase (i, 1); // erase backslash
502 i++; // skip over '$'
503 continue;
504 }
505 if (i < repstr.size () - 1 && repstr[i+1] == '\\')
506 {
507 repstr.erase (i, 1); // erase 1st backslash
508 continue;
509 }
510 }
511 else if (repstr[i] == '$')
512 {
513 if (i < repstr.size () - 1 && isdigit (repstr[i+1]))
514 {
515 rep_token_t tmp_token;
516
517 tmp_token.pos = i;
518 tmp_token.num = repstr[i+1]-'0';
519 tokens.push_back (tmp_token);
520 }
521 }
522 }
523
524 std::string rep;
525 int num_tokens = tokens.size ();
526
527 if (num_tokens > 0)
528 {
529 // Determine replacement length
530 const std::size_t replen = repstr.size () - 2*num_tokens;
531 int delta = 0;
532 auto p = rx_lst.begin ();
533 for (std::size_t i = 0; i < num_matches; i++)
534 {
535 octave_quit ();
536
537 double start = p->start ();
538 double end = p->end ();
539
540 const Matrix pairs (p->token_extents ());
541 std::size_t pairlen = 0;
542 for (int j = 0; j < num_tokens; j++)
543 {
544 if (tokens[j].num == 0)
545 pairlen += static_cast<std::size_t> (end - start + 1);
546 else if (tokens[j].num <= pairs.rows ())
547 pairlen += static_cast<std::size_t> (pairs(tokens[j].num-1,1)
548 - pairs(tokens[j].num-1,0)
549 + 1);
550 }
551 delta += (static_cast<int> (replen + pairlen)
552 - static_cast<int> (end - start + 1));
553 p++;
554 }
555
556 // Build replacement string
557 rep.reserve (buffer.size () + delta);
558 std::size_t from = 0;
559 p = rx_lst.begin ();
560 for (std::size_t i = 0; i < num_matches; i++)
561 {
562 octave_quit ();
563
564 double start = p->start ();
565 double end = p->end ();
566
567 const Matrix pairs (p->token_extents ());
568 rep.append (&buffer[from], static_cast<std::size_t> (start - 1 - from));
569 from = static_cast<std::size_t> (end);
570
571 std::size_t cur_pos = 0;
572
573 for (int j = 0; j < num_tokens; j++)
574 {
575 rep.append (&repstr[cur_pos], (tokens[j].pos) - cur_pos);
576 cur_pos = tokens[j].pos+2;
577
578 int k = tokens[j].num;
579 if (k == 0)
580 {
581 // replace with entire match
582 rep.append (&buffer[static_cast<std::size_t> (end - 1)],
583 static_cast<std::size_t> (end - start + 1));
584 }
585 else if (k <= pairs.rows ())
586 {
587 // replace with group capture
588 rep.append (&buffer[static_cast<std::size_t> (pairs(k-1,0)-1)],
589 static_cast<std::size_t> (pairs(k-1,1)
590 - pairs(k-1,0) + 1));
591 }
592 else
593 {
594 // replace with nothing
595 }
596 }
597 if (cur_pos < repstr.size ())
598 rep.append (&repstr[cur_pos], repstr.size () - cur_pos);
599
600 p++;
601 }
602 rep.append (&buffer[from], buffer.size () - from);
603 }
604 else
605 {
606 // Determine repstr length
607 const std::size_t replen = repstr.size ();
608 int delta = 0;
609 auto p = rx_lst.begin ();
610 for (std::size_t i = 0; i < num_matches; i++)
611 {
612 octave_quit ();
613
614 delta += static_cast<int> (replen)
615 - static_cast<int> (p->end () - p->start () + 1);
616 p++;
617 }
618
619 // Build replacement string
620 rep.reserve (buffer.size () + delta);
621 std::size_t from = 0;
622 p = rx_lst.begin ();
623 for (std::size_t i = 0; i < num_matches; i++)
624 {
625 octave_quit ();
626
627 rep.append (&buffer[from],
628 static_cast<std::size_t> (p->start () - 1 - from));
629 from = static_cast<std::size_t> (p->end ());
630 rep.append (repstr);
631 p++;
632 }
633 rep.append (&buffer[from], buffer.size () - from);
634 }
635
636 retval = rep;
637 return retval;
638 }
639}
octave_idx_type rows(void) const
Definition: Array.h:449
OCTARRAY_API void resize(const dim_vector &dv, const T &rfv)
Size of the specified dimension.
Definition: Array.cc:1010
Definition: dMatrix.h:42
void resize(octave_idx_type nr, octave_idx_type nc, double rfv=0)
Definition: dMatrix.h:158
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:94
std::size_t size(void) const
Definition: base-list.h:52
iterator begin(void)
Definition: base-list.h:65
void lineanchors(bool val)
Definition: lo-regexp.h:143
void case_insensitive(bool val)
Definition: lo-regexp.h:139
void emptymatch(bool val)
Definition: lo-regexp.h:141
void once(bool val)
Definition: lo-regexp.h:144
void freespacing(bool val)
Definition: lo-regexp.h:142
void dotexceptnewline(bool val)
Definition: lo-regexp.h:140
std::string m_pattern
Definition: lo-regexp.h:223
std::string replace(const std::string &buffer, const std::string &replacement) const
Definition: lo-regexp.cc:468
match_data match(const std::string &buffer) const
Definition: lo-regexp.cc:250
bool is_match(const std::string &buffer) const
Definition: lo-regexp.cc:440
string_vector m_named_pats
Definition: lo-regexp.h:230
Array< int > m_named_idx
Definition: lo-regexp.h:232
void free(void)
Definition: lo-regexp.cc:65
void * m_data
Definition: lo-regexp.h:228
void compile_internal(void)
Definition: lo-regexp.cc:72
std::string m_who
Definition: lo-regexp.h:233
string_vector & append(const std::string &s)
Definition: str-vec.cc:110
octave_idx_type numel(void) const
Definition: str-vec.h:100
OCTAVE_NORETURN liboctave_error_handler current_liboctave_error_handler
Definition: lo-error.c:41
#define MAXLOOKBEHIND
Definition: lo-regexp.cc:57
#define PCRE_MATCHLIMIT_MAX
Definition: lo-regexp.cc:54
static bool lookbehind_warned
Definition: lo-regexp.cc:59
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:44
std::size_t pos
Definition: lo-regexp.cc:463
const uint8_t * octave_u8_check_wrapper(const uint8_t *src, size_t n)
F77_RET_T len
Definition: xerbla.cc:61