GNU Octave  6.2.0
A high-level interpreted language, primarily intended for numerical computations, mostly compatible with Matlab
lo-regexp.cc
Go to the documentation of this file.
1 ////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (C) 2002-2021 The Octave Project Developers
4 //
5 // See the file COPYRIGHT.md in the top-level directory of this
6 // distribution or <https://octave.org/copyright/>.
7 //
8 // This file is part of Octave.
9 //
10 // Octave is free software: you can redistribute it and/or modify it
11 // under the terms of the GNU General Public License as published by
12 // the Free Software Foundation, either version 3 of the License, or
13 // (at your option) any later version.
14 //
15 // Octave is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 // GNU General Public License for more details.
19 //
20 // You should have received a copy of the GNU General Public License
21 // along with Octave; see the file COPYING. If not, see
22 // <https://www.gnu.org/licenses/>.
23 //
24 ////////////////////////////////////////////////////////////////////////
25 
26 #if defined (HAVE_CONFIG_H)
27 # include "config.h"
28 #endif
29 
30 #include <list>
31 #include <sstream>
32 #include <string>
33 #include <vector>
34 
35 #if defined (HAVE_PCRE_H)
36 # include <pcre.h>
37 #elif defined (HAVE_PCRE_PCRE_H)
38 # include <pcre/pcre.h>
39 #endif
40 
41 #include "Matrix.h"
42 #include "base-list.h"
43 #include "lo-error.h"
44 #include "oct-locbuf.h"
45 #include "quit.h"
46 #include "lo-regexp.h"
47 #include "str-vec.h"
48 #include "unistr-wrappers.h"
49 
50 namespace octave
51 {
52  // Define the maximum number of retries for a pattern
53  // that possibly results in an infinite recursion.
54 #define PCRE_MATCHLIMIT_MAX 10
55 
56  // FIXME: should this be configurable?
57 #define MAXLOOKBEHIND 10
58 
59  static bool lookbehind_warned = false;
60 
61  // FIXME: don't bother collecting and composing return values
62  // the user doesn't want.
63 
64  void
65  regexp::free (void)
66  {
67  if (m_data)
68  pcre_free (static_cast<pcre *> (m_data));
69  }
70 
71  void
73  {
74  // If we had a previously compiled pattern, release it.
75  free ();
76 
77  size_t max_length = MAXLOOKBEHIND;
78 
79  size_t pos = 0;
80  size_t new_pos;
81  int inames = 0;
82  std::ostringstream buf;
83 
84  while ((new_pos = m_pattern.find ("(?", pos)) != std::string::npos)
85  {
86  if (m_pattern.at (new_pos + 2) == '<'
87  && !(m_pattern.at (new_pos + 3) == '='
88  || m_pattern.at (new_pos + 3) == '!'))
89  {
90  // The syntax of named tokens in pcre is "(?P<name>...)" while
91  // we need a syntax "(?<name>...)", so fix that here. Also an
92  // expression like
93  // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)"
94  // should be perfectly legal, while pcre does not allow the same
95  // named token name on both sides of the alternative. Also fix
96  // that here by replacing name tokens by dummy names, and dealing
97  // with the dummy names later.
98 
99  size_t tmp_pos = m_pattern.find_first_of ('>', new_pos);
100 
101  if (tmp_pos == std::string::npos)
102  (*current_liboctave_error_handler)
103  ("regexp: syntax error in pattern");
104 
105  std::string tmp_name
106  = m_pattern.substr (new_pos+3, tmp_pos-new_pos-3);
107 
108  bool found = false;
109 
110  for (int i = 0; i < m_names; i++)
111  {
112  if (m_named_pats(i) == tmp_name)
113  {
114  m_named_idx.resize (dim_vector (inames+1, 1));
115  m_named_idx(inames) = i;
116  found = true;
117  break;
118  }
119  }
120 
121  if (! found)
122  {
123  m_named_idx.resize (dim_vector (inames+1, 1));
124  m_named_idx(inames) = m_names;
125  m_named_pats.append (tmp_name);
126  m_names++;
127  }
128 
129  if (new_pos - pos > 0)
130  buf << m_pattern.substr (pos, new_pos-pos);
131  if (inames < 10)
132  buf << "(?P<n00" << inames++;
133  else if (inames < 100)
134  buf << "(?P<n0" << inames++;
135  else
136  buf << "(?P<n" << inames++;
137 
138  pos = tmp_pos;
139  }
140  else if (m_pattern.at (new_pos + 2) == '<')
141  {
142  // Find lookbehind operators of arbitrary length (ie like
143  // "(?<=[a-z]*)") and replace with a maximum length operator
144  // as PCRE can not yet handle arbitrary length lookahead
145  // operators. Use the string length as the maximum length to
146  // avoid issues.
147 
148  int brackets = 1;
149  size_t tmp_pos1 = new_pos + 2;
150  size_t tmp_pos2 = tmp_pos1;
151 
152  while (tmp_pos1 < m_pattern.length () && brackets > 0)
153  {
154  char ch = m_pattern.at (tmp_pos1);
155 
156  if (ch == '(')
157  brackets++;
158  else if (ch == ')')
159  {
160  if (brackets > 1)
161  tmp_pos2 = tmp_pos1;
162 
163  brackets--;
164  }
165 
166  tmp_pos1++;
167  }
168 
169  if (brackets != 0)
170  {
171  buf << m_pattern.substr (pos, new_pos - pos) << "(?";
172  pos = new_pos + 2;
173  }
174  else
175  {
176  size_t tmp_pos3 = m_pattern.find_first_of ("*+", tmp_pos2);
177 
178  if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1)
179  {
180  if (! lookbehind_warned)
181  {
182  lookbehind_warned = true;
183  (*current_liboctave_warning_with_id_handler)
184  ("Octave:regexp-lookbehind-limit",
185  "%s: arbitrary length lookbehind patterns are only supported up to length %d",
186  m_who.c_str (), MAXLOOKBEHIND);
187  }
188 
189  buf << m_pattern.substr (pos, new_pos - pos) << '(';
190 
191  size_t i;
192 
193  if (m_pattern.at (tmp_pos3) == '*')
194  i = 0;
195  else
196  i = 1;
197 
198  for (; i < max_length + 1; i++)
199  {
200  buf << m_pattern.substr (new_pos, tmp_pos3 - new_pos)
201  << '{' << i << '}';
202  buf << m_pattern.substr (tmp_pos3 + 1,
203  tmp_pos1 - tmp_pos3 - 1);
204  if (i != max_length)
205  buf << '|';
206  }
207  buf << ')';
208  }
209  else
210  buf << m_pattern.substr (pos, tmp_pos1 - pos);
211 
212  pos = tmp_pos1;
213  }
214  }
215  else
216  {
217  buf << m_pattern.substr (pos, new_pos - pos) << "(?";
218  pos = new_pos + 2;
219  }
220 
221  }
222 
223  buf << m_pattern.substr (pos);
224 
225  // Replace NULLs with escape sequence because conversion function c_str()
226  // will terminate string early at embedded NULLs.
227  std::string buf_str = buf.str ();
228  while ((pos = buf_str.find ('\0')) != std::string::npos)
229  buf_str.replace (pos, 1, "\\000");
230 
231  const char *err;
232  int erroffset;
233 
234  int pcre_options
235  = ( (m_options.case_insensitive () ? PCRE_CASELESS : 0)
236  | (m_options.dotexceptnewline () ? 0 : PCRE_DOTALL)
237  | (m_options.lineanchors () ? PCRE_MULTILINE : 0)
238  | (m_options.freespacing () ? PCRE_EXTENDED : 0)
239  | PCRE_UTF8);
240 
241  m_data = pcre_compile (buf_str.c_str (), pcre_options,
242  &err, &erroffset, nullptr);
243 
244  if (! m_data)
245  (*current_liboctave_error_handler)
246  ("%s: %s at position %d of expression", m_who.c_str (), err, erroffset);
247  }
248 
250  regexp::match (const std::string& buffer)
251  {
252  // check if input is valid utf-8
253  const uint8_t *buf_str = reinterpret_cast<const uint8_t *> (buffer.c_str ());
254  if (octave_u8_check_wrapper (buf_str, buffer.length ()))
256  ("%s: the input string is invalid UTF-8", m_who.c_str ());
257 
259 
260  std::list<regexp::match_element> lst;
261 
262  int subpatterns;
263  int namecount;
264  int nameentrysize;
265  char *nametable;
266  size_t idx = 0;
267 
268  pcre *re = static_cast<pcre *> (m_data);
269 
270  pcre_fullinfo (re, nullptr, PCRE_INFO_CAPTURECOUNT, &subpatterns);
271  pcre_fullinfo (re, nullptr, PCRE_INFO_NAMECOUNT, &namecount);
272  pcre_fullinfo (re, nullptr, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
273  pcre_fullinfo (re, nullptr, PCRE_INFO_NAMETABLE, &nametable);
274 
275  OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3);
276  OCTAVE_LOCAL_BUFFER (int, nidx, namecount);
277 
278  for (int i = 0; i < namecount; i++)
279  {
280  // Index of subpattern in first two bytes of name (MSB first).
281  // Extract index.
282  nidx[i] = (static_cast<int> (nametable[i*nameentrysize])) << 8
283  | static_cast<int> (nametable[i*nameentrysize+1]);
284  }
285 
286  while (true)
287  {
288  octave_quit ();
289 
290  int matches = pcre_exec (re, nullptr, buffer.c_str (),
291  buffer.length (), idx,
292  PCRE_NO_UTF8_CHECK | (idx ? PCRE_NOTBOL : 0),
293  ovector, (subpatterns+1)*3);
294 
295  if (matches == PCRE_ERROR_MATCHLIMIT)
296  {
297  // Try harder; start with default value for MATCH_LIMIT
298  // and increase it.
299  (*current_liboctave_warning_with_id_handler)
300  ("Octave:regexp-match-limit",
301  "your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow");
302 
303  pcre_extra pe;
304 
305  pcre_config (PCRE_CONFIG_MATCH_LIMIT,
306  static_cast<void *> (&pe.match_limit));
307 
308  pe.flags = PCRE_EXTRA_MATCH_LIMIT;
309 
310  int i = 0;
311  while (matches == PCRE_ERROR_MATCHLIMIT
312  && i++ < PCRE_MATCHLIMIT_MAX)
313  {
314  octave_quit ();
315 
316  pe.match_limit *= 10;
317  matches = pcre_exec (re, &pe, buffer.c_str (),
318  buffer.length (), idx,
319  PCRE_NO_UTF8_CHECK
320  | (idx ? PCRE_NOTBOL : 0),
321  ovector, (subpatterns+1)*3);
322  }
323  }
324 
325  if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
326  (*current_liboctave_error_handler)
327  ("%s: internal error calling pcre_exec; "
328  "error code from pcre_exec is %i", m_who.c_str (), matches);
329 
330  if (matches == PCRE_ERROR_NOMATCH)
331  break;
332  else if (ovector[0] >= ovector[1] && ! m_options.emptymatch ())
333  {
334  // Zero length match. Skip to next char.
335  idx = ovector[0] + 1;
336  if (idx < buffer.length ())
337  continue;
338  else
339  break;
340  }
341  else
342  {
343  int pos_match = 0;
344  Matrix token_extents (matches-1, 2);
345 
346  for (int i = 1; i < matches; i++)
347  {
348  if (ovector[2*i] >= 0 && ovector[2*i+1] > 0
349  && (i == 1 || ovector[2*i] != ovector[2*i-2]
350  || ovector[2*i-1] != ovector[2*i+1]))
351  {
352  token_extents(pos_match,0) = double (ovector[2*i]+1);
353  token_extents(pos_match++,1) = double (ovector[2*i+1]);
354  }
355  }
356 
357  token_extents.resize (pos_match, 2);
358 
359  double start = double (ovector[0]+1);
360  double end = double (ovector[1]);
361 
362  const char **listptr;
363  int status = pcre_get_substring_list (buffer.c_str (), ovector,
364  matches, &listptr);
365 
366  if (status == PCRE_ERROR_NOMEMORY)
367  (*current_liboctave_error_handler)
368  ("%s: cannot allocate memory in pcre_get_substring_list",
369  m_who.c_str ());
370 
371  // Must use explicit length constructor as match can contain '\0'.
372  std::string match_string = std::string (*listptr, end - start + 1);
373 
374  string_vector tokens (pos_match);
375  string_vector named_tokens (m_names);
376  int pos_offset = 0;
377  pos_match = 0;
378 
379  for (int i = 1; i < matches; i++)
380  {
381  if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
382  {
383  if (i == 1 || ovector[2*i] != ovector[2*i-2]
384  || ovector[2*i-1] != ovector[2*i+1])
385  {
386  if (namecount > 0)
387  {
388  // FIXME: Should probably do this with a map()
389  // rather than a linear search. However,
390  // the number of captured, named expressions
391  // is usually pretty small (< 4)
392  for (int j = 0; j < namecount; j++)
393  {
394  if (nidx[j] == i)
395  {
396  size_t len = ovector[2*i+1] - ovector[2*i];
397  named_tokens(m_named_idx(j))
398  = std::string (*(listptr+i-pos_offset),
399  len);
400  break;
401  }
402  }
403  }
404 
405  size_t len = ovector[2*i+1] - ovector[2*i];
406  tokens(pos_match++) = std::string (*(listptr+i), len);
407  }
408  else
409  pos_offset++;
410  }
411  }
412 
413  pcre_free_substring_list (listptr);
414 
415  regexp::match_element new_elem (named_tokens, tokens, match_string,
416  token_extents, start, end);
417  lst.push_back (new_elem);
418 
419  if (ovector[1] <= ovector[0])
420  {
421  // Zero length match. Skip to next char.
422  idx = ovector[0] + 1;
423  if (idx <= buffer.length ())
424  continue;
425  }
426  else
427  idx = ovector[1];
428 
429  if (m_options.once () || idx >= buffer.length ())
430  break;
431  }
432  }
433 
435 
436  return retval;
437  }
438 
439  bool
440  regexp::is_match (const std::string& buffer)
441  {
442  regexp::match_data rx_lst = match (buffer);
443 
444  return rx_lst.size () > 0;
445  }
446 
449  {
450  octave_idx_type len = buffer.numel ();
451 
453 
454  for (octave_idx_type i = 0; i < buffer.numel (); i++)
455  retval(i) = is_match (buffer(i));
456 
457  return retval;
458  }
459 
460  // Declare rep_token_t used in processing replacement string
461  typedef struct
462  {
463  size_t pos;
464  int num;
465  } rep_token_t;
466 
467  std::string
468  regexp::replace (const std::string& buffer, const std::string& replacement)
469  {
470  std::string retval;
471 
472  const regexp::match_data rx_lst = match (buffer);
473 
474  size_t num_matches = rx_lst.size ();
475 
476  if (num_matches == 0)
477  {
478  retval = buffer;
479  return retval;
480  }
481 
482  // Identify replacement tokens; build a vector of group numbers in
483  // the replacement string so that we can quickly calculate the size
484  // of the replacement.
485 
486  // FIXME: All code assumes that only 10 tokens ($0-$9) exist.
487  // $11 represents $1 followed by the character '1' rather than
488  // the eleventh capture buffer.
489 
490  std::string repstr = replacement;
491  std::vector<rep_token_t> tokens;
492  tokens.reserve (5); // Reserve memory for 5 pattern replacements
493 
494  for (size_t i=0; i < repstr.size (); i++)
495  {
496  if (repstr[i] == '\\')
497  {
498  if (i < repstr.size () - 1 && repstr[i+1] == '$')
499  {
500  repstr.erase (i,1); // erase backslash
501  i++; // skip over '$'
502  continue;
503  }
504  if (i < repstr.size () - 1 && repstr[i+1] == '\\')
505  {
506  repstr.erase (i,1); // erase 1st backslash
507  continue;
508  }
509  }
510  else if (repstr[i] == '$')
511  {
512  if (i < repstr.size () - 1 && isdigit (repstr[i+1]))
513  {
514  rep_token_t tmp_token;
515 
516  tmp_token.pos = i;
517  tmp_token.num = repstr[i+1]-'0';
518  tokens.push_back (tmp_token);
519  }
520  }
521  }
522 
523  std::string rep;
524  int num_tokens = tokens.size ();
525 
526  if (num_tokens > 0)
527  {
528  // Determine replacement length
529  const size_t replen = repstr.size () - 2*num_tokens;
530  int delta = 0;
531  auto p = rx_lst.begin ();
532  for (size_t i = 0; i < num_matches; i++)
533  {
534  octave_quit ();
535 
536  double start = p->start ();
537  double end = p->end ();
538 
539  const Matrix pairs (p->token_extents ());
540  size_t pairlen = 0;
541  for (int j = 0; j < num_tokens; j++)
542  {
543  if (tokens[j].num == 0)
544  pairlen += static_cast<size_t> (end - start + 1);
545  else if (tokens[j].num <= pairs.rows ())
546  pairlen += static_cast<size_t> (pairs(tokens[j].num-1,1)
547  - pairs(tokens[j].num-1,0)
548  + 1);
549  }
550  delta += (static_cast<int> (replen + pairlen)
551  - static_cast<int> (end - start + 1));
552  p++;
553  }
554 
555  // Build replacement string
556  rep.reserve (buffer.size () + delta);
557  size_t from = 0;
558  p = rx_lst.begin ();
559  for (size_t i = 0; i < num_matches; i++)
560  {
561  octave_quit ();
562 
563  double start = p->start ();
564  double end = p->end ();
565 
566  const Matrix pairs (p->token_extents ());
567  rep.append (&buffer[from], static_cast<size_t> (start - 1 - from));
568  from = static_cast<size_t> (end);
569 
570  size_t cur_pos = 0;
571 
572  for (int j = 0; j < num_tokens; j++)
573  {
574  rep.append (&repstr[cur_pos], (tokens[j].pos) - cur_pos);
575  cur_pos = tokens[j].pos+2;
576 
577  int k = tokens[j].num;
578  if (k == 0)
579  {
580  // replace with entire match
581  rep.append (&buffer[static_cast<size_t> (end - 1)],
582  static_cast<size_t> (end - start + 1));
583  }
584  else if (k <= pairs.rows ())
585  {
586  // replace with group capture
587  rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
588  static_cast<size_t> (pairs(k-1,1)
589  - pairs(k-1,0) + 1));
590  }
591  else
592  {
593  // replace with nothing
594  }
595  }
596  if (cur_pos < repstr.size ())
597  rep.append (&repstr[cur_pos], repstr.size () - cur_pos);
598 
599  p++;
600  }
601  rep.append (&buffer[from], buffer.size () - from);
602  }
603  else
604  {
605  // Determine repstr length
606  const size_t replen = repstr.size ();
607  int delta = 0;
608  auto p = rx_lst.begin ();
609  for (size_t i = 0; i < num_matches; i++)
610  {
611  octave_quit ();
612 
613  delta += static_cast<int> (replen)
614  - static_cast<int> (p->end () - p->start () + 1);
615  p++;
616  }
617 
618  // Build replacement string
619  rep.reserve (buffer.size () + delta);
620  size_t from = 0;
621  p = rx_lst.begin ();
622  for (size_t i = 0; i < num_matches; i++)
623  {
624  octave_quit ();
625 
626  rep.append (&buffer[from],
627  static_cast<size_t> (p->start () - 1 - from));
628  from = static_cast<size_t> (p->end ());
629  rep.append (repstr);
630  p++;
631  }
632  rep.append (&buffer[from], buffer.size () - from);
633  }
634 
635  retval = rep;
636  return retval;
637  }
638 }
void resize(const dim_vector &dv, const T &rfv)
Size of the specified dimension.
Definition: Array.cc:1011
octave_idx_type rows(void) const
Definition: Array.h:415
Definition: dMatrix.h:42
void resize(octave_idx_type nr, octave_idx_type nc, double rfv=0)
Definition: dMatrix.h:151
Vector representing the dimensions (size) of an Array.
Definition: dim-vector.h:95
size_t size(void) const
Definition: base-list.h:52
iterator begin(void)
Definition: base-list.h:65
void lineanchors(bool val)
Definition: lo-regexp.h:143
void case_insensitive(bool val)
Definition: lo-regexp.h:139
void emptymatch(bool val)
Definition: lo-regexp.h:141
void once(bool val)
Definition: lo-regexp.h:144
void freespacing(bool val)
Definition: lo-regexp.h:142
void dotexceptnewline(bool val)
Definition: lo-regexp.h:140
std::string m_pattern
Definition: lo-regexp.h:223
match_data match(const std::string &buffer)
Definition: lo-regexp.cc:250
bool is_match(const std::string &buffer)
Definition: lo-regexp.cc:440
string_vector m_named_pats
Definition: lo-regexp.h:230
std::string replace(const std::string &buffer, const std::string &replacement)
Definition: lo-regexp.cc:468
Array< int > m_named_idx
Definition: lo-regexp.h:232
void free(void)
Definition: lo-regexp.cc:65
void * m_data
Definition: lo-regexp.h:228
void compile_internal(void)
Definition: lo-regexp.cc:72
std::string m_who
Definition: lo-regexp.h:233
string_vector & append(const std::string &s)
Definition: str-vec.cc:110
octave_idx_type numel(void) const
Definition: str-vec.h:100
OCTAVE_NORETURN liboctave_error_handler current_liboctave_error_handler
Definition: lo-error.c:41
#define MAXLOOKBEHIND
Definition: lo-regexp.cc:57
#define PCRE_MATCHLIMIT_MAX
Definition: lo-regexp.cc:54
static bool lookbehind_warned
Definition: lo-regexp.cc:59
#define OCTAVE_LOCAL_BUFFER(T, buf, size)
Definition: oct-locbuf.h:44
octave_value::octave_value(const Array< char > &chm, char type) return retval
Definition: ov.cc:811
const uint8_t * octave_u8_check_wrapper(const uint8_t *src, size_t n)
F77_RET_T len
Definition: xerbla.cc:61