diff options
Diffstat (limited to 'kjs/regexp.cpp')
| -rw-r--r-- | kjs/regexp.cpp | 231 | 
1 files changed, 109 insertions, 122 deletions
| diff --git a/kjs/regexp.cpp b/kjs/regexp.cpp index e4db2b956..04159904f 100644 --- a/kjs/regexp.cpp +++ b/kjs/regexp.cpp @@ -1,4 +1,3 @@ -// -*- c-basic-offset: 2 -*-  /*   *  This file is part of the KDE libraries   *  Copyright (C) 1999-2001 Harri Porten (porten@kde.org) @@ -31,19 +30,19 @@  using namespace KJS; -#ifdef PCRE_CONFIG_UTF8 +#ifdef HAVE_PCRE2POSIX  RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;  #endif  RegExp::RegExp(const UString &p, int f)    : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)  { +#ifdef HAVE_PCRE2POSIX    // Determine whether libpcre has unicode support if need be.. -#ifdef PCRE_CONFIG_UTF8    if (utf8Support == Unknown) { -    int supported; -    pcre_config(PCRE_CONFIG_UTF8, (void*)&supported); -    utf8Support = supported ? Supported : Unsupported; +    uint32_t supported; +    pcre2_config(PCRE2_CONFIG_COMPILED_WIDTHS, (void*)&supported); +    utf8Support = (supported & 0x0001) ? Supported : Unsupported;    }  #endif @@ -64,33 +63,33 @@ RegExp::RegExp(const UString &p, int f)          escape = false;          // we only care about \u          if (c == 'u') { -	  // standard unicode escape sequence looks like \uxxxx but -	  // other browsers also accept less then 4 hex digits -	  unsigned short u = 0; -	  int j = 0; -	  for (j = 0; j < 4; ++j) { -	    if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) { -	      u = (u << 4) + Lexer::convertHex(p[i + 1].unicode()); -	      ++i; -	    } else { -	      // sequence incomplete. restore index. -	      // TODO: cleaner way to propagate warning -	      fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j); -	      i -= j; -	      break; -	    } -	  } -	  if (j < 4) { -	    // sequence was incomplete. treat \u as u which IE always -	    // and FF sometimes does. -	    intern.append(UString('u')); -	  } else { +    // standard unicode escape sequence looks like \uxxxx but +    // other browsers also accept less then 4 hex digits +    unsigned short u = 0; +    int j = 0; +    for (j = 0; j < 4; ++j) { +      if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) { +        u = (u << 4) + Lexer::convertHex(p[i + 1].unicode()); +        ++i; +      } else { +        // sequence incomplete. restore index. +        // TODO: cleaner way to propagate warning +        fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j); +        i -= j; +        break; +      } +    } +    if (j < 4) { +      // sequence was incomplete. treat \u as u which IE always +      // and FF sometimes does. +      intern.append(UString('u')); +    } else {              c = UChar(u);              switch (u) {              case 0: -	      // Make sure to encode 0, to avoid terminating the string -	      intern += UString(nil); -	      break; +        // Make sure to encode 0, to avoid terminating the string +        intern += UString(nil); +        break;              case '^':              case '$':              case '\\': @@ -102,13 +101,13 @@ RegExp::RegExp(const UString &p, int f)              case '{': case '}':              case '[': case ']':              case '|': -	      // escape pattern characters have to remain escaped -	      intern.append(UString('\\')); -	      // intentional fallthrough +        // escape pattern characters have to remain escaped +        intern.append(UString('\\')); +        // intentional fallthrough              default: -	      intern += UString(&c, 1); -	      break; -	    } +        intern += UString(&c, 1); +        break; +      }            }            continue;          } @@ -127,45 +126,47 @@ RegExp::RegExp(const UString &p, int f)      intern = p;    } -#ifdef HAVE_PCREPOSIX -  int pcreflags = 0; -  const char *perrormsg; -  int errorOffset; +#ifdef HAVE_PCRE2POSIX +  uint32_t pcre2flags = 0; +  int errorCode; +  PCRE2_SIZE errorOffset;    if (flgs & IgnoreCase) -    pcreflags |= PCRE_CASELESS; +    pcre2flags |= PCRE2_CASELESS;    if (flgs & Multiline) -    pcreflags |= PCRE_MULTILINE; +    pcre2flags |= PCRE2_MULTILINE; -#ifdef PCRE_CONFIG_UTF8    if (utf8Support == Supported) -    pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK); -#endif +    pcre2flags |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK);    // Fill our buffer with an encoded version, whether utf-8, or,     // if PCRE is incapable, truncated.    prepareMatch(intern); -  pcregex = pcre_compile(buffer, pcreflags, -			 &perrormsg, &errorOffset, NULL); +  pcregex = pcre2_compile(buffer, PCRE2_ZERO_TERMINATED, pcre2flags, +       &errorCode, &errorOffset, NULL);    doneMatch(); // Cleanup buffers    if (!pcregex) {  #ifndef NDEBUG -    fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg); +    PCRE2_UCHAR errorMsg[256]; +    pcre2_get_error_message(errorCode, errorMsg, sizeof(errorMsg)); +    fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMsg);  #endif +    match_data = nullptr;      valid = false;      return;    } -#ifdef PCRE_INFO_CAPTURECOUNT    // Get number of subpatterns that will be returned -  int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns); +  int rc = pcre2_pattern_info(pcregex, PCRE2_INFO_CAPTURECOUNT, &nrSubPatterns);    if (rc != 0) -#endif +  {      nrSubPatterns = 0; // fallback. We always need the first pair of offsets. +  } -#else /* HAVE_PCREPOSIX */ +  match_data = pcre2_match_data_create_from_pattern(pcregex, NULL); +#else    int regflags = 0;  #ifdef REG_EXTENDED @@ -196,9 +197,15 @@ RegExp::RegExp(const UString &p, int f)  RegExp::~RegExp()  {    doneMatch(); // Be 100% sure buffers are freed -#ifdef HAVE_PCREPOSIX +#ifdef HAVE_PCRE2POSIX +  if (match_data) +  { +    pcre2_match_data_free(match_data); +  }    if (pcregex) -    pcre_free(pcregex); +  { +    pcre2_code_free(pcregex); +  }  #else    /* TODO: is this really okay after an error ? */    regfree(&preg); @@ -209,7 +216,7 @@ void RegExp::prepareUtf8(const UString& s)  {    // Allocate a buffer big enough to hold all the characters plus \0    const int length = s.size(); -  buffer = new char[length * 3 + 1]; +  buffer = new buftype_t[length * 3 + 1];    // Also create buffer for positions. We need one extra character in there,    // even past the \0 since the non-empty handling may jump one past the end @@ -218,7 +225,7 @@ void RegExp::prepareUtf8(const UString& s)    // Convert to runs of 8-bit characters, and generate indeces    // Note that we do NOT combine surrogate pairs here, as     // regexps operate on them as separate characters -  char *p      = buffer; +  buftype_t *p = buffer;    int  *posOut = originalPos;    const UChar *d = s.data();    for (int i = 0; i != length; ++i) { @@ -226,16 +233,16 @@ void RegExp::prepareUtf8(const UString& s)      int sequenceLen;      if (c < 0x80) { -      *p++ = (char)c; +      *p++ = (buftype_t)c;        sequenceLen = 1;      } else if (c < 0x800) { -      *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 -      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set +      *p++ = (buftype_t)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 +      *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set        sequenceLen = 2;      } else { -      *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 -      *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set -      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set +      *p++ = (buftype_t)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 +      *p++ = (buftype_t)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set +      *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set        sequenceLen = 3;      } @@ -263,7 +270,7 @@ void RegExp::prepareASCII (const UString& s)    // when we don't have utf 8 available -- use     // truncated version, and pray for the best     CString truncated = s.cstring(); -  buffer = new char[truncated.size() + 1]; +  buffer = new buftype_t[truncated.size() + 1];    memcpy(buffer, truncated.c_str(), truncated.size());    buffer[truncated.size()] = '\0'; // For _compile use    bufferSize = truncated.size(); @@ -273,7 +280,7 @@ void RegExp::prepareMatch(const UString &s)  {    delete[] originalPos; // Just to be sure..    delete[] buffer; -#ifdef PCRE_CONFIG_UTF8 +#ifdef HAVE_PCRE2POSIX    if (utf8Support == Supported)      prepareUtf8(s);    else @@ -309,17 +316,16 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)    if (i > s.size() || s.isNull())      return UString::null; -#ifdef HAVE_PCREPOSIX -  int ovecsize = (nrSubPatterns+1)*3; // see pcre docu -  if (ovector) *ovector = new int[ovecsize]; -  if (!pcregex) +#ifdef HAVE_PCRE2POSIX +  if (!pcregex || !match_data) +    return UString::null; +  if (!ovector)      return UString::null;    int startPos;    int nextPos; - -#ifdef PCRE_CONFIG_UTF8 -  if (utf8Support == Supported) { +  if (utf8Support == Supported) +  {      startPos = i;      while (originalPos[startPos] < i)        ++startPos; @@ -329,53 +335,59 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)        while (originalPos[nextPos] < (i + 1))          ++nextPos;      } -  } else -#endif +  } +  else    {      startPos = i;      nextPos  = i + (i < s.size() ? 1 : 0);    } -  int baseFlags = -#ifdef PCRE_CONFIG_UTF8 -    utf8Support == Supported ? PCRE_NO_UTF8_CHECK : -#endif -    0; -  int numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, startPos, -                             m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags, // see man pcretest -                             ovector ? *ovector : 0L, ovecsize); -  if (numMatches < 0) +  uint32_t baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0); +  if (m_notEmpty) +  { +    baseFlags |= PCRE2_NOTEMPTY | PCRE2_ANCHORED; +  } +  int numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, startPos, baseFlags, match_data, NULL); +  if (numMatches <= 0)    {      // Failed to match. -    if (numMatches == PCRE_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && ovector && startPos < nextPos) +    if (numMatches == PCRE2_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && startPos < nextPos)      {        // We set m_notEmpty ourselves, to look for a non-empty match -      // (see man pcretest or pcretest.c for details).        // So we don't stop here, we want to try again at i+1.  #ifdef KJS_VERBOSE        fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");  #endif        m_notEmpty = 0; -      numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags, -                             ovector ? *ovector : 0L, ovecsize); -      if (numMatches < 0) +      baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0); +      numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, nextPos, baseFlags, match_data, NULL); +      if (numMatches <= 0)          return UString::null;      } -    else // done +    else        return UString::null;    } -  // Got a match, proceed with it. -  // But fix up the ovector if need be.. -  if (ovector && originalPos) { -    for (unsigned c = 0; c < 2 * TQMIN((unsigned)numMatches, nrSubPatterns+1); ++c) { -      if ((*ovector)[c] != -1) -        (*ovector)[c] = originalPos[(*ovector)[c]]; +  PCRE2_SIZE *pcre2_ovector = pcre2_get_ovector_pointer(match_data); +  if (!pcre2_ovector) +    return UString::null; + +  uint32_t pcre2_ovecCount = pcre2_get_ovector_count(match_data); +  *ovector = new int[pcre2_ovecCount * 2]; +  if (originalPos) +  { +    for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c) +    { +      (*ovector)[c] = (pcre2_ovector[c] != -1) ? originalPos[pcre2_ovector[c]] : -1; +    } +  } +  else +  { +    for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c) +    { +      (*ovector)[c] = pcre2_ovector[c];      }    } - -  if (!ovector) -    return UString::null; // don't rely on the return value if you pass ovector==0  #else    const uint maxMatch = 10;    regmatch_t rmatch[maxMatch]; @@ -420,28 +432,3 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)    }    return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);  } - -#if 0 // unused -bool RegExp::test(const UString &s, int) -{ -#ifdef HAVE_PCREPOSIX -  int ovector[300]; -  CString buffer(s.cstring()); - -  if (s.isNull() || -      pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0, -		0, ovector, 300) == PCRE_ERROR_NOMATCH) -    return false; -  else -    return true; - -#else - -  char *str = strdup(s.ascii()); -  int r = regexec(&preg, str, 0, 0, 0); -  free(str); - -  return r == 0; -#endif -} -#endif | 
