Commit 274adcb8 authored by Corentin Jabot's avatar Corentin Jabot Committed by Aaron Ballman
Browse files

Implement delimited escape sequences.

\x{XXXX}, \u{XXXX} and \o{OOOO} are accepted in all language modes
in character and string literals.

This is a feature proposed for both C++ (P2290R1) and C (N2785). The
papers have been seen by both committees but are not yet adopted into
either standard. However, they do have support from both committees.
parent bbca392a
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -127,6 +127,15 @@ def warn_utf8_symbol_zero_width : Warning<
  "identifier contains Unicode character <U+%0> that is invisible in "
  "some environments">, InGroup<DiagGroup<"unicode-zero-width">>;

// Diagnostics for delimited escape sequences, i.e. escapes written with
// braces such as \x{...}, \u{...} and \o{...}.

// Extension diagnostic stating that delimited escape sequences are a Clang
// extension; it has its own group, "delimited-escape-sequence-extension".
def ext_delimited_escape_sequence : Extension<
  "delimited escape sequences are a Clang extension">,
  InGroup<DiagGroup<"delimited-escape-sequence-extension">>;
// Error for a delimited escape with nothing between the braces.
def err_delimited_escape_empty : Error<
  "delimited escape sequence cannot be empty">;
// Error for an escape that must be followed by '{' but is not. %0 is printed
// right after the backslash -- presumably the escape letter (e.g. "o");
// NOTE(review): confirm every emitter supplies this argument.
def err_delimited_escape_missing_brace: Error<
  "expected '{' after '\\%0' escape sequence">;
// Error for a character inside the braces that is not a valid digit; %0 is the
// offending character.
def err_delimited_escape_invalid : Error<
  "invalid digit '%0' in escape sequence">;
// Error for an escape with no hex digits after it; %0 is printed right after
// the backslash in the message.
def err_hex_escape_no_digits : Error<
  "\\%0 used with no following hex digits">;
def warn_ucn_escape_no_digits : Warning<
@@ -134,6 +143,12 @@ def warn_ucn_escape_no_digits : Warning<
  "treating as '\\' followed by identifier">, InGroup<Unicode>;
// Error for a universal character name that is not complete.
def err_ucn_escape_incomplete : Error<
  "incomplete universal character name">;
// Warning (group Unicode) for a delimited universal character name that is
// not complete. Per the message text, the input is then treated as the
// separate pieces '\', 'u', '{' and an identifier instead of as a UCN.
def warn_delimited_ucn_incomplete : Warning<
  "incomplete delimited universal character name; "
  "treating as '\\' 'u' '{' identifier">, InGroup<Unicode>;
// Warning (group Unicode) for a delimited universal character name with
// nothing between the braces. Per the message text, the input is then treated
// as the separate pieces '\', 'u', '{' and '}'.
def warn_delimited_ucn_empty : Warning<
  "empty delimited universal character name; "
  "treating as '\\' 'u' '{' '}'">, InGroup<Unicode>;
// Warning (group Unicode) for a non-delimited universal character name that
// is not complete. Per the message text, the input is then treated as '\'
// followed by an identifier.
def warn_ucn_escape_incomplete : Warning<
  "incomplete universal character name; "
  "treating as '\\' followed by identifier">, InGroup<Unicode>;
+55 −20
Original line number Diff line number Diff line
@@ -3112,6 +3112,10 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
                           Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);
  bool Delimited = false;
  bool FoundEndDelimiter = false;
  unsigned Count = 0;
  bool Diagnose = Result && !isLexingRawMode();

  unsigned NumHexDigits;
  if (Kind == 'u')
@@ -3122,7 +3126,7 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
    return 0;

  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Result && !isLexingRawMode())
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return 0;
  }
@@ -3131,39 +3135,70 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  for (unsigned i = 0; i < NumHexDigits; ++i) {
  while (Count != NumHexDigits || Delimited) {
    char C = getCharAndSize(CurPtr, CharSize);
    if (!Delimited && C == '{') {
      Delimited = true;
      CurPtr += CharSize;
      continue;
    }

    if (Delimited && C == '}') {
      CurPtr += CharSize;
      FoundEndDelimiter = true;
      break;
    }

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      if (Result && !isLexingRawMode()) {
        if (i == 0) {
          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
      if (!Delimited)
        break;
      if (Diagnose)
        Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
            << StringRef(&C, 1);
      return 0;
    }

    if (CodePoint & 0xF000'0000) {
      if (Diagnose)
        Diag(KindLoc, diag::err_escape_too_large) << 0;
      return 0;
    }

    CodePoint <<= 4;
    CodePoint |= Value;
    CurPtr += CharSize;
    Count++;
  }

  if (Count == 0) {
    if (Diagnose)
      Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_ucn_escape_no_digits)
          << StringRef(KindLoc, 1);
        } else {
          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
    return 0;
  }

  if (!Delimited && Count != NumHexDigits) {
    if (Diagnose) {
      Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
      // If the user wrote \U1234, suggest a fixit to \u.
          if (i == 4 && NumHexDigits == 8) {
      if (Count == 4 && NumHexDigits == 8) {
        CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
        Diag(KindLoc, diag::note_ucn_four_not_eight)
            << FixItHint::CreateReplacement(URange, "u");
      }
    }
      }

    return 0;
  }

    CodePoint <<= 4;
    CodePoint += Value;

    CurPtr += CharSize;
  if (Delimited && PP) {
    Diag(BufferPtr, diag::ext_delimited_escape_sequence);
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
    if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0)))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
+181 −20
Original line number Diff line number Diff line
@@ -95,6 +95,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
                                  DiagnosticsEngine *Diags,
                                  const LangOptions &Features) {
  const char *EscapeBegin = ThisTokBuf;
  bool Delimited = false;
  bool EndDelimiterFound = false;

  // Skip the '\' char.
  ++ThisTokBuf;
@@ -143,26 +145,47 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
    break;
  case 'x': { // Hex escape.
    ResultChar = 0;
    if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
    if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
      Delimited = true;
      ThisTokBuf++;
      if (*ThisTokBuf == '}') {
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
             diag::err_delimited_escape_empty);
        return ResultChar;
      }
    } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
      if (Diags)
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
             diag::err_hex_escape_no_digits) << "x";
      HadError = true;
      break;
      return ResultChar;
    }

    // Hex escapes are a maximal series of hex digits.
    bool Overflow = false;
    for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
      int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
      if (CharVal == -1) break;
      if (Delimited && *ThisTokBuf == '}') {
        ThisTokBuf++;
        EndDelimiterFound = true;
        break;
      }
      int CharVal = llvm::hexDigitValue(*ThisTokBuf);
      if (CharVal == -1) {
        // Non delimited hex escape sequences stop at the first non-hex digit.
        if (!Delimited)
          break;
        HadError = true;
        if (Diags)
          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
               diag::err_delimited_escape_invalid)
              << StringRef(ThisTokBuf, 1);
        continue;
      }
      // About to shift out a digit?
      if (ResultChar & 0xF0000000)
        Overflow = true;
      ResultChar <<= 4;
      ResultChar |= CharVal;
    }

    // See if any bits will be truncated when evaluated as a character.
    if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
      Overflow = true;
@@ -170,9 +193,13 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
    }

    // Check for overflow.
    if (Overflow && Diags)   // Too many digits to fit in
    if (!HadError && Overflow) { // Too many digits to fit in
      HadError = true;
      if (Diags)
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
           diag::err_escape_too_large) << 0;
             diag::err_escape_too_large)
            << 0;
    }
    break;
  }
  case '0': case '1': case '2': case '3':
@@ -200,7 +227,58 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
    }
    break;
  }
  case 'o': {
    bool Overflow = false;
    if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
      HadError = true;
      if (Diags)
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
             diag::err_delimited_escape_missing_brace);

      break;
    }
    ResultChar = 0;
    Delimited = true;
    ++ThisTokBuf;
    if (*ThisTokBuf == '}') {
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
           diag::err_delimited_escape_empty);
      return ResultChar;
    }

    while (ThisTokBuf != ThisTokEnd) {
      if (*ThisTokBuf == '}') {
        EndDelimiterFound = true;
        ThisTokBuf++;
        break;
      }
      if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
        HadError = true;
        if (Diags)
          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
               diag::err_delimited_escape_invalid)
              << StringRef(ThisTokBuf, 1);
        ThisTokBuf++;
        continue;
      }
      if (ResultChar & 0x020000000)
        Overflow = true;

      ResultChar <<= 3;
      ResultChar |= *ThisTokBuf++ - '0';
    }
    // Check for overflow.  Reject '\777', but not L'\777'.
    if (!HadError &&
        (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
      HadError = true;
      if (Diags)
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
             diag::err_escape_too_large)
            << 1;
      ResultChar &= ~0U >> (32 - CharWidth);
    }
    break;
  }
    // Otherwise, these are not valid escapes.
  case '(': case '{': case '[': case '%':
    // GCC accepts these as extensions.  We warn about them as such though.
@@ -224,6 +302,17 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
    break;
  }

  if (Delimited && Diags) {
    if (!EndDelimiterFound)
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
           diag::err_expected)
          << tok::r_brace;
    else if (!HadError) {
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
           diag::ext_delimited_escape_sequence);
    }
  }

  return ResultChar;
}

@@ -245,18 +334,32 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
    }

    ++I;
    assert(*I == 'u' || *I == 'U');
    char Kind = *I;
    ++I;

    assert(Kind == 'u' || Kind == 'U');
    uint32_t CodePoint = 0;

    if (Kind == 'u' && *I == '{') {
      for (++I; *I != '}'; ++I) {
        unsigned Value = llvm::hexDigitValue(*I);
        assert(Value != -1U);
        CodePoint <<= 4;
        CodePoint += Value;
      }
      appendCodePoint(CodePoint, Buf);
      continue;
    }

    unsigned NumHexDigits;
    if (*I == 'u')
    if (Kind == 'u')
      NumHexDigits = 4;
    else
      NumHexDigits = 8;

    assert(I + NumHexDigits <= E);

    uint32_t CodePoint = 0;
    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
    for (; NumHexDigits != 0; ++I, --NumHexDigits) {
      unsigned Value = llvm::hexDigitValue(*I);
      assert(Value != -1U);

@@ -282,28 +385,82 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
  // Skip the '\u' char's.
  ThisTokBuf += 2;

  if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
  bool Delimited = false;
  bool EndDelimiterFound = false;
  bool HasError = false;

  if (UcnBegin[1] == 'u' && in_char_string_literal &&
      ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
    Delimited = true;
    ThisTokBuf++;
  } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
    if (Diags)
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
           diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
    return false;
  }
  UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
  unsigned short UcnLenSave = UcnLen;
  for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
    int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
    if (CharVal == -1) break;

  bool Overflow = false;
  unsigned short Count = 0;
  for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
       ++ThisTokBuf) {
    if (Delimited && *ThisTokBuf == '}') {
      ++ThisTokBuf;
      EndDelimiterFound = true;
      break;
    }
    int CharVal = llvm::hexDigitValue(*ThisTokBuf);
    if (CharVal == -1) {
      HasError = true;
      if (!Delimited)
        break;
      if (Diags) {
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
             diag::err_delimited_escape_invalid)
            << StringRef(ThisTokBuf, 1);
      }
      Count++;
      continue;
    }
    if (UcnVal & 0xF0000000) {
      Overflow = true;
      continue;
    }
    UcnVal <<= 4;
    UcnVal |= CharVal;
    Count++;
  }

  if (Overflow) {
    if (Diags)
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
           diag::err_escape_too_large)
          << 0;
    return false;
  }

  if (Delimited && !EndDelimiterFound) {
    if (Diags) {
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
           diag::err_expected)
          << tok::r_brace;
    }
    return false;
  }

  // If we didn't consume the proper number of digits, there is a problem.
  if (UcnLenSave) {
  if (Count == 0 || (!Delimited && Count != UcnLen)) {
    if (Diags)
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
           diag::err_ucn_escape_incomplete);
           Delimited ? diag::err_delimited_escape_empty
                     : diag::err_ucn_escape_incomplete);
    return false;
  }

  if (HasError)
    return false;

  // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
  if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
      UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
@@ -338,6 +495,10 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
         diag::warn_ucn_not_valid_in_c89_literal);

  if (Delimited && Diags)
    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
         diag::ext_delimited_escape_sequence);

  return true;
}

+81 −0
Original line number Diff line number Diff line
// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s
// RUN: %clang_cc1 -x c -fsyntax-only -pedantic -verify %s
// RUN: %clang_cc1 -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s
// RUN: %clang_cc1 -x c -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s

// Malformed delimited escapes inside string literals. For each of the three
// prefixes (u, x, o) this covers: empty braces, a missing closing brace, and a
// character that is not a valid digit for that escape kind.
const char *errors =
    "\u{}"  //expected-error {{delimited escape sequence cannot be empty}}
    "\u{"   //expected-error {{expected '}'}}
    "\u{h}" //expected-error {{invalid digit 'h' in escape sequence}}
    "\x{}"  //expected-error {{delimited escape sequence cannot be empty}}
    "\x{"   //expected-error {{expected '}'}}
    "\x{h}" //expected-error {{invalid digit 'h' in escape sequence}}
    "\o{}"  //expected-error {{delimited escape sequence cannot be empty}}
    "\o{"   //expected-error {{expected '}'}}
    "\o{8}" //expected-error {{invalid digit '8' in escape sequence}}
    ;

// Delimited universal character names in character literals: a code point
// that does not fit a plain char literal, accepted forms (including one with
// many leading zeros), a control character that is rejected only when the file
// is compiled as C, and a value with too many significant hex digits.
void ucn() {
  char a = '\u{1234}'; // expected-error {{character too large for enclosing character literal type}}
                       // expected-warning@-1 {{delimited escape sequences are a Clang extension}}

  unsigned b = U'\u{1234}'; // expected-warning {{extension}}

#ifdef __cplusplus
  unsigned b2 = U'\u{1}'; // expected-warning {{extension}}
#else
  unsigned b2 = U'\u{1}';     //expected-error {{universal character name refers to a control character}}
#endif

  unsigned c = U'\u{000000000001234}'; // expected-warning {{extension}}
  unsigned d = U'\u{111111111}';       //expected-error {{hex escape sequence out of range}}
}

// Delimited hexadecimal escapes in character literals: a valid value, a
// sequence where every invalid digit is diagnosed separately, and range checks
// against the width of the literal's character type. The wide-literal cases
// are selected by __WCHAR_MAX__ because the RUN lines above build with both a
// default and a 16-bit wchar_t.
void hex() {
  char a = '\x{1}';             // expected-warning {{extension}}
  char b = '\x{abcdegggggabc}'; // expected-error 5{{invalid digit 'g' in escape sequence}}
  char c = '\x{ff1}';           // expected-error {{hex escape sequence out of range}}

#if __WCHAR_MAX__ > 0xFFFF
  unsigned d = L'\x{FFFFFFFF}';  // expected-warning {{extension}}
  unsigned e = L'\x{100000000}'; // expected-error {{hex escape sequence out of range}}
#else
  unsigned f = L'\x{FFFF}';   // expected-warning {{extension}}
  unsigned g = L'\x{10000}';  // expected-error {{hex escape sequence out of range}}
#endif
  unsigned h = U'\x{FFFFFFFF}';  // expected-warning {{extension}}
  unsigned i = U'\x{100000000}'; // expected-error {{hex escape sequence out of range}}
}

// Delimited octal escapes in character literals: a valid value, a sequence
// where every non-octal digit is diagnosed separately, and range checks
// against the width of the literal's character type (the wide-literal limits
// are selected by __WCHAR_MAX__).
void octal() {
  char a = '\o{1}';              // expected-warning {{extension}}
  char b = '\o{12345678881238}'; // expected-error 4{{invalid digit '8' in escape sequence}}
  char c = '\o{777}';            // //expected-error {{octal escape sequence out of range}}
#if __WCHAR_MAX__ > 0xFFFF
  unsigned d = L'\o{37777777777}'; // expected-warning {{extension}}
  unsigned e = L'\o{40000000000}'; // expected-error {{octal escape sequence out of range}}
#else
  unsigned d = L'\o{177777}'; // expected-warning {{extension}}
  unsigned e = L'\o{200000}'; // expected-error {{octal escape sequence out of range}}
#endif
}

// A delimited escape has to be closed inside the string literal in which it
// was opened: splitting it across adjacent literals (either right after the
// opening brace or right before the closing one) is diagnosed as a missing
// closing brace.
void concat() {
  (void)"\x{" "12}"; // expected-error {{expected '}'}}
  (void)"\u{" "12}"; // expected-error {{expected '}'}}
  (void)"\o{" "12}"; // expected-error {{expected '}'}}

  (void)"\x{12" "}"; // expected-error {{expected '}'}}
  (void)"\u{12" "}"; // expected-error {{expected '}'}}
  (void)"\o{12" "}"; // expected-error {{expected '}'}}
}

// A single quote is not accepted as a digit separator inside a delimited
// escape. In a string literal it is reported as an invalid digit; in a
// character literal the quote ends the literal early, which produces the
// follow-on diagnostics listed on the last statement.
void separators() {
  (void)"\x{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
  (void)"\u{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
  (void)"\o{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}

  '\x{12'3'}';   // expected-error {{expected '}'}}
                 // expected-error@-1 2{{expected ';'}}
                 // expected-warning@-2 3{{expression result unused}}
}
+3 −0
Original line number Diff line number Diff line
@@ -129,6 +129,9 @@ int operator""_\U00010000(char) {} // expected-error {{redefinition of 'operator
int operator""_""_\u212e""_\U0000212e""(const char*, size_t);
int operator""_\u212e""_\U0000212e""_""(const char*, size_t);
int operator""_\U0000212e""_""_\u212e""(const char*, size_t);

int operator""_\u{212f}(char);

int mix_ucn_utf8 = ""_""_\u212e""_\U0000212e"";

void operator""_""_(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
Loading