Unverified Commit 4480e650 authored by akirchhoff-modular's avatar akirchhoff-modular Committed by GitHub
Browse files

[YAMLParser] Improve plain scalar spec compliance (#68946)

The `YAMLParser.h` header file claims support for YAML 1.2 with a few
deviations, but our plain scalar parsing failed to parse some valid YAML
according to the spec. This change puts us more in compliance with the
YAML spec, now letting us parse plain scalars containing additional
special characters in cases where they are not ambiguous.
parent 2a40ec2d
Loading
Loading
Loading
Loading
+44 −27
Original line number Diff line number Diff line
@@ -392,6 +392,10 @@ private:
  ///        Pos is whitespace or a new line
  bool isBlankOrBreak(StringRef::iterator Position);

  /// Return true if the minimal well-formed code unit subsequence at
  ///        Pos is considered a "safe" character for plain scalars.
  bool isPlainSafeNonBlank(StringRef::iterator Position);

  /// Return true if the line is a line break, false otherwise.
  bool isLineEmpty(StringRef Line);

@@ -545,6 +549,10 @@ private:
  /// Can the next token be the start of a simple key?
  bool IsSimpleKeyAllowed;

  /// Can the next token be a value indicator even if it does not have a
  /// trailing space?
  bool IsAdjacentValueAllowedInFlow;

  /// True if an error has occurred.
  bool Failed;

@@ -868,6 +876,7 @@ void Scanner::init(MemoryBufferRef Buffer) {
  FlowLevel = 0;
  IsStartOfStream = true;
  IsSimpleKeyAllowed = true;
  IsAdjacentValueAllowedInFlow = false;
  Failed = false;
  std::unique_ptr<MemoryBuffer> InputBufferOwner =
      MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false);
@@ -1049,6 +1058,15 @@ bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
         *Position == '\n';
}

bool Scanner::isPlainSafeNonBlank(StringRef::iterator Position) {
  if (Position == End || isBlankOrBreak(Position))
    return false;
  if (FlowLevel &&
      StringRef(Position, 1).find_first_of(",[]{}") != StringRef::npos)
    return false;
  return true;
}

bool Scanner::isLineEmpty(StringRef Line) {
  for (const auto *Position = Line.begin(); Position != Line.end(); ++Position)
    if (!isBlankOrBreak(Position))
@@ -1189,6 +1207,7 @@ bool Scanner::scanStreamEnd() {
  unrollIndent(-1);
  SimpleKeys.clear();
  IsSimpleKeyAllowed = false;
  IsAdjacentValueAllowedInFlow = false;

  Token T;
  T.Kind = Token::TK_StreamEnd;
@@ -1202,6 +1221,7 @@ bool Scanner::scanDirective() {
  unrollIndent(-1);
  SimpleKeys.clear();
  IsSimpleKeyAllowed = false;
  IsAdjacentValueAllowedInFlow = false;

  StringRef::iterator Start = Current;
  consume('%');
@@ -1233,6 +1253,7 @@ bool Scanner::scanDocumentIndicator(bool IsStart) {
  unrollIndent(-1);
  SimpleKeys.clear();
  IsSimpleKeyAllowed = false;
  IsAdjacentValueAllowedInFlow = false;

  Token T;
  T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
@@ -1255,6 +1276,8 @@ bool Scanner::scanFlowCollectionStart(bool IsSequence) {

  // And may also be followed by a simple key.
  IsSimpleKeyAllowed = true;
  // Adjacent values are allowed in flows only after JSON-style keys.
  IsAdjacentValueAllowedInFlow = false;
  ++FlowLevel;
  return true;
}
@@ -1262,6 +1285,7 @@ bool Scanner::scanFlowCollectionStart(bool IsSequence) {
bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  IsSimpleKeyAllowed = false;
  IsAdjacentValueAllowedInFlow = true;
  Token T;
  T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
                      : Token::TK_FlowMappingEnd;
@@ -1276,6 +1300,7 @@ bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
bool Scanner::scanFlowEntry() {
  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  IsSimpleKeyAllowed = true;
  IsAdjacentValueAllowedInFlow = false;
  Token T;
  T.Kind = Token::TK_FlowEntry;
  T.Range = StringRef(Current, 1);
@@ -1288,6 +1313,7 @@ bool Scanner::scanBlockEntry() {
  rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  IsSimpleKeyAllowed = true;
  IsAdjacentValueAllowedInFlow = false;
  Token T;
  T.Kind = Token::TK_BlockEntry;
  T.Range = StringRef(Current, 1);
@@ -1302,6 +1328,7 @@ bool Scanner::scanKey() {

  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  IsSimpleKeyAllowed = !FlowLevel;
  IsAdjacentValueAllowedInFlow = false;

  Token T;
  T.Kind = Token::TK_Key;
@@ -1339,6 +1366,7 @@ bool Scanner::scanValue() {
      rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
    IsSimpleKeyAllowed = !FlowLevel;
  }
  IsAdjacentValueAllowedInFlow = false;

  Token T;
  T.Kind = Token::TK_Value;
@@ -1420,6 +1448,7 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);

  IsSimpleKeyAllowed = false;
  IsAdjacentValueAllowedInFlow = true;

  return true;
}
@@ -1434,21 +1463,9 @@ bool Scanner::scanPlainScalar() {
    if (*Current == '#')
      break;

    while (Current != End && !isBlankOrBreak(Current)) {
      if (FlowLevel && *Current == ':' &&
          (Current + 1 == End ||
           !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) {
        setError("Found unexpected ':' while scanning a plain scalar", Current);
        return false;
      }

      // Check for the end of the plain scalar.
      if (  (*Current == ':' && isBlankOrBreak(Current + 1))
          || (  FlowLevel
          && (StringRef(Current, 1).find_first_of(",:?[]{}")
              != StringRef::npos)))
        break;

    while (Current != End &&
           ((*Current != ':' && isPlainSafeNonBlank(Current)) ||
            (*Current == ':' && isPlainSafeNonBlank(Current + 1)))) {
      StringRef::iterator i = skip_nb_char(Current);
      if (i == Current)
        break;
@@ -1499,6 +1516,7 @@ bool Scanner::scanPlainScalar() {
  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);

  IsSimpleKeyAllowed = false;
  IsAdjacentValueAllowedInFlow = false;

  return true;
}
@@ -1534,6 +1552,7 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) {
  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);

  IsSimpleKeyAllowed = false;
  IsAdjacentValueAllowedInFlow = false;

  return true;
}
@@ -1766,6 +1785,7 @@ bool Scanner::scanBlockScalar(bool IsLiteral) {
  // New lines may start a simple key.
  if (!FlowLevel)
    IsSimpleKeyAllowed = true;
  IsAdjacentValueAllowedInFlow = false;

  Token T;
  T.Kind = Token::TK_BlockScalar;
@@ -1799,6 +1819,7 @@ bool Scanner::scanTag() {
  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);

  IsSimpleKeyAllowed = false;
  IsAdjacentValueAllowedInFlow = false;

  return true;
}
@@ -1848,13 +1869,14 @@ bool Scanner::fetchMoreTokens() {
  if (*Current == ',')
    return scanFlowEntry();

  if (*Current == '-' && isBlankOrBreak(Current + 1))
  if (*Current == '-' && (isBlankOrBreak(Current + 1) || Current + 1 == End))
    return scanBlockEntry();

  if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
  if (*Current == '?' && (Current + 1 == End || isBlankOrBreak(Current + 1)))
    return scanKey();

  if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
  if (*Current == ':' &&
      (!isPlainSafeNonBlank(Current + 1) || IsAdjacentValueAllowedInFlow))
    return scanValue();

  if (*Current == '*')
@@ -1880,15 +1902,10 @@ bool Scanner::fetchMoreTokens() {

  // Get a plain scalar.
  StringRef FirstChar(Current, 1);
  if (!(isBlankOrBreak(Current)
        || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
      || (*Current == '-' && !isBlankOrBreak(Current + 1))
      || (!FlowLevel && (*Current == '?' || *Current == ':')
          && isBlankOrBreak(Current + 1))
      || (!FlowLevel && *Current == ':'
                      && Current + 2 < End
                      && *(Current + 1) == ':'
                      && !isBlankOrBreak(Current + 2)))
  if ((!isBlankOrBreak(Current) &&
       FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") == StringRef::npos) ||
      (FirstChar.find_first_of("?:-") != StringRef::npos &&
       isPlainSafeNonBlank(Current + 1)))
    return scanPlainScalar();

  setError("Unrecognized character while tokenizing.", Current);
+2 −2
Original line number Diff line number Diff line
:# RUN: not llc -run-pass=none %s -o - 2>&1 | FileCheck %s
@# RUN: not llc -run-pass=none %s -o - 2>&1 | FileCheck %s

# The : before the run comment is syntactically invalid. This used to
# The @ before the run comment is syntactically invalid. This used to
# crash in the SourceMgr diagnostic printer because it was called
# before the LLVMContext was initialized.

+30 −0
Original line number Diff line number Diff line
# RUN: yaml-bench -canonical %s | FileCheck %s
# Example from https://yaml.org/spec/1.2.2/#example-plain-characters

# Outside flow collection:
- ::vector
- ": - ()"
- Up, up, and away!
- -123
- https://example.com/foo#bar
# Inside flow collection:
- [ ::vector,
  ": - ()",
  "Up, up and away!",
  -123,
  https://example.com/foo#bar ]

# CHECK: !!seq [
# CHECK-NEXT:   !!str "::vector",
# CHECK-NEXT:   !!str ": - ()",
# CHECK-NEXT:   !!str "Up, up, and away!",
# CHECK-NEXT:   !!str "-123",
# CHECK-NEXT:   !!str "https://example.com/foo#bar",
# CHECK-NEXT:   !!seq [
# CHECK-NEXT:     !!str "::vector",
# CHECK-NEXT:     !!str ": - ()",
# CHECK-NEXT:     !!str "Up, up and away!",
# CHECK-NEXT:     !!str "-123",
# CHECK-NEXT:     !!str "https://example.com/foo#bar",
# CHECK-NEXT:   ],
# CHECK-NEXT: ]
+1 −1
Original line number Diff line number Diff line
@@ -3156,7 +3156,7 @@ TEST(YAMLIO, TestFlowSequenceTokenErrors) {

TEST(YAMLIO, TestDirectiveMappingNoValue) {
  Input yin("%YAML\n{5:");
  EXPECT_FALSE(yin.setCurrentDocument());
  yin.setCurrentDocument();
  EXPECT_TRUE(yin.error());

  Input yin2("%TAG\n'\x98!< :\n");
+60 −1
Original line number Diff line number Diff line
@@ -47,6 +47,10 @@ TEST(YAMLParser, ParsesEmptyArray) {
  ExpectParseSuccess("Empty array", "[]");
}

TEST(YAMLParser, ParsesComplexMap) {
  ExpectParseSuccess("Complex block map", "? a\n: b");
}

TEST(YAMLParser, FailsIfNotClosingArray) {
  ExpectParseError("Not closing array", "[");
  ExpectParseError("Not closing array", "  [  ");
@@ -82,7 +86,10 @@ TEST(YAMLParser, FailsIfMissingColon) {
}

TEST(YAMLParser, FailsOnMissingQuote) {
  ExpectParseError("Missing open quote", "[{a\":\"b\"}]");
  // Missing open quote counts as a plain scalar per YAML spec
  // (Following is equivalent to JSON [{"a\":\"b\"": null}])
  ExpectParseSuccess("Missing open quote", "[{a\":\"b\"}]");
  // Closing quote is more strict -- plain scalars cannot start with a quote
  ExpectParseError("Missing closing quote", "[{\"a\":\"b}]");
}

@@ -128,6 +135,48 @@ TEST(YAMLParser, ParsesArrayOfArrays) {
  ExpectParseSuccess("Array of arrays", "[[]]");
}

TEST(YAMLParser, ParsesPlainScalars) {
  ExpectParseSuccess("Plain scalar", "hello");
  ExpectParseSuccess("Plain scalar beginning with a question mark", "?hello");
  ExpectParseSuccess("Plain scalar beginning with a colon", ":hello");
  ExpectParseSuccess("Plain scalar beginning with two colons", "::hello");
  ExpectParseSuccess("Plain scalar beginning with a hyphen", "-hello");
  ExpectParseSuccess("Multi-line plain scalar", "Hello\nworld");
  ExpectParseSuccess("Plain scalar with indicator characters",
                     "He-!l*lo, []world{}");
  ExpectParseSuccess("Plain scalar with indicator characters used as block key",
                     "He-!l*lo, []world{}: value");
  ExpectParseSuccess("Plain scalar in flow sequence", "hello");
  ExpectParseSuccess(
      "Plain scalar beginning with a question mark in flow sequence",
      "[ ?hello ]");
  ExpectParseSuccess("Plain scalar beginning with a colon in flow sequence",
                     "[ :hello ]");
  ExpectParseSuccess("Plain scalar beginning with two colons in flow sequence",
                     "[ ::hello ]");
  ExpectParseSuccess("Plain scalar beginning with a hyphen in flow sequence",
                     "[ -hello ]");
  ExpectParseSuccess("Multi-line plain scalar in flow sequence",
                     "[ Hello\nworld ]");
  ExpectParseSuccess(
      "Plain scalar with non-flow indicator characters in flow sequence",
      "[ He-!l*lo, world ]");
  ExpectParseSuccess(
      "Plain scalar with non-flow indicator characters used as flow key",
      "{ He-!l*lo, world: value } ");
  ExpectParseError(
      "Plain scalar with flow indicator characters inside flow sequence",
      "[ Hello[world ]");
  ExpectParseError(
      "Plain scalar with flow indicator characters inside flow key",
      "{ Hello[world: value }");
  // Multi-line plain scalar in keys is strictly invalid per the spec, but many
  // implementations accept it in flow keys nonetheless.  Block keys are not
  // accepted by any other implementation I can find.
  ExpectParseSuccess("Multi-line plain scalar in block key", "a\nb: c");
  ExpectParseSuccess("Multi-line plain scalar in flow key", "{\na\nb: c\n}");
}

TEST(YAMLParser, ParsesBlockLiteralScalars) {
  ExpectParseSuccess("Block literal scalar", "test: |\n  Hello\n  World\n");
  ExpectParseSuccess("Block literal scalar EOF", "test: |\n  Hello\n  World");
@@ -176,6 +225,10 @@ TEST(YAMLParser, HandlesEndOfFileGracefully) {
  ExpectParseError("In array hitting EOF", "[[] ");
  ExpectParseError("In array hitting EOF", "[[]");
  ExpectParseError("In object hitting EOF", "{\"\"");
  // This one is valid, equivalent to the JSON {"": null}
  ExpectParseSuccess("In complex block map hitting EOF", "?");
  // Equivalent to JSON [null]
  ExpectParseSuccess("In block sequence hitting EOF", "-");
}

TEST(YAMLParser, HandlesNullValuesInKeyValueNodesGracefully) {
@@ -183,6 +236,12 @@ TEST(YAMLParser, HandlesNullValuesInKeyValueNodesGracefully) {
  ExpectParseError("KeyValueNode with null value", "test: '");
}

TEST(YAMLParser, BlockSequenceEOF) {
  SourceMgr SM;
  yaml::Stream Stream("-", SM);
  EXPECT_TRUE(isa_and_present<yaml::SequenceNode>(Stream.begin()->getRoot()));
}

// Checks that the given string can be parsed into an identical string inside
// of an array.
static void ExpectCanParseString(StringRef String) {