Commit 8fa5e98f authored by paulhoad's avatar paulhoad
Browse files

[clang-format] Remove duplciate code from Invalid BOM detection

Summary:
Review comments on {D68767} asked that this duplicated code in clang-format was moved to one central location that being SourceManager (where it had originally be copied from I assume)

Moved function into static function  ContentCache::getInvalidBOM(...)  - (closest class to where it was defined before)
Updated clang-format to call this static function

Added unit tests for said new function in BasicTests

Sorry not my normal code area so may have the wrong reviewers. (but your names were on the recent history)

Reviewers: bruno, arphaman, klimek, owenpan, mitchell-stellar, dexonsmith

Reviewed By: owenpan

Subscribers: cfe-commits

Tags: #clang, #clang-format, #clang-tools-extra

Differential Revision: https://reviews.llvm.org/D68914
parent 78700ef8
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -226,6 +226,10 @@ namespace SrcMgr {
    bool shouldFreeBuffer() const {
      return (Buffer.getInt() & DoNotFreeFlag) == 0;
    }

    // If BufStr has an invalid BOM, returns the BOM name; otherwise, returns
    // nullptr
    static const char *getInvalidBOM(StringRef BufStr);
  };

  // Assert that the \c ContentCache objects will always be 8-byte aligned so
+24 −14
Original line number Diff line number Diff line
@@ -95,6 +95,29 @@ void ContentCache::replaceBuffer(const llvm::MemoryBuffer *B, bool DoNotFree) {
  Buffer.setInt((B && DoNotFree) ? DoNotFreeFlag : 0);
}

const char *ContentCache::getInvalidBOM(StringRef BufStr) {
  // If the buffer is valid, check to see if it has a UTF Byte Order Mark
  // (BOM).  We only support UTF-8 with and without a BOM right now.  See
  // http://en.wikipedia.org/wiki/Byte_order_mark for more information.
  const char *InvalidBOM =
      llvm::StringSwitch<const char *>(BufStr)
          .StartsWith(llvm::StringLiteral::withInnerNUL("\x00\x00\xFE\xFF"),
                      "UTF-32 (BE)")
          .StartsWith(llvm::StringLiteral::withInnerNUL("\xFF\xFE\x00\x00"),
                      "UTF-32 (LE)")
          .StartsWith("\xFE\xFF", "UTF-16 (BE)")
          .StartsWith("\xFF\xFE", "UTF-16 (LE)")
          .StartsWith("\x2B\x2F\x76", "UTF-7")
          .StartsWith("\xF7\x64\x4C", "UTF-1")
          .StartsWith("\xDD\x73\x66\x73", "UTF-EBCDIC")
          .StartsWith("\x0E\xFE\xFF", "SCSU")
          .StartsWith("\xFB\xEE\x28", "BOCU-1")
          .StartsWith("\x84\x31\x95\x33", "GB-18030")
          .Default(nullptr);

  return InvalidBOM;
}

const llvm::MemoryBuffer *ContentCache::getBuffer(DiagnosticsEngine &Diag,
                                                  FileManager &FM,
                                                  SourceLocation Loc,
@@ -190,20 +213,7 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(DiagnosticsEngine &Diag,
  // (BOM).  We only support UTF-8 with and without a BOM right now.  See
  // http://en.wikipedia.org/wiki/Byte_order_mark for more information.
  StringRef BufStr = Buffer.getPointer()->getBuffer();
  const char *InvalidBOM = llvm::StringSwitch<const char *>(BufStr)
    .StartsWith(llvm::StringLiteral::withInnerNUL("\x00\x00\xFE\xFF"),
                                                  "UTF-32 (BE)")
    .StartsWith(llvm::StringLiteral::withInnerNUL("\xFF\xFE\x00\x00"),
                                                  "UTF-32 (LE)")
    .StartsWith("\xFE\xFF", "UTF-16 (BE)")
    .StartsWith("\xFF\xFE", "UTF-16 (LE)")
    .StartsWith("\x2B\x2F\x76", "UTF-7")
    .StartsWith("\xF7\x64\x4C", "UTF-1")
    .StartsWith("\xDD\x73\x66\x73", "UTF-EBCDIC")
    .StartsWith("\x0E\xFE\xFF", "SCSU")
    .StartsWith("\xFB\xEE\x28", "BOCU-1")
    .StartsWith("\x84\x31\x95\x33", "GB-18030")
    .Default(nullptr);
  const char *InvalidBOM = getInvalidBOM(BufStr);

  if (InvalidBOM) {
    Diag.Report(Loc, diag::err_unsupported_bom)
+1 −26
Original line number Diff line number Diff line
@@ -289,31 +289,6 @@ static void outputReplacementsXML(const Replacements &Replaces) {
  }
}

// If BufStr has an invalid BOM, returns the BOM name; otherwise, returns
// nullptr.
static const char *getInValidBOM(StringRef BufStr) {
  // Check to see if the buffer has a UTF Byte Order Mark (BOM).
  // We only support UTF-8 with and without a BOM right now.  See
  // https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding
  // for more information.
  const char *InvalidBOM =
      llvm::StringSwitch<const char *>(BufStr)
          .StartsWith(llvm::StringLiteral::withInnerNUL("\x00\x00\xFE\xFF"),
                      "UTF-32 (BE)")
          .StartsWith(llvm::StringLiteral::withInnerNUL("\xFF\xFE\x00\x00"),
                      "UTF-32 (LE)")
          .StartsWith("\xFE\xFF", "UTF-16 (BE)")
          .StartsWith("\xFF\xFE", "UTF-16 (LE)")
          .StartsWith("\x2B\x2F\x76", "UTF-7")
          .StartsWith("\xF7\x64\x4C", "UTF-1")
          .StartsWith("\xDD\x73\x66\x73", "UTF-EBCDIC")
          .StartsWith("\x0E\xFE\xFF", "SCSU")
          .StartsWith("\xFB\xEE\x28", "BOCU-1")
          .StartsWith("\x84\x31\x95\x33", "GB-18030")
          .Default(nullptr);
  return InvalidBOM;
}

static bool
emitReplacementWarnings(const Replacements &Replaces, StringRef AssumedFileName,
                        const std::unique_ptr<llvm::MemoryBuffer> &Code) {
@@ -412,7 +387,7 @@ static bool format(StringRef FileName) {

  StringRef BufStr = Code->getBuffer();

  const char *InvalidBOM = getInValidBOM(BufStr);
  const char *InvalidBOM = SrcMgr::ContentCache::getInvalidBOM(BufStr);

  if (InvalidBOM) {
    errs() << "error: encoding with unsupported byte order mark \""
+41 −0
Original line number Diff line number Diff line
@@ -200,6 +200,47 @@ TEST_F(SourceManagerTest, locationPrintTest) {
            "</mainFile.cpp:1:1, /test-header.h:1:1>");
}

TEST_F(SourceManagerTest, getInvalidBOM) {
  ASSERT_EQ(SrcMgr::ContentCache::getInvalidBOM(""), nullptr);
  ASSERT_EQ(SrcMgr::ContentCache::getInvalidBOM("\x00\x00\x00"), nullptr);
  ASSERT_EQ(SrcMgr::ContentCache::getInvalidBOM("\xFF\xFF\xFF"), nullptr);
  ASSERT_EQ(SrcMgr::ContentCache::getInvalidBOM("#include <iostream>"),
            nullptr);

  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
                "\xFE\xFF#include <iostream>")),
            "UTF-16 (BE)");
  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
                "\xFF\xFE#include <iostream>")),
            "UTF-16 (LE)");
  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
                "\x2B\x2F\x76#include <iostream>")),
            "UTF-7");
  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
                "\xF7\x64\x4C#include <iostream>")),
            "UTF-1");
  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
                "\xDD\x73\x66\x73#include <iostream>")),
            "UTF-EBCDIC");
  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
                "\x0E\xFE\xFF#include <iostream>")),
            "SCSU");
  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
                "\xFB\xEE\x28#include <iostream>")),
            "BOCU-1");
  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
                "\x84\x31\x95\x33#include <iostream>")),
            "GB-18030");
  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
                llvm::StringLiteral::withInnerNUL(
                    "\x00\x00\xFE\xFF#include <iostream>"))),
            "UTF-32 (BE)");
  ASSERT_EQ(StringRef(SrcMgr::ContentCache::getInvalidBOM(
                llvm::StringLiteral::withInnerNUL(
                    "\xFF\xFE\x00\x00#include <iostream>"))),
            "UTF-32 (LE)");
}

#if defined(LLVM_ON_UNIX)

TEST_F(SourceManagerTest, getMacroArgExpandedLocation) {