Split long lines in multi-line comments.

Summary: This is implementation for /* */ comments only.

Reviewers: djasper, klimek

Reviewed By: djasper

CC: cfe-commits

Differential Revision: http://llvm-reviews.chandlerc.com/D547

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@177415 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp
index a17823f..dd8f6cd 100644
--- a/lib/Format/Format.cpp
+++ b/lib/Format/Format.cpp
@@ -103,13 +103,13 @@
 /// trailing line comments.
 class WhitespaceManager {
 public:
-  WhitespaceManager(SourceManager &SourceMgr) : SourceMgr(SourceMgr) {}
+  WhitespaceManager(SourceManager &SourceMgr, const FormatStyle &Style)
+      : SourceMgr(SourceMgr), Style(Style) {}
 
   /// \brief Replaces the whitespace in front of \p Tok. Only call once for
   /// each \c AnnotatedToken.
   void replaceWhitespace(const AnnotatedToken &Tok, unsigned NewLines,
-                         unsigned Spaces, unsigned WhitespaceStartColumn,
-                         const FormatStyle &Style) {
+                         unsigned Spaces, unsigned WhitespaceStartColumn) {
     // 2+ newlines mean an empty line separating logic scopes.
     if (NewLines >= 2)
       alignComments();
@@ -146,7 +146,7 @@
       alignComments();
 
     if (Tok.Type == TT_BlockComment)
-      indentBlockComment(Tok.FormatTok, Spaces);
+      indentBlockComment(Tok, Spaces, false);
 
     storeReplacement(Tok.FormatTok, getNewLineText(NewLines, Spaces));
   }
@@ -157,11 +157,12 @@
   /// This function and \c replaceWhitespace have the same behavior if
   /// \c Newlines == 0.
   void replacePPWhitespace(const AnnotatedToken &Tok, unsigned NewLines,
-                           unsigned Spaces, unsigned WhitespaceStartColumn,
-                           const FormatStyle &Style) {
-    storeReplacement(
-        Tok.FormatTok,
-        getNewLineText(NewLines, Spaces, WhitespaceStartColumn, Style));
+                           unsigned Spaces, unsigned WhitespaceStartColumn) {
+    if (Tok.Type == TT_BlockComment)
+      indentBlockComment(Tok, Spaces, true);
+
+    storeReplacement(Tok.FormatTok,
+                     getNewLineText(NewLines, Spaces, WhitespaceStartColumn));
   }
 
   /// \brief Inserts a line break into the middle of a token.
@@ -171,19 +172,19 @@
   ///
   /// \p InPPDirective, \p Spaces, \p WhitespaceStartColumn and \p Style are
   /// used to generate the correct line break.
-  void breakToken(const AnnotatedToken &Tok, unsigned Offset, StringRef Prefix,
-                  StringRef Postfix, bool InPPDirective, unsigned Spaces,
-                  unsigned WhitespaceStartColumn, const FormatStyle &Style) {
+  void breakToken(const FormatToken &Tok, unsigned Offset,
+                  unsigned ReplaceChars, StringRef Prefix, StringRef Postfix,
+                  bool InPPDirective, unsigned Spaces,
+                  unsigned WhitespaceStartColumn) {
     std::string NewLineText;
     if (!InPPDirective)
       NewLineText = getNewLineText(1, Spaces);
     else
-      NewLineText = getNewLineText(1, Spaces, WhitespaceStartColumn, Style);
+      NewLineText = getNewLineText(1, Spaces, WhitespaceStartColumn);
     std::string ReplacementText = (Prefix + NewLineText + Postfix).str();
-    SourceLocation InsertAt = Tok.FormatTok.WhiteSpaceStart
-        .getLocWithOffset(Tok.FormatTok.WhiteSpaceLength + Offset);
-    Replaces.insert(
-        tooling::Replacement(SourceMgr, InsertAt, 0, ReplacementText));
+    SourceLocation Location = Tok.Tok.getLocation().getLocWithOffset(Offset);
+    Replaces.insert(tooling::Replacement(SourceMgr, Location, ReplaceChars,
+                                         ReplacementText));
   }
 
   /// \brief Returns all the \c Replacements created during formatting.
@@ -193,33 +194,122 @@
   }
 
 private:
-  void indentBlockComment(const FormatToken &Tok, int Indent) {
-    SourceLocation TokenLoc = Tok.Tok.getLocation();
-    int IndentDelta = Indent - SourceMgr.getSpellingColumnNumber(TokenLoc) + 1;
-    const char *Start = SourceMgr.getCharacterData(TokenLoc);
-    const char *Current = Start;
-    const char *TokEnd = Current + Tok.TokenLength;
-    llvm::SmallVector<SourceLocation, 16> LineStarts;
-    while (Current < TokEnd) {
-      if (*Current == '\n') {
-        ++Current;
-        LineStarts.push_back(TokenLoc.getLocWithOffset(Current - Start));
-        // If we need to outdent the line, check that it's indented enough.
-        for (int i = 0; i < -IndentDelta; ++i, ++Current)
-          if (Current >= TokEnd || *Current != ' ')
-            return;
-      } else {
-        ++Current;
+  /// \brief Finds a common prefix of lines of a block comment to properly
+  /// indent (and possibly decorate with '*'s) added lines.
+  ///
+  /// The first line is ignored (it's special and starts with /*).
+  /// When there are less than three lines, we don't have enough information, so
+  /// better use no prefix.
+  static StringRef findCommentLinesPrefix(ArrayRef<StringRef> Lines,
+                                          const char *PrefixChars = " *") {
+    if (Lines.size() < 3)
+      return "";
+    StringRef Prefix(Lines[1].data(), Lines[1].find_first_not_of(PrefixChars));
+    for (size_t i = 2; i < Lines.size(); ++i) {
+      for (size_t j = 0; j < Prefix.size() && j < Lines[i].size(); ++j) {
+        if (Prefix[j] != Lines[i][j]) {
+          Prefix = Prefix.substr(0, j);
+          break;
+        }
+      }
+    }
+    return Prefix;
+  }
+
+  void splitLineInComment(const FormatToken &Tok, StringRef Line,
+                          size_t StartColumn, StringRef LinePrefix,
+                          bool InPPDirective, bool CommentHasMoreLines,
+                          const char *WhiteSpaceChars = " ") {
+    size_t ColumnLimit =
+        Style.ColumnLimit - LinePrefix.size() - (InPPDirective ? 2 : 0);
+
+    if (Line.size() <= ColumnLimit)
+      return;
+
+    const char *TokenStart = SourceMgr.getCharacterData(Tok.Tok.getLocation());
+    while (Line.rtrim().size() > ColumnLimit) {
+      // Try to break at the last whitespace before the column limit.
+      size_t SpacePos =
+          Line.find_last_of(WhiteSpaceChars, ColumnLimit + 1);
+      if (SpacePos == StringRef::npos) {
+        // Try to find any whitespace in the line.
+        SpacePos = Line.find_first_of(WhiteSpaceChars);
+        if (SpacePos == StringRef::npos) // No whitespace found, give up.
+          break;
+      }
+
+      StringRef NextCut = Line.substr(0, SpacePos).rtrim();
+      StringRef RemainingLine = Line.substr(SpacePos).ltrim();
+      if (RemainingLine.empty())
+        break;
+      Line = RemainingLine;
+
+      size_t ReplaceChars = Line.begin() - NextCut.end();
+      breakToken(Tok, NextCut.end() - TokenStart, ReplaceChars, "", LinePrefix,
+                 InPPDirective, 0,
+                 NextCut.size() + LinePrefix.size() + StartColumn);
+      StartColumn = 0;
+    }
+
+    StringRef TrimmedLine = Line.rtrim();
+    if (TrimmedLine != Line || (InPPDirective && CommentHasMoreLines)) {
+      // Remove trailing whitespace/insert backslash.
+      breakToken(Tok, TrimmedLine.end() - TokenStart,
+                 Line.size() - TrimmedLine.size() + 1, "", "", InPPDirective, 0,
+                 TrimmedLine.size() + LinePrefix.size());
+    }
+  }
+
+  void indentBlockComment(const AnnotatedToken &Tok, int Indent,
+                          bool InPPDirective) {
+    const SourceLocation TokenLoc = Tok.FormatTok.Tok.getLocation();
+    const int CurrentIndent = SourceMgr.getSpellingColumnNumber(TokenLoc) - 1;
+    const int IndentDelta = Indent - CurrentIndent;
+    const StringRef Text(SourceMgr.getCharacterData(TokenLoc),
+                         Tok.FormatTok.TokenLength);
+    assert(Text.startswith("/*") && Text.endswith("*/"));
+
+    SmallVector<StringRef, 16> Lines;
+    Text.split(Lines, "\n");
+
+    if (IndentDelta > 0) {
+      std::string WhiteSpace(IndentDelta, ' ');
+      for (size_t i = 1; i < Lines.size(); ++i) {
+        Replaces.insert(tooling::Replacement(
+            SourceMgr, TokenLoc.getLocWithOffset(Lines[i].data() - Text.data()),
+            0, WhiteSpace));
+      }
+    } else if (IndentDelta < 0) {
+      std::string WhiteSpace(-IndentDelta, ' ');
+      // Check that the line is indented enough.
+      for (size_t i = 1; i < Lines.size(); ++i) {
+        if (!Lines[i].startswith(WhiteSpace))
+          return;
+      }
+      for (size_t i = 1; i < Lines.size(); ++i) {
+        Replaces.insert(tooling::Replacement(
+            SourceMgr, TokenLoc.getLocWithOffset(Lines[i].data() - Text.data()),
+            -IndentDelta, ""));
       }
     }
 
-    for (size_t i = 0; i < LineStarts.size(); ++i) {
-      if (IndentDelta > 0)
-        Replaces.insert(tooling::Replacement(SourceMgr, LineStarts[i], 0,
-                                             std::string(IndentDelta, ' ')));
-      else if (IndentDelta < 0)
-        Replaces.insert(
-            tooling::Replacement(SourceMgr, LineStarts[i], -IndentDelta, ""));
+    // Split long lines in comments.
+    const StringRef CurrentPrefix = findCommentLinesPrefix(Lines);
+    size_t PrefixSize = CurrentPrefix.size();
+    std::string NewPrefix =
+        (IndentDelta < 0) ? CurrentPrefix.substr(-IndentDelta).str()
+                          : std::string(IndentDelta, ' ') + CurrentPrefix.str();
+
+    if (CurrentPrefix.endswith("*")) {
+      NewPrefix += " ";
+      ++PrefixSize;
+    }
+
+    for (size_t i = 0; i < Lines.size(); ++i) {
+      StringRef Line = (i == 0) ? Lines[i] : Lines[i].substr(PrefixSize);
+      size_t StartColumn = (i == 0) ? CurrentIndent : 0;
+      splitLineInComment(Tok.FormatTok, Line, StartColumn, NewPrefix,
+                         InPPDirective, i != Lines.size() - 1);
     }
   }
 
@@ -227,9 +317,8 @@
     return std::string(NewLines, '\n') + std::string(Spaces, ' ');
   }
 
-  std::string
-  getNewLineText(unsigned NewLines, unsigned Spaces,
-                 unsigned WhitespaceStartColumn, const FormatStyle &Style) {
+  std::string getNewLineText(unsigned NewLines, unsigned Spaces,
+                             unsigned WhitespaceStartColumn) {
     std::string NewLineText;
     if (NewLines > 0) {
       unsigned Offset =
@@ -298,6 +387,7 @@
 
   SourceManager &SourceMgr;
   tooling::Replacements Replaces;
+  const FormatStyle &Style;
 };
 
 class UnwrappedLineFormatter {
@@ -576,10 +666,10 @@
                                           Style.MaxEmptyLinesToKeep + 1));
         if (!Line.InPPDirective)
           Whitespaces.replaceWhitespace(Current, NewLines, State.Column,
-                                        WhitespaceStartColumn, Style);
+                                        WhitespaceStartColumn);
         else
           Whitespaces.replacePPWhitespace(Current, NewLines, State.Column,
-                                          WhitespaceStartColumn, Style);
+                                          WhitespaceStartColumn);
       }
 
       State.Stack.back().LastSpace = State.Column;
@@ -614,7 +704,7 @@
       unsigned Spaces = State.NextToken->SpacesRequiredBefore;
 
       if (!DryRun)
-        Whitespaces.replaceWhitespace(Current, 0, Spaces, State.Column, Style);
+        Whitespaces.replaceWhitespace(Current, 0, Spaces, State.Column);
 
       if (Current.Type == TT_ObjCSelectorName &&
           State.Stack.back().ColonPos == 0) {
@@ -756,7 +846,8 @@
     if (Current.isNot(tok::string_literal))
       return 0;
     // Only break up default narrow strings.
-    if (StringRef(Current.FormatTok.Tok.getLiteralData()).find('"') != 0)
+    const char *LiteralData = Current.FormatTok.Tok.getLiteralData();
+    if (!LiteralData || *LiteralData != '"')
       return 0;
 
     unsigned Penalty = 0;
@@ -765,8 +856,7 @@
     unsigned StartColumn = State.Column - Current.FormatTok.TokenLength;
     unsigned OffsetFromStart = 0;
     while (StartColumn + TailLength > getColumnLimit()) {
-      StringRef Text = StringRef(
-          Current.FormatTok.Tok.getLiteralData() + TailOffset, TailLength);
+      StringRef Text = StringRef(LiteralData + TailOffset, TailLength);
       if (StartColumn + OffsetFromStart + 1 > getColumnLimit())
         break;
       StringRef::size_type SplitPoint = getSplitPoint(
@@ -780,9 +870,9 @@
           StartColumn + OffsetFromStart + SplitPoint + 2;
       State.Stack.back().LastSpace = StartColumn;
       if (!DryRun) {
-        Whitespaces.breakToken(Current, TailOffset + SplitPoint + 1, "\"", "\"",
-                               Line.InPPDirective, StartColumn,
-                               WhitespaceStartColumn, Style);
+        Whitespaces.breakToken(Current.FormatTok, TailOffset + SplitPoint + 1,
+                               0, "\"", "\"", Line.InPPDirective, StartColumn,
+                               WhitespaceStartColumn);
       }
       TailOffset += SplitPoint + 1;
       TailLength -= SplitPoint + 1;
@@ -1157,7 +1247,7 @@
             SourceManager &SourceMgr,
             const std::vector<CharSourceRange> &Ranges)
       : Diag(Diag), Style(Style), Lex(Lex), SourceMgr(SourceMgr),
-        Whitespaces(SourceMgr), Ranges(Ranges) {}
+        Whitespaces(SourceMgr, Style), Ranges(Ranges) {}
 
   virtual ~Formatter() {}
 
@@ -1199,7 +1289,7 @@
         if (PreviousLineWasTouched) {
           unsigned NewLines = std::min(FirstTok.NewlinesBefore, 1u);
           Whitespaces.replaceWhitespace(TheLine.First, NewLines, /*Indent*/ 0,
-                                        /*WhitespaceStartColumn*/ 0, Style);
+                                        /*WhitespaceStartColumn*/ 0);
         }
       } else if (TheLine.Type != LT_Invalid &&
                  (WasMoved || touchesLine(TheLine))) {
@@ -1503,10 +1593,10 @@
       Newlines = 1;
 
     if (!InPPDirective || Tok.HasUnescapedNewline) {
-      Whitespaces.replaceWhitespace(RootToken, Newlines, Indent, 0, Style);
+      Whitespaces.replaceWhitespace(RootToken, Newlines, Indent, 0);
     } else {
       Whitespaces.replacePPWhitespace(RootToken, Newlines, Indent,
-                                      PreviousEndOfLineColumn, Style);
+                                      PreviousEndOfLineColumn);
     }
   }
 
diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp
index e6ae81d..a3afbee 100644
--- a/unittests/Format/FormatTest.cpp
+++ b/unittests/Format/FormatTest.cpp
@@ -651,6 +651,134 @@
                    " */"));
 }
 
+TEST_F(FormatTest, SplitsLongLinesInComments) {
+  EXPECT_EQ("/* This is a long\n"
+            "comment that doesn't\n"
+            "fit on one line.  */",
+            format("/* "
+                   "This is a long                                         "
+                   "comment that doesn't                                    "
+                   "fit on one line.  */",
+                   getLLVMStyleWithColumns(20)));
+  EXPECT_EQ("/*\n"
+            "This is a long\n"
+            "comment that doesn't\n"
+            "fit on one line.\n"
+            "*/",
+            format("/*\n"
+                   "This is a long                                         "
+                   "comment that doesn't                                    "
+                   "fit on one line.                                      \n"
+                   "*/", getLLVMStyleWithColumns(20)));
+  EXPECT_EQ("/*\n"
+            " * This is a long\n"
+            " * comment that\n"
+            " * doesn't fit on\n"
+            " * one line.\n"
+            " */",
+            format("/*\n"
+                   " * This is a long "
+                   "   comment that     "
+                   "   doesn't fit on   "
+                   "   one line.                                            \n"
+                   " */", getLLVMStyleWithColumns(20)));
+  EXPECT_EQ("/*\n"
+            " * This_is_a_comment_with_words_that_dont_fit_on_one_line\n"
+            " * so_it_should_be_broken\n"
+            " * wherever_a_space_occurs\n"
+            " */",
+            format("/*\n"
+                   " * This_is_a_comment_with_words_that_dont_fit_on_one_line "
+                   "   so_it_should_be_broken "
+                   "   wherever_a_space_occurs                             \n"
+                   " */",
+                   getLLVMStyleWithColumns(20)));
+  EXPECT_EQ("/*\n"
+            " * This_comment_can_not_be_broken_into_lines\n"
+            " */",
+            format("/*\n"
+                   " * This_comment_can_not_be_broken_into_lines\n"
+                   " */",
+                   getLLVMStyleWithColumns(20)));
+  EXPECT_EQ("{\n"
+            "  /*\n"
+            "  This is another\n"
+            "  long comment that\n"
+            "  doesn't fit on one\n"
+            "  line    1234567890\n"
+            "  */\n"
+            "}",
+            format("{\n"
+                   "/*\n"
+                   "This is another     "
+                   "  long comment that "
+                   "  doesn't fit on one"
+                   "  line    1234567890\n"
+                   "*/\n"
+                   "}", getLLVMStyleWithColumns(20)));
+  EXPECT_EQ("{\n"
+            "  /*\n"
+            "   * This        i s\n"
+            "   * another comment\n"
+            "   * t hat  doesn' t\n"
+            "   * fit on one l i\n"
+            "   * n e\n"
+            "   */\n"
+            "}",
+            format("{\n"
+                   "/*\n"
+                   " * This        i s"
+                   "   another comment"
+                   "   t hat  doesn' t"
+                   "   fit on one l i"
+                   "   n e\n"
+                   " */\n"
+                   "}", getLLVMStyleWithColumns(20)));
+  EXPECT_EQ("/*\n"
+            " * This is a long\n"
+            " * comment that\n"
+            " * doesn't fit on\n"
+            " * one line\n"
+            " */",
+            format("   /*\n"
+                   "    * This is a long comment that doesn't fit on one line\n"
+                   "    */", getLLVMStyleWithColumns(20)));
+}
+
+TEST_F(FormatTest, SplitsLongLinesInCommentsInPreprocessor) {
+  EXPECT_EQ("#define X          \\\n"
+            // FIXME: Backslash should be added here.
+            "  /*\n"
+            "   Macro comment   \\\n"
+            "   with a long     \\\n"
+            "   line            \\\n"
+            // FIXME: We should look at the length of the last line of the token
+            // instead of the full token's length.
+            //"  */               \\\n"
+            "   */\\\n"
+            "  A + B",
+            format("#define X \\\n"
+                   "  /*\n"
+                   "   Macro comment with a long  line\n"
+                   "   */ \\\n"
+                   "  A + B",
+                   getLLVMStyleWithColumns(20)));
+  EXPECT_EQ("#define X          \\\n"
+            "  /* Macro comment \\\n"
+            // FIXME: Indent comment continuations when the comment is a first
+            // token in a line.
+            "with a long  line  \\\n"
+            // FIXME: We should look at the length of the last line of the token
+            // instead of the full token's length.
+            //"*/                 \\\n"
+            "*/\\\n"
+            "  A + B",
+            format("#define X \\\n"
+                   "  /* Macro comment with a long  line */ \\\n"
+                   "  A + B",
+                   getLLVMStyleWithColumns(20)));
+}
+
 TEST_F(FormatTest, CommentsInStaticInitializers) {
   EXPECT_EQ(
       "static SomeType type = { aaaaaaaaaaaaaaaaaaaa, /* comment */\n"