Preprocessor: preserve whitespace in -traditional-cpp mode.

Note that, unlike GNU cpp, we do not yet preserve whitespace inside macro
definitions (even in -traditional-cpp mode).

<rdar://problem/12897179>

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@175778 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h
index 535baf5..57e6c92 100644
--- a/include/clang/Lex/Lexer.h
+++ b/include/clang/Lex/Lexer.h
@@ -174,8 +174,8 @@
   /// SetKeepWhitespaceMode - This method lets clients enable or disable
   /// whitespace retention mode.
   void SetKeepWhitespaceMode(bool Val) {
-    assert((!Val || LexingRawMode) &&
-           "Can only enable whitespace retention in raw mode");
+    assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
+           "Can only retain whitespace in raw mode or -traditional-cpp");
     ExtendedTokenMode = Val ? 2 : 0;
   }
 
@@ -194,6 +194,14 @@
     ExtendedTokenMode = Mode ? 1 : 0;
   }
 
+  /// Sets the extended token mode back to its initial value, according to the
+  /// language options and preprocessor. This controls whether the lexer
+  /// produces comment and whitespace tokens.
+  ///
+  /// This requires the lexer to have an associated preprocessor. A standalone
+  /// lexer has nothing to reset to.
+  void resetExtendedTokenMode();
+
   const char *getBufferStart() const { return BufferStart; }
 
   /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
diff --git a/lib/Frontend/PrintPreprocessedOutput.cpp b/lib/Frontend/PrintPreprocessedOutput.cpp
index c85945b..3d55adc 100644
--- a/lib/Frontend/PrintPreprocessedOutput.cpp
+++ b/lib/Frontend/PrintPreprocessedOutput.cpp
@@ -548,7 +548,7 @@
 
       // Tokens that can contain embedded newlines need to adjust our current
       // line number.
-      if (Tok.getKind() == tok::comment)
+      if (Tok.getKind() == tok::comment || Tok.getKind() == tok::unknown)
         Callbacks->HandleNewlinesInToken(TokPtr, Len);
     } else {
       std::string S = PP.getSpelling(Tok);
@@ -556,7 +556,7 @@
 
       // Tokens that can contain embedded newlines need to adjust our current
       // line number.
-      if (Tok.getKind() == tok::comment)
+      if (Tok.getKind() == tok::comment || Tok.getKind() == tok::unknown)
         Callbacks->HandleNewlinesInToken(&S[0], S.size());
     }
     Callbacks->setEmittedTokensOnThisLine();
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 0590d9e..65ea5e3 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -122,8 +122,15 @@
   InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
             InputFile->getBufferEnd());
 
-  // Default to keeping comments if the preprocessor wants them.
-  SetCommentRetentionState(PP.getCommentRetentionState());
+  resetExtendedTokenMode();
+}
+
+void Lexer::resetExtendedTokenMode() {
+  assert(PP && "Cannot reset token mode without a preprocessor");
+  if (LangOpts.TraditionalCPP)
+    SetKeepWhitespaceMode(true);
+  else
+    SetCommentRetentionState(PP->getCommentRetentionState());
 }
 
 /// Lexer constructor - Create a new raw lexer object.  This object is only
@@ -1844,6 +1851,8 @@
 ///
 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
   // Whitespace - Skip it, then return the token after the whitespace.
+  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
+
   unsigned char Char = *CurPtr;  // Skip consequtive spaces efficiently.
   while (1) {
     // Skip horizontal whitespace very aggressively.
@@ -1851,7 +1860,7 @@
       Char = *++CurPtr;
 
     // Otherwise if we have something other than whitespace, we're done.
-    if (Char != '\n' && Char != '\r')
+    if (!isVerticalWhitespace(Char))
       break;
 
     if (ParsingPreprocessorDirective) {
@@ -1861,24 +1870,27 @@
     }
 
     // ok, but handle newline.
-    // The returned token is at the start of the line.
-    Result.setFlag(Token::StartOfLine);
-    // No leading whitespace seen so far.
-    Result.clearFlag(Token::LeadingSpace);
+    SawNewline = true;
     Char = *++CurPtr;
   }
 
-  // If this isn't immediately after a newline, there is leading space.
-  char PrevChar = CurPtr[-1];
-  if (PrevChar != '\n' && PrevChar != '\r')
-    Result.setFlag(Token::LeadingSpace);
-
   // If the client wants us to return whitespace, return it now.
   if (isKeepWhitespaceMode()) {
     FormTokenWithChars(Result, CurPtr, tok::unknown);
+    if (SawNewline)
+      IsAtStartOfLine = true;
+    // FIXME: The next token will not have LeadingSpace set.
     return true;
   }
 
+  // If this isn't immediately after a newline, there is leading space.
+  char PrevChar = CurPtr[-1];
+  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
+
+  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
+  if (SawNewline)
+    Result.setFlag(Token::StartOfLine);
+
   BufferPtr = CurPtr;
   return false;
 }
@@ -2269,7 +2281,6 @@
   // efficiently now.  This is safe even in KeepWhitespaceMode because we would
   // have already returned above with the comment as a token.
   if (isHorizontalWhitespace(*CurPtr)) {
-    Result.setFlag(Token::LeadingSpace);
     SkipWhitespace(Result, CurPtr+1);
     return false;
   }
@@ -2351,7 +2362,7 @@
     FormTokenWithChars(Result, CurPtr, tok::eod);
 
     // Restore comment saving mode, in case it was disabled for directive.
-    SetCommentRetentionState(PP->getCommentRetentionState());
+    resetExtendedTokenMode();
     return true;  // Have a token.
   }
  
@@ -2718,6 +2729,7 @@
     // whitespace.
     if (isKeepWhitespaceMode()) {
       FormTokenWithChars(Result, CurPtr, tok::unknown);
+      // FIXME: The next token will not have LeadingSpace set.
       return;
     }
 
@@ -2785,7 +2797,7 @@
 
       // Restore comment saving mode, in case it was disabled for directive.
       if (PP)
-        SetCommentRetentionState(PP->getCommentRetentionState());
+        resetExtendedTokenMode();
 
       // Since we consumed a newline, we are back at the start of a line.
       IsAtStartOfLine = true;
@@ -2793,8 +2805,7 @@
       Kind = tok::eod;
       break;
     }
-    // The returned token is at the start of the line.
-    Result.setFlag(Token::StartOfLine);
+
     // No leading whitespace seen so far.
     Result.clearFlag(Token::LeadingSpace);
 
diff --git a/lib/Lex/PPDirectives.cpp b/lib/Lex/PPDirectives.cpp
index 1825028..54457c3 100644
--- a/lib/Lex/PPDirectives.cpp
+++ b/lib/Lex/PPDirectives.cpp
@@ -269,7 +269,7 @@
     if (Tok.isNot(tok::raw_identifier)) {
       CurPPLexer->ParsingPreprocessorDirective = false;
       // Restore comment saving mode.
-      if (CurLexer) CurLexer->SetCommentRetentionState(KeepComments);
+      if (CurLexer) CurLexer->resetExtendedTokenMode();
       continue;
     }
 
@@ -285,7 +285,7 @@
         FirstChar != 'i' && FirstChar != 'e') {
       CurPPLexer->ParsingPreprocessorDirective = false;
       // Restore comment saving mode.
-      if (CurLexer) CurLexer->SetCommentRetentionState(KeepComments);
+      if (CurLexer) CurLexer->resetExtendedTokenMode();
       continue;
     }
 
@@ -302,7 +302,7 @@
       if (IdLen >= 20) {
         CurPPLexer->ParsingPreprocessorDirective = false;
         // Restore comment saving mode.
-        if (CurLexer) CurLexer->SetCommentRetentionState(KeepComments);
+        if (CurLexer) CurLexer->resetExtendedTokenMode();
         continue;
       }
       memcpy(DirectiveBuf, &DirectiveStr[0], IdLen);
@@ -408,7 +408,7 @@
 
     CurPPLexer->ParsingPreprocessorDirective = false;
     // Restore comment saving mode.
-    if (CurLexer) CurLexer->SetCommentRetentionState(KeepComments);
+    if (CurLexer) CurLexer->resetExtendedTokenMode();
   }
 
   // Finally, if we are out of the conditional (saw an #endif or ran off the end
@@ -594,6 +594,7 @@
   // mode.  Tell the lexer this so any newlines we see will be converted into an
   // EOD token (which terminates the directive).
   CurPPLexer->ParsingPreprocessorDirective = true;
+  if (CurLexer) CurLexer->SetKeepWhitespaceMode(false);
 
   ++NumDirectives;
 
@@ -638,14 +639,9 @@
   // and reset to previous state when returning from this function.
   ResetMacroExpansionHelper helper(this);
 
-TryAgain:
   switch (Result.getKind()) {
   case tok::eod:
     return;   // null directive.
-  case tok::comment:
-    // Handle stuff like "# /*foo*/ define X" in -E -C mode.
-    LexUnexpandedToken(Result);
-    goto TryAgain;
   case tok::code_completion:
     if (CodeComplete)
       CodeComplete->CodeCompleteDirective(
diff --git a/test/Preprocessor/traditional-cpp.c b/test/Preprocessor/traditional-cpp.c
index 5fc9ee3..7202454 100644
--- a/test/Preprocessor/traditional-cpp.c
+++ b/test/Preprocessor/traditional-cpp.c
@@ -4,9 +4,61 @@
 
 /*
  RUN: %clang_cc1 -traditional-cpp %s -E -o %t
- RUN: FileCheck < %t %s
+ RUN: FileCheck -strict-whitespace < %t %s
 */
 
-/* CHECK: foo // bar
+/* CHECK: {{^}}foo // bar{{$}}
  */
 foo // bar
+
+
+/* The lines in this file contain hard tab characters and trailing whitespace; 
+ * do not change them! */
+
+/* CHECK: {{^}}	indented!{{$}}
+ * CHECK: {{^}}tab	separated	values{{$}}
+ */
+	indented!
+tab	separated	values
+
+#define bracket(x) >>>x<<<
+bracket(|  spaces  |)
+/* CHECK: {{^}}>>>|  spaces  |<<<{{$}}
+ */
+
+/* This is still a preprocessing directive. */
+# define foo bar
+foo!
+-
+	foo!	foo!	
+/* CHECK: {{^}}bar!{{$}}
+ * CHECK: {{^}}	bar!	bar!	{{$}}
+ */
+
+/* Deliberately check a leading newline with spaces on that line. */
+   
+# define foo bar
+foo!
+-
+	foo!	foo!	
+/* CHECK: {{^}}bar!{{$}}
+ * CHECK: {{^}}	bar!	bar!	{{$}}
+ */
+
+/* FIXME: -traditional-cpp should not consider this a preprocessing directive
+ * because the # isn't in the first column.
+ */
+ #define foo2 bar
+foo2!
+/* If this were working, both of these checks would be on.
+ * CHECK-NOT: {{^}} #define foo2 bar{{$}}
+ * CHECK-NOT: {{^}}foo2!{{$}}
+ */
+
+/* FIXME: -traditional-cpp should not homogenize whitespace in macros.
+ */
+#define bracket2(x) >>>  x  <<<
+bracket2(spaces)
+/* If this were working, this check would be on.
+ * CHECK-NOT: {{^}}>>>  spaces  <<<{{$}}
+ */