[Doc parsing] Patch to resolve Doxygen-supported HTML character
references to their UTF-8 encoding. Reviewed offline by Doug.
// rdar://12392215


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@173850 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/include/clang/AST/CommentLexer.h b/include/clang/AST/CommentLexer.h
index b90414b..6ce084b 100644
--- a/include/clang/AST/CommentLexer.h
+++ b/include/clang/AST/CommentLexer.h
@@ -282,11 +282,18 @@
   /// it stands for (e.g., "<").
   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
 
+  /// Given the name of a Doxygen-supported character reference (e.g., "trade"
+  /// for "&trade;"), return its UTF-8 encoding, or "" if the name is unknown.
+  StringRef HTMLDoxygenCharacterReference(StringRef Name) const;
+
   /// Given a Unicode codepoint as base-10 integer, return the character.
   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
 
   /// Given a Unicode codepoint as base-16 integer, return the character.
   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
+  
+  /// Helper routine to do part of the work for resolveHTMLHexCharacterReference.
+  StringRef helperResolveHTMLHexCharacterReference(unsigned CodePoint) const;
 
   void formTokenWithChars(Token &Result, const char *TokEnd,
                           tok::TokenKind Kind) {
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp
index ff78e8a..f1c23c6 100644
--- a/lib/AST/CommentLexer.cpp
+++ b/lib/AST/CommentLexer.cpp
@@ -34,6 +34,31 @@
 
 } // unnamed namespace
 
+// Parses Name as hex digits. Stops once the value leaves the Unicode range
+// (assumed to be rejected by the UTF-8 encoder - TODO confirm): no wraparound.
+static unsigned getCodePoint(StringRef Name) {
+  unsigned CodePoint = 0;
+  for (unsigned i = 0, e = Name.size(); i != e && CodePoint <= 0x10FFFF; ++i) {
+    assert(isHTMLHexCharacterReferenceCharacter(Name[i]));
+    CodePoint = CodePoint * 16 + llvm::hexDigitValue(Name[i]);
+  }
+  return CodePoint;
+}
+
+// Encode CodePoint as UTF-8 into storage obtained from Allocator; the result
+// is a default-constructed StringRef when the conversion reports failure.
+StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) const {
+  char *Buffer = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
+  char *End = Buffer;
+  const bool Converted = ConvertCodePointToUTF8(CodePoint, End);
+  return Converted ? StringRef(Buffer, End - Buffer) : StringRef();
+}
+  
+// Decode the hex digits in Name, then encode that code point as UTF-8.
+StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
+  return helperResolveHTMLHexCharacterReference(getCodePoint(Name));
+}
+
 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
   return llvm::StringSwitch<StringRef>(Name)
       .Case("amp", "&")
@@ -41,8 +66,163 @@
       .Case("gt", ">")
       .Case("quot", "\"")
       .Case("apos", "\'")
+      .Case("minus", "-")
+      .Case("sim", "~")
       .Default("");
 }
+  
+/// Map the name of a Doxygen-supported character reference (the text between
+/// '&' and ';', e.g. "trade") to the UTF-8 encoding of that character.
+/// Returns an empty string when Name is not in the table below.
+StringRef Lexer::HTMLDoxygenCharacterReference(StringRef Name) const {
+  // Look the code point up first (0 = unknown name) and encode only the
+  // match: StringSwitch evaluates every Case value eagerly, so encoding inside
+  // each Case would allocate a buffer for every table entry on every lookup.
+  const unsigned CodePoint = llvm::StringSwitch<unsigned>(Name)
+      .Case("copy", 0x000A9)
+      .Case("trade", 0x02122)
+      .Case("reg", 0x000AE)
+      .Case("lt", 0x0003C)
+      .Case("gt", 0x0003E) // U+003E is '>'; U+003C is '<'.
+      .Case("amp", 0x00026)
+      .Case("apos", 0x00027)
+      .Case("quot", 0x00022)
+      .Case("lsquo", 0x02018)
+      .Case("rsquo", 0x02019)
+      .Case("ldquo", 0x0201C)
+      .Case("rdquo", 0x0201D)
+      .Case("ndash", 0x02013)
+      .Case("mdash", 0x02014)
+      .Case("Auml", 0x000C4)
+      .Case("Euml", 0x000CB)
+      .Case("Iuml", 0x000CF)
+      .Case("Ouml", 0x000D6)
+      .Case("Uuml", 0x000DC)
+      .Case("Yuml", 0x00178)
+      .Case("auml", 0x000E4)
+      .Case("euml", 0x000EB)
+      .Case("iuml", 0x000EF)
+      .Case("ouml", 0x000F6)
+      .Case("uuml", 0x000FC)
+      .Case("yuml", 0x000FF)
+      .Case("Aacute", 0x000C1)
+      .Case("Eacute", 0x000C9)
+      .Case("Iacute", 0x000CD)
+      .Case("Oacute", 0x000D3)
+      .Case("Uacute", 0x000DA)
+      .Case("Yacute", 0x000DD)
+      .Case("aacute", 0x000E1)
+      .Case("eacute", 0x000E9)
+      .Case("iacute", 0x000ED)
+      .Case("oacute", 0x000F3)
+      .Case("uacute", 0x000FA)
+      .Case("yacute", 0x000FD)
+      .Case("Agrave", 0x000C0)
+      .Case("Egrave", 0x000C8)
+      .Case("Igrave", 0x000CC)
+      .Case("Ograve", 0x000D2)
+      .Case("Ugrave", 0x000D9)
+      .Case("agrave", 0x000E0)
+      .Case("egrave", 0x000E8)
+      .Case("igrave", 0x000EC)
+      .Case("ograve", 0x000F2)
+      .Case("ugrave", 0x000F9)
+      .Case("ygrave", 0x01EF3)
+      .Case("Acirc", 0x000C2)
+      .Case("Ecirc", 0x000CA)
+      .Case("Icirc", 0x000CE)
+      .Case("Ocirc", 0x000D4)
+      .Case("Ucirc", 0x000DB)
+      .Case("acirc", 0x000E2)
+      .Case("ecirc", 0x000EA)
+      .Case("icirc", 0x000EE)
+      .Case("ocirc", 0x000F4)
+      .Case("ucirc", 0x000FB)
+      .Case("ycirc", 0x00177)
+      .Case("Atilde", 0x000C3)
+      .Case("Ntilde", 0x000D1)
+      .Case("Otilde", 0x000D5)
+      .Case("atilde", 0x000E3)
+      .Case("ntilde", 0x000F1)
+      .Case("otilde", 0x000F5)
+      .Case("szlig", 0x000DF)
+      .Case("ccedil", 0x000E7)
+      .Case("Ccedil", 0x000C7)
+      .Case("aring", 0x000E5)
+      .Case("Aring", 0x000C5)
+      .Case("nbsp", 0x000A0)
+      .Case("Gamma", 0x00393)
+      .Case("Delta", 0x00394)
+      .Case("Theta", 0x00398)
+      .Case("Lambda", 0x0039B)
+      .Case("Xi", 0x0039E)
+      .Case("Pi", 0x003A0)
+      .Case("Sigma", 0x003A3)
+      .Case("Upsilon", 0x003A5)
+      .Case("Phi", 0x003A6)
+      .Case("Psi", 0x003A8)
+      .Case("Omega", 0x003A9)
+      .Case("alpha", 0x003B1)
+      .Case("beta", 0x003B2)
+      .Case("gamma", 0x003B3)
+      .Case("delta", 0x003B4)
+      .Case("epsilon", 0x003B5)
+      .Case("zeta", 0x003B6)
+      .Case("eta", 0x003B7)
+      .Case("theta", 0x003B8)
+      .Case("iota", 0x003B9)
+      .Case("kappa", 0x003BA)
+      .Case("lambda", 0x003BB)
+      .Case("mu", 0x003BC)
+      .Case("nu", 0x003BD)
+      .Case("xi", 0x003BE)
+      .Case("pi", 0x003C0)
+      .Case("rho", 0x003C1)
+      .Case("sigma", 0x003C3)
+      .Case("tau", 0x003C4)
+      .Case("upsilon", 0x003C5)
+      .Case("phi", 0x003C6)
+      .Case("chi", 0x003C7)
+      .Case("psi", 0x003C8)
+      .Case("omega", 0x003C9)
+      .Case("sigmaf", 0x003C2)
+      .Case("sect", 0x000A7)
+      .Case("deg", 0x000B0)
+      .Case("prime", 0x02032)
+      .Case("Prime", 0x02033)
+      .Case("infin", 0x0221E)
+      .Case("empty", 0x02205)
+      .Case("plusmn", 0x000B1)
+      .Case("times", 0x000D7)
+      .Case("minus", 0x02212)
+      .Case("sdot", 0x022C5)
+      .Case("part", 0x02202)
+      .Case("nabla", 0x02207)
+      .Case("radic", 0x0221A)
+      .Case("perp", 0x022A5)
+      .Case("sum", 0x02211)
+      .Case("int", 0x0222B)
+      .Case("prod", 0x0220F)
+      .Case("sim", 0x0223C)
+      .Case("asymp", 0x02248)
+      .Case("ne", 0x02260)
+      .Case("equiv", 0x02261)
+      .Case("prop", 0x0221D)
+      .Case("le", 0x02264)
+      .Case("ge", 0x02265)
+      .Case("larr", 0x02190)
+      .Case("rarr", 0x02192)
+      .Case("isin", 0x02208)
+      .Case("notin", 0x02209)
+      .Case("lceil", 0x02308)
+      .Case("rceil", 0x02309)
+      .Case("lfloor", 0x0230A)
+      .Case("rfloor", 0x0230B)
+      .Default(0);
+  if (CodePoint == 0)
+    return StringRef("");
+  return helperResolveHTMLHexCharacterReference(CodePoint);
+}
 
 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
   unsigned CodePoint = 0;
@@ -60,23 +231,6 @@
     return StringRef();
 }
 
-StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
-  unsigned CodePoint = 0;
-  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
-    CodePoint *= 16;
-    const char C = Name[i];
-    assert(isHTMLHexCharacterReferenceCharacter(C));
-    CodePoint += llvm::hexDigitValue(C);
-  }
-
-  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
-  char *ResolvedPtr = Resolved;
-  if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
-    return StringRef(Resolved, ResolvedPtr - Resolved);
-  else
-    return StringRef();
-}
-
 void Lexer::skipLineStartingDecorations() {
   // This function should be called only for C comments
   assert(CommentState == LCS_InsideCComment);
@@ -573,8 +727,17 @@
   StringRef Name(NamePtr, TokenPtr - NamePtr);
   TokenPtr++; // Skip semicolon.
   StringRef Resolved;
-  if (isNamed)
+  if (isNamed) {
     Resolved = resolveHTMLNamedCharacterReference(Name);
+    if (Resolved.empty()) {
+      Resolved = HTMLDoxygenCharacterReference(Name);
+      if (!Resolved.empty()) {
+        formTokenWithChars(T, TokenPtr, tok::text);
+        T.setText(Resolved);
+        return;
+      }
+    }
+  }
   else if (isDecimal)
     Resolved = resolveHTMLDecimalCharacterReference(Name);
   else
diff --git a/test/Index/special-html-characters.m b/test/Index/special-html-characters.m
new file mode 100644
index 0000000..74106f3
--- /dev/null
+++ b/test/Index/special-html-characters.m
@@ -0,0 +1,28 @@
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: c-index-test -test-load-source all -comments-xml-schema=%S/../../bindings/xml/comment-xml-schema.rng -triple x86_64-apple-darwin10 %s > %t/out
+// RUN: FileCheck %s < %t/out
+// rdar://13067629
+
+// Ensure that XML we generate is not invalid.
+// RUN: FileCheck %s -check-prefix=WRONG < %t/out
+// WRONG-NOT: CommentXMLInvalid
+
+// rdar://12392215
+@interface I
+@end
+
+@implementation I
+/*!
+	&copy; the copyright symbol
+	&trade; the trade mark symbol
+        &reg; the registered trade mark symbol
+	&nbsp; a non breakable space.
+        &Delta; Greek letter Delta Δ.
+        &Gamma; Greek letter Gamma Γ.
+ */
+- (void)phoneHome:(id)sender {
+
+}
+@end
+// CHECK: FullCommentAsHTML=[<p class="para-brief">\t© the copyright symbol\t™ the trade mark symbol        ® the registered trade mark symbol\t  a non breakable space.        Δ Greek letter Delta Δ.        Γ Greek letter Gamma Γ. </p>] FullCommentAsXML=[<Function isInstanceMethod="1" file="{{[^"]+}}special-html-characters.m" line="[[@LINE-4]]" column="1"><Name>phoneHome:</Name><USR>c:objc(cs)I(im)phoneHome:</USR><Declaration>- (void)phoneHome:(id)sender;</Declaration><Abstract><Para>\t© the copyright symbol\t™ the trade mark symbol        ® the registered trade mark symbol\t  a non breakable space.        Δ Greek letter Delta Δ.        Γ Greek letter Gamma Γ. </Para></Abstract></Function>]