| /* |
| * Copyright 2016 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
| * in compliance with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software distributed under the License |
| * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
| * or implied. See the License for the specific language governing permissions and limitations under |
| * the License. |
| */ |
| |
| /* |
| * This was copied from https://github.com/google/google-java-format |
| * Modifications: |
| * 1. The package name and imports were changed to com.facebook.ktfmt.kdoc to compile more easily. |
| */ |
| |
| package com.facebook.ktfmt.kdoc; |
| |
| import static com.facebook.ktfmt.kdoc.Token.Type.BEGIN_JAVADOC; |
| import static com.facebook.ktfmt.kdoc.Token.Type.BLOCKQUOTE_CLOSE_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.BLOCKQUOTE_OPEN_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.BR_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.CODE_CLOSE_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.CODE_OPEN_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.END_JAVADOC; |
| import static com.facebook.ktfmt.kdoc.Token.Type.FOOTER_JAVADOC_TAG_START; |
| import static com.facebook.ktfmt.kdoc.Token.Type.FORCED_NEWLINE; |
| import static com.facebook.ktfmt.kdoc.Token.Type.HEADER_CLOSE_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.HEADER_OPEN_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.HTML_COMMENT; |
| import static com.facebook.ktfmt.kdoc.Token.Type.LIST_CLOSE_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.LIST_ITEM_CLOSE_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.LIST_ITEM_OPEN_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.LIST_OPEN_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.LITERAL; |
| import static com.facebook.ktfmt.kdoc.Token.Type.MOE_BEGIN_STRIP_COMMENT; |
| import static com.facebook.ktfmt.kdoc.Token.Type.MOE_END_STRIP_COMMENT; |
| import static com.facebook.ktfmt.kdoc.Token.Type.OPTIONAL_LINE_BREAK; |
| import static com.facebook.ktfmt.kdoc.Token.Type.PARAGRAPH_CLOSE_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.PARAGRAPH_OPEN_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.PRE_CLOSE_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.PRE_OPEN_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.TABLE_CLOSE_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.TABLE_OPEN_TAG; |
| import static com.facebook.ktfmt.kdoc.Token.Type.WHITESPACE; |
| import static com.google.common.base.Preconditions.checkArgument; |
| import static com.google.common.base.Preconditions.checkNotNull; |
| import static com.google.common.base.Verify.verify; |
| import static com.google.common.collect.Iterators.peekingIterator; |
| import static java.lang.String.format; |
| import static java.util.regex.Pattern.CASE_INSENSITIVE; |
| import static java.util.regex.Pattern.DOTALL; |
| import static java.util.regex.Pattern.compile; |
| |
| import com.facebook.ktfmt.kdoc.Token.Type; |
| import com.google.common.base.CharMatcher; |
| import com.google.common.collect.ImmutableList; |
| import com.google.common.collect.PeekingIterator; |
| import java.util.ArrayDeque; |
| import java.util.ArrayList; |
| import java.util.Deque; |
| import java.util.List; |
| import java.util.regex.Pattern; |
| |
| /** Lexer for the Javadoc formatter. */ |
| final class JavadocLexer { |
| /** Takes a Javadoc comment, including ∕✱✱ and ✱∕, and returns tokens, including ∕✱✱ and ✱∕. */ |
| static ImmutableList<Token> lex(String input) throws LexException { |
| /* |
| * TODO(cpovirk): In theory, we should interpret Unicode escapes (yet output them in their |
| * original form). This would mean mean everything from an encoded ∕✱✱ to an encoded <pre> tag, |
| * so we'll probably never bother. |
| */ |
| input = stripJavadocBeginAndEnd(input); |
| input = normalizeLineEndings(input); |
| return new JavadocLexer(new CharStream(input)).generateTokens(); |
| } |
| |
| /** The lexer crashes on windows line endings, so for now just normalize to `\n`. */ |
| // TODO(cushon): use the platform line separator for output |
| private static String normalizeLineEndings(String input) { |
| return NON_UNIX_LINE_ENDING.matcher(input).replaceAll("\n"); |
| } |
| |
| private static final Pattern NON_UNIX_LINE_ENDING = Pattern.compile("\r\n?"); |
| |
| private static String stripJavadocBeginAndEnd(String input) { |
| /* |
| * We do this ahead of time so that the main part of the lexer need not say things like |
| * "(?![*]/)" to avoid accidentally swallowing ✱∕ when consuming a newline. |
| */ |
| checkArgument(input.startsWith("/**"), "Missing /**: %s", input); |
| checkArgument(input.endsWith("*/") && input.length() > 4, "Missing */: %s", input); |
| return input.substring("/**".length(), input.length() - "*/".length()); |
| } |
| |
| private final CharStream input; |
| private final NestingCounter braceDepth = new NestingCounter(); |
| private final NestingCounter preDepth = new NestingCounter(); |
| private final NestingCounter codeDepth = new NestingCounter(); |
| private final NestingCounter tableDepth = new NestingCounter(); |
| private boolean somethingSinceNewline; |
| |
| private JavadocLexer(CharStream input) { |
| this.input = checkNotNull(input); |
| } |
| |
| private ImmutableList<Token> generateTokens() throws LexException { |
| ImmutableList.Builder<Token> tokens = ImmutableList.builder(); |
| |
| Token token = new Token(BEGIN_JAVADOC, "/**"); |
| tokens.add(token); |
| |
| while (!input.isExhausted()) { |
| token = readToken(); |
| tokens.add(token); |
| } |
| |
| checkMatchingTags(); |
| |
| token = new Token(END_JAVADOC, "*/"); |
| tokens.add(token); |
| |
| ImmutableList<Token> result = tokens.build(); |
| result = joinAdjacentLiteralsAndAdjacentWhitespace(result); |
| result = inferParagraphTags(result); |
| result = optionalizeSpacesAfterLinks(result); |
| result = deindentPreCodeBlocks(result); |
| return result; |
| } |
| |
| private Token readToken() throws LexException { |
| Type type = consumeToken(); |
| String value = input.readAndResetRecorded(); |
| return new Token(type, value); |
| } |
| |
| private Type consumeToken() throws LexException { |
| boolean preserveExistingFormatting = preserveExistingFormatting(); |
| |
| if (input.tryConsumeRegex(NEWLINE_PATTERN)) { |
| somethingSinceNewline = false; |
| return preserveExistingFormatting ? FORCED_NEWLINE : WHITESPACE; |
| } else if (input.tryConsume(" ") || input.tryConsume("\t")) { |
| // TODO(cpovirk): How about weird whitespace chars? Ideally we'd distinguish breaking vs. not. |
| // Returning LITERAL here prevent us from breaking a <pre> line. For more info, see LITERAL. |
| return preserveExistingFormatting ? LITERAL : WHITESPACE; |
| } |
| |
| /* |
| * TODO(cpovirk): Maybe try to detect things like "{@code\n@GwtCompatible}" that aren't intended |
| * as tags. But in the most likely case, in which that happens inside <pre>{@code, we have no |
| * great options for fixing it. |
| * https://github.com/google/google-java-format/issues/7#issuecomment-197383926 |
| */ |
| if (!somethingSinceNewline && input.tryConsumeRegex(FOOTER_TAG_PATTERN)) { |
| checkMatchingTags(); |
| somethingSinceNewline = true; |
| return FOOTER_JAVADOC_TAG_START; |
| } |
| somethingSinceNewline = true; |
| |
| if (input.tryConsumeRegex(INLINE_TAG_OPEN_PATTERN)) { |
| braceDepth.increment(); |
| return LITERAL; |
| } else if (input.tryConsume("{")) { |
| braceDepth.incrementIfPositive(); |
| return LITERAL; |
| } else if (input.tryConsume("}")) { |
| braceDepth.decrementIfPositive(); |
| return LITERAL; |
| } |
| |
| // Inside an inline tag, don't do any HTML interpretation. |
| if (braceDepth.isPositive()) { |
| verify(input.tryConsumeRegex(LITERAL_PATTERN)); |
| return LITERAL; |
| } |
| |
| if (input.tryConsumeRegex(PRE_OPEN_PATTERN)) { |
| preDepth.increment(); |
| return preserveExistingFormatting ? LITERAL : PRE_OPEN_TAG; |
| } else if (input.tryConsumeRegex(PRE_CLOSE_PATTERN)) { |
| preDepth.decrementIfPositive(); |
| return preserveExistingFormatting() ? LITERAL : PRE_CLOSE_TAG; |
| } |
| |
| if (input.tryConsumeRegex(CODE_OPEN_PATTERN)) { |
| codeDepth.increment(); |
| return preserveExistingFormatting ? LITERAL : CODE_OPEN_TAG; |
| } else if (input.tryConsumeRegex(CODE_CLOSE_PATTERN)) { |
| codeDepth.decrementIfPositive(); |
| return preserveExistingFormatting() ? LITERAL : CODE_CLOSE_TAG; |
| } |
| |
| if (input.tryConsumeRegex(TABLE_OPEN_PATTERN)) { |
| tableDepth.increment(); |
| return preserveExistingFormatting ? LITERAL : TABLE_OPEN_TAG; |
| } else if (input.tryConsumeRegex(TABLE_CLOSE_PATTERN)) { |
| tableDepth.decrementIfPositive(); |
| return preserveExistingFormatting() ? LITERAL : TABLE_CLOSE_TAG; |
| } |
| |
| if (preserveExistingFormatting) { |
| verify(input.tryConsumeRegex(LITERAL_PATTERN)); |
| return LITERAL; |
| } |
| |
| if (input.tryConsumeRegex(PARAGRAPH_OPEN_PATTERN)) { |
| return PARAGRAPH_OPEN_TAG; |
| } else if (input.tryConsumeRegex(PARAGRAPH_CLOSE_PATTERN)) { |
| return PARAGRAPH_CLOSE_TAG; |
| } else if (input.tryConsumeRegex(LIST_OPEN_PATTERN)) { |
| return LIST_OPEN_TAG; |
| } else if (input.tryConsumeRegex(LIST_CLOSE_PATTERN)) { |
| return LIST_CLOSE_TAG; |
| } else if (input.tryConsumeRegex(LIST_ITEM_OPEN_PATTERN)) { |
| return LIST_ITEM_OPEN_TAG; |
| } else if (input.tryConsumeRegex(LIST_ITEM_CLOSE_PATTERN)) { |
| return LIST_ITEM_CLOSE_TAG; |
| } else if (input.tryConsumeRegex(BLOCKQUOTE_OPEN_PATTERN)) { |
| return BLOCKQUOTE_OPEN_TAG; |
| } else if (input.tryConsumeRegex(BLOCKQUOTE_CLOSE_PATTERN)) { |
| return BLOCKQUOTE_CLOSE_TAG; |
| } else if (input.tryConsumeRegex(HEADER_OPEN_PATTERN)) { |
| return HEADER_OPEN_TAG; |
| } else if (input.tryConsumeRegex(HEADER_CLOSE_PATTERN)) { |
| return HEADER_CLOSE_TAG; |
| } else if (input.tryConsumeRegex(BR_PATTERN)) { |
| return BR_TAG; |
| } else if (input.tryConsumeRegex(MOE_BEGIN_STRIP_COMMENT_PATTERN)) { |
| return MOE_BEGIN_STRIP_COMMENT; |
| } else if (input.tryConsumeRegex(MOE_END_STRIP_COMMENT_PATTERN)) { |
| return MOE_END_STRIP_COMMENT; |
| } else if (input.tryConsumeRegex(HTML_COMMENT_PATTERN)) { |
| return HTML_COMMENT; |
| } else if (input.tryConsumeRegex(LITERAL_PATTERN)) { |
| return LITERAL; |
| } |
| throw new AssertionError(); |
| } |
| |
| private boolean preserveExistingFormatting() { |
| return preDepth.isPositive() || tableDepth.isPositive() || codeDepth.isPositive(); |
| } |
| |
| private void checkMatchingTags() throws LexException { |
| if (braceDepth.isPositive() |
| || preDepth.isPositive() |
| || tableDepth.isPositive() |
| || codeDepth.isPositive()) { |
| throw new LexException(); |
| } |
| } |
| |
| /** |
| * Join together adjacent literal tokens, and join together adjacent whitespace tokens. |
| * |
| * <p>For literal tokens, this means something like {@code ["<b>", "foo", "</b>"] => |
| * ["<b>foo</b>"]}. See {@link #LITERAL_PATTERN} for discussion of why those tokens are separate |
| * to begin with. |
| * |
| * <p>Whitespace tokens are treated analogously. We don't really "want" to join whitespace tokens, |
| * but in the course of joining literals, we incidentally join whitespace, too. We do take |
| * advantage of the joining later on: It simplifies {@link #inferParagraphTags}. |
| * |
| * <p>Note that we do <i>not</i> merge a literal token and a whitespace token together. |
| */ |
| private static ImmutableList<Token> joinAdjacentLiteralsAndAdjacentWhitespace(List<Token> input) { |
| /* |
| * Note: Our final token is always END_JAVADOC. This saves us some trouble: |
| * |
| * - Our inner while() doesn't need a hasNext() check. |
| * |
| * - We don't need to check for leftover accumulated literals after we exit the loop. |
| */ |
| ImmutableList.Builder<Token> output = ImmutableList.builder(); |
| StringBuilder accumulated = new StringBuilder(); |
| |
| for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) { |
| if (tokens.peek().getType() == LITERAL) { |
| accumulated.append(tokens.peek().getValue()); |
| tokens.next(); |
| continue; |
| } |
| |
| /* |
| * IF we have accumulated some literals to join together (say, "foo<b>bar</b>"), and IF we'll |
| * next see whitespace followed by a "@" literal, we need to join that together with the |
| * previous literals. That ensures that we won't insert a line break before the "@," turning |
| * it into a tag. |
| */ |
| |
| if (accumulated.length() == 0) { |
| output.add(tokens.peek()); |
| tokens.next(); |
| continue; |
| } |
| |
| StringBuilder seenWhitespace = new StringBuilder(); |
| while (tokens.peek().getType() == WHITESPACE) { |
| seenWhitespace.append(tokens.next().getValue()); |
| } |
| |
| if (tokens.peek().getType() == LITERAL && tokens.peek().getValue().startsWith("@")) { |
| // OK, we're in the case described above. |
| accumulated.append(" "); |
| accumulated.append(tokens.peek().getValue()); |
| tokens.next(); |
| continue; |
| } |
| |
| output.add(new Token(LITERAL, accumulated.toString())); |
| accumulated.setLength(0); |
| |
| if (seenWhitespace.length() > 0) { |
| output.add(new Token(WHITESPACE, seenWhitespace.toString())); |
| } |
| |
| // We have another token coming, possibly of type OTHER. Leave it for the next iteration. |
| } |
| |
| /* |
| * TODO(cpovirk): Another case where we could try to join tokens is if a line ends with |
| * /[^ -]-/, as in "non-\nblocking." |
| */ |
| return output.build(); |
| } |
| |
| /** |
| * Where the input has two consecutive line breaks between literals, insert a {@code <p>} tag |
| * between the literals. |
| * |
| * <p>This method must be called after {@link #joinAdjacentLiteralsAndAdjacentWhitespace}, as it |
| * assumes that adjacent whitespace tokens have already been joined. |
| */ |
| private static ImmutableList<Token> inferParagraphTags(List<Token> input) { |
| ImmutableList.Builder<Token> output = ImmutableList.builder(); |
| |
| for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) { |
| if (tokens.peek().getType() == LITERAL) { |
| output.add(tokens.next()); |
| |
| if (tokens.peek().getType() == WHITESPACE |
| && hasMultipleNewlines(tokens.peek().getValue())) { |
| output.add(tokens.next()); |
| |
| if (tokens.peek().getType() == LITERAL) { |
| output.add(new Token(PARAGRAPH_OPEN_TAG, "")); |
| } |
| } |
| } else { |
| // TODO(cpovirk): Or just `continue` from the <p> case and move this out of the `else`? |
| output.add(tokens.next()); |
| } |
| } |
| |
| return output.build(); |
| |
| /* |
| * Note: We do not want to insert <p> tags inside <pre>. Fortunately, the formatter gets that |
| * right without special effort on our part. The reason: Line breaks inside a <pre> section are |
| * of type FORCED_NEWLINE rather than WHITESPACE. |
| */ |
| } |
| |
| /** |
| * Replaces whitespace after a {@code href=...>} token with an "optional link break." This allows |
| * us to output either {@code <a href=foo>foo</a>} or {@code <a href=foo>\nfoo</a>}, depending on |
| * how much space we have left on the line. |
| * |
| * <p>This method must be called after {@link #joinAdjacentLiteralsAndAdjacentWhitespace}, as it |
| * assumes that adjacent whitespace tokens have already been joined. |
| */ |
| private static ImmutableList<Token> optionalizeSpacesAfterLinks(List<Token> input) { |
| ImmutableList.Builder<Token> output = ImmutableList.builder(); |
| |
| for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) { |
| if (tokens.peek().getType() == LITERAL && tokens.peek().getValue().matches("^href=[^>]*>")) { |
| output.add(tokens.next()); |
| |
| if (tokens.peek().getType() == WHITESPACE) { |
| output.add(new Token(OPTIONAL_LINE_BREAK, tokens.next().getValue())); |
| } |
| } else { |
| output.add(tokens.next()); |
| } |
| } |
| |
| return output.build(); |
| |
| /* |
| * Note: We do not want to insert <p> tags inside <pre>. Fortunately, the formatter gets that |
| * right without special effort on our part. The reason: Line breaks inside a <pre> section are |
| * of type FORCED_NEWLINE rather than WHITESPACE. |
| */ |
| } |
| |
| /** |
| * Adjust indentation inside `<pre>{@code` blocks. |
| * |
| * <p>Also trim leading and trailing blank lines, and move the trailing `}` to its own line. |
| */ |
| private static ImmutableList<Token> deindentPreCodeBlocks(List<Token> input) { |
| ImmutableList.Builder<Token> output = ImmutableList.builder(); |
| for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) { |
| if (tokens.peek().getType() != PRE_OPEN_TAG) { |
| output.add(tokens.next()); |
| continue; |
| } |
| |
| output.add(tokens.next()); |
| List<Token> initialNewlines = new ArrayList<>(); |
| while (tokens.hasNext() && tokens.peek().getType() == FORCED_NEWLINE) { |
| initialNewlines.add(tokens.next()); |
| } |
| if (tokens.peek().getType() != LITERAL |
| || !tokens.peek().getValue().matches("[ \t]*[{]@code")) { |
| output.addAll(initialNewlines); |
| output.add(tokens.next()); |
| continue; |
| } |
| |
| deindentPreCodeBlock(output, tokens); |
| } |
| return output.build(); |
| } |
| |
| private static void deindentPreCodeBlock( |
| ImmutableList.Builder<Token> output, PeekingIterator<Token> tokens) { |
| Deque<Token> saved = new ArrayDeque<>(); |
| output.add(new Token(LITERAL, tokens.next().getValue().trim())); |
| while (tokens.hasNext() && tokens.peek().getType() != PRE_CLOSE_TAG) { |
| Token token = tokens.next(); |
| saved.addLast(token); |
| } |
| while (!saved.isEmpty() && saved.peekFirst().getType() == FORCED_NEWLINE) { |
| saved.removeFirst(); |
| } |
| while (!saved.isEmpty() && saved.peekLast().getType() == FORCED_NEWLINE) { |
| saved.removeLast(); |
| } |
| if (saved.isEmpty()) { |
| return; |
| } |
| |
| // move the trailing `}` to its own line |
| Token last = saved.peekLast(); |
| boolean trailingBrace = false; |
| if (last.getType() == LITERAL && last.getValue().endsWith("}")) { |
| saved.removeLast(); |
| if (last.length() > 1) { |
| saved.addLast( |
| new Token(LITERAL, last.getValue().substring(0, last.getValue().length() - 1))); |
| saved.addLast(new Token(FORCED_NEWLINE, null)); |
| } |
| trailingBrace = true; |
| } |
| |
| int trim = -1; |
| for (Token token : saved) { |
| if (token.getType() == LITERAL) { |
| int idx = CharMatcher.isNot(' ').indexIn(token.getValue()); |
| if (idx != -1 && (trim == -1 || idx < trim)) { |
| trim = idx; |
| } |
| } |
| } |
| |
| output.add(new Token(FORCED_NEWLINE, "\n")); |
| for (Token token : saved) { |
| if (token.getType() == LITERAL) { |
| output.add( |
| new Token( |
| LITERAL, |
| trim > 0 && token.length() > trim |
| ? token.getValue().substring(trim) |
| : token.getValue())); |
| } else { |
| output.add(token); |
| } |
| } |
| |
| if (trailingBrace) { |
| output.add(new Token(LITERAL, "}")); |
| } else { |
| output.add(new Token(FORCED_NEWLINE, "\n")); |
| } |
| } |
| |
| private static final CharMatcher NEWLINE = CharMatcher.is('\n'); |
| |
| private static boolean hasMultipleNewlines(String s) { |
| return NEWLINE.countIn(s) > 1; |
| } |
| |
| /* |
| * This also eats any trailing whitespace. We would be smart enough to ignore that, anyway -- |
| * except in the case of <pre>/<table>, inside which we otherwise leave whitespace intact. |
| * |
| * We'd remove the trailing whitespace later on (in JavaCommentsHelper.rewrite), but I feel safer |
| * stripping it now: It otherwise might confuse our line-length count, which we use for wrapping. |
| */ |
| private static final Pattern NEWLINE_PATTERN = compile("^[ \t]*\n[ \t]*[*]?[ \t]?"); |
| |
| // We ensure elsewhere that we match this only at the beginning of a line. |
| // Only match tags that start with a lowercase letter, to avoid false matches on unescaped |
| // annotations inside code blocks. |
| // Match "@param <T>" specially in case the <T> is a <P> or other HTML tag we treat specially. |
| private static final Pattern FOOTER_TAG_PATTERN = compile("^@(param\\s+<\\w+>|[a-z]\\w*)"); |
| private static final Pattern MOE_BEGIN_STRIP_COMMENT_PATTERN = |
| compile("^<!--\\s*MOE:begin_intracomment_strip\\s*-->"); |
| private static final Pattern MOE_END_STRIP_COMMENT_PATTERN = |
| compile("^<!--\\s*MOE:end_intracomment_strip\\s*-->"); |
| private static final Pattern HTML_COMMENT_PATTERN = fullCommentPattern(); |
| private static final Pattern PRE_OPEN_PATTERN = openTagPattern("pre"); |
| private static final Pattern PRE_CLOSE_PATTERN = closeTagPattern("pre"); |
| private static final Pattern CODE_OPEN_PATTERN = openTagPattern("code"); |
| private static final Pattern CODE_CLOSE_PATTERN = closeTagPattern("code"); |
| private static final Pattern TABLE_OPEN_PATTERN = openTagPattern("table"); |
| private static final Pattern TABLE_CLOSE_PATTERN = closeTagPattern("table"); |
| private static final Pattern LIST_OPEN_PATTERN = openTagPattern("ul|ol|dl"); |
| private static final Pattern LIST_CLOSE_PATTERN = closeTagPattern("ul|ol|dl"); |
| private static final Pattern LIST_ITEM_OPEN_PATTERN = openTagPattern("li|dt|dd"); |
| private static final Pattern LIST_ITEM_CLOSE_PATTERN = closeTagPattern("li|dt|dd"); |
| private static final Pattern HEADER_OPEN_PATTERN = openTagPattern("h[1-6]"); |
| private static final Pattern HEADER_CLOSE_PATTERN = closeTagPattern("h[1-6]"); |
| private static final Pattern PARAGRAPH_OPEN_PATTERN = openTagPattern("p"); |
| private static final Pattern PARAGRAPH_CLOSE_PATTERN = closeTagPattern("p"); |
| private static final Pattern BLOCKQUOTE_OPEN_PATTERN = openTagPattern("blockquote"); |
| private static final Pattern BLOCKQUOTE_CLOSE_PATTERN = closeTagPattern("blockquote"); |
| private static final Pattern BR_PATTERN = openTagPattern("br"); |
| private static final Pattern INLINE_TAG_OPEN_PATTERN = compile("^[{]@\\w*"); |
| /* |
| * We exclude < so that we don't swallow following HTML tags. This lets us fix up "foo<p>" (~400 |
| * hits in Google-internal code). We will join unnecessarily split "words" (like "foo<b>bar</b>") |
| * in a later step. There's a similar story for braces. I'm not sure I actually need to exclude @ |
| * or *. TODO(cpovirk): Try removing them. |
| * |
| * Thanks to the "rejoin" step in joinAdjacentLiteralsAndAdjacentWhitespace(), we could get away |
| * with matching only one character here. That would eliminate the need for the regex entirely. |
| * That might be faster or slower than what we do now. |
| */ |
| private static final Pattern LITERAL_PATTERN = compile("^.[^ \t\n@<{}*]*", DOTALL); |
| |
| private static Pattern fullCommentPattern() { |
| return compile("^<!--.*?-->", DOTALL); |
| } |
| |
| private static Pattern openTagPattern(String namePattern) { |
| return compile(format("^<(?:%s)\\b[^>]*>", namePattern), CASE_INSENSITIVE); |
| } |
| |
| private static Pattern closeTagPattern(String namePattern) { |
| return compile(format("^</(?:%s)\\b[^>]*>", namePattern), CASE_INSENSITIVE); |
| } |
| |
| static class LexException extends Exception {} |
| } |