| /* |
| * Copyright (C) 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.google.clearsilver.jsilver.template; |
| |
| import java.io.IOException; |
| |
| /** |
| * HTML whitespace stripper to be used by JSilver. It removes leading and |
| * trailing whitespace, collapses each run of contiguous whitespace characters |
| * to just the first character of the run, and removes lines that contain |
| * nothing but whitespace. |
| * |
| * It does not strip whitespace inside the following elements: |
| * <ul> |
| * <li> PRE |
| * <li> VERBATIM |
| * <li> TEXTAREA |
| * <li> SCRIPT |
| * </ul> |
| * It also strips out empty lines and leading whitespace inside HTML tags (i.e. |
| * between '<' and '>') and inside SCRIPT elements. It leaves trailing |
| * whitespace since that is more costly to remove and tends to not be common |
| * based on how templates are created (they don't have trailing whitespace). |
| * <p> |
| * Loadtests indicate that this class can strip whitespace almost as quickly |
| * as just reading every character from a string (20% slower). |
| * <p> |
| * While not strictly compatible with the JNI Clearsilver whitestripping |
| * function, we are not aware of any differences that yield functionally |
| * different HTML output. However, we encourage users to verify for themselves |
| * and report any differences. |
| */ |
public class HtmlWhiteSpaceStripper implements Appendable {

  // Sink that receives the stripped content.
  private final Appendable out;
  // Level of whitespace stripping to perform. (Currently not used).
  // TODO: Determine what the exact differences are in levels in
  // JNI Clearsilver and see if it is worth porting it.
  private final int level;

  // Has any non-whitespace character been seen since the start of the line.
  private boolean nonWsSeen = false;
  // First whitespace character of the run currently being collapsed (outside
  // of tags and whitespace-preserving elements). It is emitted just before the
  // next non-whitespace character. 0 signifies no pending whitespace.
  private char pendingWs = 0;

  // We just saw the start of an HTML tag '<'.
  private boolean startHtmlTag = false;
  // Are we currently in an opening HTML tag (not "</").
  private boolean inOpenTag = false;
  // Are we currently in a closing HTML tag.
  private boolean inCloseTag = false;
  // Are we currently in an HTML tag name.
  private boolean inTagName = false;

  // Nesting depth of <textarea> elements (0 means not inside one).
  private int textAreaScope = 0;
  // Nesting depth of <pre> elements (0 means not inside one).
  private int preScope = 0;
  // Nesting depth of <verbatim> elements (0 means not inside one).
  private int verbatimScope = 0;
  // Nesting depth of <script> elements (0 means not inside one).
  private int scriptScope = 0;

  // Accumulates the name of the HTML tag currently being parsed.
  private final StringBuilder tagName = new StringBuilder(16);

  /**
   * Intermediate Appendable object that strips whitespace as it passes through characters to
   * another Appendable object.
   *
   * @param out The Appendable object to dump the stripped output to.
   */
  public HtmlWhiteSpaceStripper(Appendable out) {
    this(out, 1);
  }

  /**
   * Intermediate Appendable object that strips whitespace as it passes through characters to
   * another Appendable object.
   *
   * @param out The Appendable object to dump the stripped output to.
   * @param level Ignored for now.
   */
  public HtmlWhiteSpaceStripper(Appendable out, int level) {
    this.out = out;
    this.level = level;
  }

  /**
   * Returns the string representation of the underlying output object.
   */
  @Override
  public String toString() {
    return out.toString();
  }

  /**
   * Passes every character of {@code csq} through the stripper.
   *
   * @param csq The characters to process. As required by {@link Appendable}, a {@code null}
   *        value is treated as the four characters {@code "null"}.
   * @return this stripper.
   * @throws IOException if the underlying output object fails.
   */
  @Override
  public Appendable append(CharSequence csq) throws IOException {
    CharSequence seq = (csq == null) ? "null" : csq;
    return append(seq, 0, seq.length());
  }

  /**
   * Passes the characters of {@code csq} in the range {@code [start, end)} through the stripper.
   *
   * @param csq The characters to process. As required by {@link Appendable}, a {@code null}
   *        value is treated as if it contained the four characters {@code "null"}.
   * @param start Index of the first character to process.
   * @param end Index just past the last character to process.
   * @return this stripper.
   * @throws IOException if the underlying output object fails.
   */
  @Override
  public Appendable append(CharSequence csq, int start, int end) throws IOException {
    CharSequence seq = (csq == null) ? "null" : csq;
    for (int i = start; i < end; i++) {
      append(seq.charAt(i));
    }
    return this;
  }

  /**
   * Processes a single character, updating the HTML parsing state and forwarding the character
   * to the output unless it is whitespace that should be stripped.
   *
   * @param c The character to process.
   * @return this stripper.
   * @throws IOException if the underlying output object fails.
   */
  @Override
  public Appendable append(char c) throws IOException {
    if (inOpenTag || inCloseTag) {
      appendInsideTag(c);
    } else {
      appendOutsideTag(c);
    }
    return this;
  }

  /**
   * Handles a character that appears between '<' and '>' (the '>' itself included).
   */
  private void appendInsideTag(char c) throws IOException {
    if (startHtmlTag) {
      // Nothing but '<' (and possibly '/') has been seen in this tag so far.
      if (c == '/') {
        // We are in a close tag. startHtmlTag stays set so that the next
        // character is still treated as the potential start of the tag name.
        inOpenTag = false;
        inCloseTag = true;
      } else {
        // This is the first non-'/' character in an HTML tag.
        startHtmlTag = false;
        if (isTagNameStartChar(c)) {
          inTagName = true;
          tagName.append(c);
        }
      }
    } else if (inTagName) {
      // We were last parsing the name of the HTML tag.
      if (isTagNameChar(c)) {
        tagName.append(c);
      } else {
        // The name is complete. This must run before the '>' check below
        // clears inOpenTag/inCloseTag, since it needs to know the tag kind.
        processTagName();
      }
    }
    if (c == '>') {
      // We are at the end of the tag.
      inOpenTag = false;
      inCloseTag = false;
      nonWsSeen = true;
    }
    stripLeadingWsAndEmptyLines(c);
  }

  /**
   * Handles a character that appears outside of any HTML tag (a tag-opening '<' included).
   */
  private void appendOutsideTag(char c) throws IOException {
    if (c == '<') {
      // Starting a new HTML tag.
      inOpenTag = true;
      startHtmlTag = true;
    }
    if (preScope > 0 || verbatimScope > 0 || textAreaScope > 0) {
      // In an HTML element that we want to preserve whitespace in.
      out.append(c);
    } else if (scriptScope > 0) {
      // Only remove leading whitespace and empty lines.
      stripLeadingWsAndEmptyLines(c);
    } else {
      stripAll(c);
    }
  }

  /**
   * Outputs {@code c} unless it is whitespace (or a linefeed) on a line where no non-whitespace
   * character has been seen yet. Whitespace after the first non-whitespace character of a line is
   * left untouched.
   */
  private void stripLeadingWsAndEmptyLines(char c) throws IOException {
    if (c == '\n') {
      // Only keep the linefeed if the line had content; then start a new line.
      if (nonWsSeen) {
        out.append(c);
      }
      nonWsSeen = false;
    } else if (isWs(c)) {
      if (nonWsSeen) {
        out.append(c);
      }
    } else {
      nonWsSeen = true;
      out.append(c);
    }
  }

  /**
   * Full stripping for content that is safe to remove whitespace from: drops leading and trailing
   * whitespace and empty lines, and collapses every run of whitespace to its first character.
   */
  private void stripAll(char c) throws IOException {
    if (c == '\n') {
      // We don't want blank lines so we don't output linefeed unless we
      // saw non-whitespace.
      if (nonWsSeen) {
        out.append(c);
      }
      // We don't want trailing whitespace.
      pendingWs = 0;
      nonWsSeen = false;
    } else if (isWs(c)) {
      // Leading whitespace is omitted. Otherwise remember only the first
      // whitespace character of the run; later ones are dropped.
      if (nonWsSeen && pendingWs == 0) {
        pendingWs = c;
      }
    } else {
      if (pendingWs != 0) {
        out.append(pendingWs);
        pendingWs = 0;
      }
      nonWsSeen = true;
      out.append(c);
    }
  }

  /**
   * Adds {@code inc} to {@code current}, never letting the result drop below zero.
   */
  private int updateScope(int current, int inc) {
    return Math.max(0, current + inc);
  }

  /**
   * Consumes the accumulated tag name and adjusts the nesting depth of the matching
   * whitespace-sensitive element: +1 for an opening tag, -1 for a closing tag.
   *
   * This code assumes well-formed HTML as input with HTML elements opening and closing properly in
   * the right order.
   */
  private void processTagName() {
    inTagName = false;
    String name = tagName.toString();
    tagName.setLength(0);
    int inc = inOpenTag ? 1 : -1;
    if ("textarea".equalsIgnoreCase(name)) {
      textAreaScope = updateScope(textAreaScope, inc);
    } else if ("pre".equalsIgnoreCase(name)) {
      preScope = updateScope(preScope, inc);
    } else if ("verbatim".equalsIgnoreCase(name)) {
      verbatimScope = updateScope(verbatimScope, inc);
    } else if ("script".equalsIgnoreCase(name)) {
      scriptScope = updateScope(scriptScope, inc);
    }
  }

  /**
   * Returns whether {@code c} may start a tag name (an ASCII letter).
   */
  private boolean isTagNameStartChar(char c) {
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
  }

  /**
   * Returns whether {@code c} may appear inside a tag name: an ASCII letter or digit, or one of
   * '_', '-', ':' and '.'. From W3C HTML spec.
   */
  private boolean isTagNameChar(char c) {
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || (c == '_')
        || (c == '-') || (c == ':') || (c == '.');
  }

  /**
   * Returns whether {@code c} is a space, tab or carriage return.
   *
   * Note, we treat '\n' as a separate special character as it has special rules since it determines
   * what a 'line' of content is for doing leading and trailing whitespace removal and empty line
   * removal.
   */
  private boolean isWs(char c) {
    return c == ' ' || c == '\t' || c == '\r';
  }
}