| /* |
| * Copyright 2014 The Kythe Authors. All rights reserved. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // Package html is a set of utilities for manipulating html Nodes. |
| package html |
| |
| import ( |
| "bytes" |
| "fmt" |
| "log" |
| |
| "golang.org/x/net/html" |
| ) |
| |
| // Decoration is a template for an HTML node that should span some textual |
| // offsets. |
| type Decoration struct { |
| // 0-based character offsets for the span of the decoration. |
| Start, End int |
| |
| // Template to use as the HTML decoration Node. |
| Node *html.Node |
| } |
| |
| // Decorate will apply the given slice of Decorations to the HTML tree starting |
| // at root. |
| func Decorate(root *html.Node, decor []Decoration) { |
| offsets := textualOffsets(root) |
| |
| for _, d := range decor { |
| nodes := nodeRange(root, offsets, d.Start, d.End) |
| |
| decorNode := copyNode(d.Node) |
| parent := nodes[0].Parent |
| parent.InsertBefore(decorNode, nodes[0]) |
| for _, n := range nodes { |
| parent.RemoveChild(n) |
| decorNode.AppendChild(n) |
| } |
| offsets.update(decorNode, d.Start) |
| } |
| } |
| |
| // nodeOffsets is a structure storing the 0-based character offsets at the |
| // beginning and ending of html.Nodes. |
| type nodeOffsets struct { |
| starts map[*html.Node]int |
| ends map[*html.Node]int |
| } |
| |
| // Bounds returns the starting and ending offset of n. |
| func (o *nodeOffsets) Bounds(n *html.Node) (int, int) { |
| return o.starts[n], o.ends[n] |
| } |
| |
| // textualOffsets returns nodeOffsets for every html.Node in the tree starting |
| // at root. |
| func textualOffsets(root *html.Node) *nodeOffsets { |
| offsets := &nodeOffsets{make(map[*html.Node]int), make(map[*html.Node]int)} |
| offsets.update(root, 0) |
| return offsets |
| } |
| |
| // update updates the nodeOffsets for every html.Node starting at root, assuming |
| // offset is the starting offset for the root html.Node. |
| func (o *nodeOffsets) update(root *html.Node, offset int) int { |
| o.starts[root] = offset |
| |
| if root.Type == html.TextNode { |
| offset += len(root.Data) |
| } else { |
| for n := root.FirstChild; n != nil; n = n.NextSibling { |
| offset = o.update(n, offset) |
| } |
| } |
| |
| o.ends[root] = offset |
| return offset |
| } |
| |
| // nodeRange returns an ordered slice of sibling html.Nodes that span exactly |
| // the range between the given start and end offsets. If necessary, nodeRange |
| // will split html.Nodes so that the returned range is precise. |
| func nodeRange(root *html.Node, offsets *nodeOffsets, start, end int) []*html.Node { |
| if rootStart, rootEnd := offsets.Bounds(root); rootStart > start || rootEnd < end { |
| log.Fatalf("nodeRange: root %d%q Node (%d → %d) does not contain range %d → %d", |
| root.Type, root.Data, rootStart, rootEnd, start, end) |
| } |
| |
| var nodes []*html.Node |
| for n := root.FirstChild; n != nil; n = n.NextSibling { |
| nStart, nEnd := offsets.Bounds(n) |
| if nStart < end && nEnd >= start { |
| nodes = append(nodes, n) |
| } else if nStart > end { |
| break |
| } |
| } |
| |
| if len(nodes) == 1 && nodes[0] != root { |
| return nodeRange(nodes[0], offsets, start, end) |
| } else if len(nodes) == 0 { |
| nodes = []*html.Node{root} |
| } |
| |
| // Slice end nodes, if necessary |
| if rangeStart, _ := offsets.Bounds(nodes[0]); start != rangeStart { |
| _, m := sliceNode(offsets, nodes[0], start) |
| nodes = append([]*html.Node{m}, nodes[1:]...) |
| } |
| if rangeEnd, _ := offsets.Bounds(nodes[len(nodes)-1]); end != rangeEnd { |
| n, _ := sliceNode(offsets, nodes[len(nodes)-1], end) |
| nodes = append(nodes[:len(nodes)-1], n) |
| } |
| |
| return nodes |
| } |
| |
| // sliceNode returns the two halves of the HTML tree starting at node after |
| // splitting it at the given textual offset. |
| func sliceNode(offsets *nodeOffsets, node *html.Node, offset int) (*html.Node, *html.Node) { |
| origStart, origEnd := offsets.Bounds(node) |
| if origStart > offset || origEnd < offset { |
| log.Fatalf("sliceNode: offset %d out of node's span (%d → %d)", offset, origStart, origEnd) |
| } |
| |
| n, m := copyNode(node), copyNode(node) |
| parent := node.Parent |
| if parent != nil { |
| parent.InsertBefore(n, node) |
| parent.InsertBefore(m, node) |
| parent.RemoveChild(node) |
| } |
| |
| switch node.Type { |
| default: |
| log.Fatalf("Unhandled node kind: %d", node.Type) |
| case html.ElementNode: |
| child := node.FirstChild |
| for child != nil { |
| next := child.NextSibling |
| |
| if _, end := offsets.Bounds(child); end <= offset { |
| node.RemoveChild(child) |
| n.AppendChild(child) |
| } else if start, _ := offsets.Bounds(child); start > offset { |
| node.RemoveChild(child) |
| m.AppendChild(child) |
| } else { |
| left, right := sliceNode(offsets, child, offset) |
| node.RemoveChild(left) |
| node.RemoveChild(right) |
| n.AppendChild(left) |
| m.AppendChild(right) |
| } |
| |
| child = next |
| } |
| case html.TextNode: |
| mark := offset - origStart |
| n.Data = node.Data[:mark] |
| m.Data = node.Data[mark:] |
| } |
| |
| if split := offsets.update(n, origStart); split != offset { |
| log.Fatalf("split %d ≠ %d", split, offset) |
| } |
| if newEnd := offsets.update(m, offset); newEnd != origEnd { |
| log.Fatalf("end %d ≠ %d", newEnd, origEnd) |
| } |
| |
| return n, m |
| } |
| |
| // PlainText returns the concatenation of the textual contents for the given |
| // html Nodes. |
| func PlainText(nodes ...*html.Node) string { |
| var text bytes.Buffer |
| for _, n := range nodes { |
| switch n.Type { |
| default: |
| log.Fatalf("PlainText: unhandled node kind: %d", n.Type) |
| case html.DocumentNode, html.ElementNode: |
| for child := n.FirstChild; child != nil; child = child.NextSibling { |
| fmt.Fprint(&text, PlainText(child)) |
| } |
| case html.TextNode: |
| fmt.Fprintf(&text, n.Data) |
| } |
| } |
| return text.String() |
| } |
| |
| // copyNode returns a shallow copy of n excluding sibling/parent/child pointers. |
| func copyNode(n *html.Node) *html.Node { |
| return &html.Node{ |
| Type: n.Type, |
| Data: n.Data, |
| DataAtom: n.DataAtom, |
| Namespace: n.Namespace, |
| Attr: n.Attr, |
| } |
| } |
| |
| // Zip returns the Node at the end of the specified path where path contains |
| // only the following characters: |
| // 'u' Parent |
| // 'f' FirstChild |
| // 'l' LastChild |
| // 'n' NextSibling |
| // 'p' PrevSibling |
| func Zip(n *html.Node, path string) (*html.Node, error) { |
| for i, step := range path { |
| if n == nil { |
| return nil, fmt.Errorf("Ran into nil Node after %d steps: %q", i, path) |
| } |
| switch step { |
| case 'f': |
| n = n.FirstChild |
| case 'l': |
| n = n.LastChild |
| case 'n': |
| n = n.NextSibling |
| case 'p': |
| n = n.PrevSibling |
| case 'u': |
| n = n.Parent |
| default: |
| return nil, fmt.Errorf("Invalid zip path (%q) rune: %q", path, step) |
| } |
| } |
| return n, nil |
| } |
| |
| // MustZip delegates to Zip and panics on any error. |
| func MustZip(n *html.Node, path string) *html.Node { |
| res, err := Zip(n, path) |
| if err != nil { |
| panic(err) |
| } |
| return res |
| } |