blob: 7c083bbf249840a798c0d8057c59d4f03b02576e [file] [log] [blame]
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// -*- Mode: C++ -*-
//
// Copyright (C) 2021 Google, Inc.
//
// Author: Giuliano Procida
/// @file
///
/// This file contains ABI XML manipulation routines and a main driver.
///
/// The libxml Tree API is used. The XPath API is not used as it proved
/// to be many times slower than direct traversal but only slightly more
/// convenient.
#include <fcntl.h>
#include <unistd.h>
#include <cstring>
#include <functional>
#include <ios>
#include <iostream>
#include <map>
#include <optional>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include <libxml/globals.h>
#include <libxml/parser.h>
#include <libxml/tree.h>
/// Cast a C string to a libxml string.
///
/// @param str the C string (pointer)
///
/// @return the same thing, as a type compatible with the libxml API
static const xmlChar*
to_libxml(const char* str)
{
return reinterpret_cast<const xmlChar*>(str);
}
/// Cast a libxml string to C string.
///
/// @param str the libxml string (pointer)
///
/// @return the same thing, as a type compatible with the C library API
static const char*
from_libxml(const xmlChar* str)
{
return reinterpret_cast<const char*>(str);
}
/// Remove a node from its document and free its storage.
///
/// @param node the node to remove
static void
remove_node(xmlNodePtr node)
{
xmlUnlinkNode(node);
xmlFreeNode(node);
}
/// Remove an XML element and any immediately preceding comment.
///
/// @param node the element to remove
static void
remove_element(xmlNodePtr node)
{
xmlNodePtr previous_node = node->prev;
if (previous_node && previous_node->type == XML_COMMENT_NODE)
remove_node(previous_node);
remove_node(node);
}
/// Get child nodes of given node.
///
/// @param node the node whose children to fetch
///
/// @return a vector of child nodes
static std::vector<xmlNodePtr>
get_children(xmlNodePtr node)
{
std::vector<xmlNodePtr> result;
for (xmlNodePtr child = node->children; child; child = child->next)
result.push_back(child);
return result;
}
/// Fetch an attribute from a node.
///
/// @param node the node
///
/// @param name the attribute name
///
/// @return the attribute value, if present
static std::optional<std::string>
get_attribute(xmlNodePtr node, const char* name)
{
std::optional<std::string> result;
xmlChar* attribute = xmlGetProp(node, to_libxml(name));
if (attribute)
{
result = from_libxml(attribute);
xmlFree(attribute);
}
return result;
}
/// Store an attribute value.
///
/// @param node the node
///
/// @param name the attribute name
///
/// @param value the attribute value, optionally
static void
set_attribute(xmlNodePtr node, const char* name,
const std::optional<std::string>& value)
{
if (value)
xmlSetProp(node, to_libxml(name), to_libxml(value.value().c_str()));
else
xmlUnsetProp(node, to_libxml(name));
}
/// Remove text nodes, recursively.
///
/// This simplifies subsequent analysis and manipulation. Removing and
/// moving elements will destroy formatting anyway. The only remaining
/// node types should be elements and comments.
///
/// @param node the node to process
static void
strip_text(xmlNodePtr node)
{
if (node->type == XML_TEXT_NODE)
remove_node(node);
else if (node->type == XML_ELEMENT_NODE)
for (xmlNodePtr child : get_children(node))
strip_text(child);
}
/// Add text before / after a node.
///
/// @param node the node
///
/// @param after whether the next should go after
///
/// @param text the text
static void
add_text(xmlNodePtr node, bool after, const std::string& text)
{
xmlNodePtr text_node = xmlNewTextLen(to_libxml(text.data()), text.size());
if (after)
xmlAddNextSibling(node, text_node);
else
xmlAddPrevSibling(node, text_node);
}
/// Format an XML elementy by adding internal indentation and newlines.
///
/// This makes the XML readable.
///
/// @param indentation what to add to the line indentation prefix
///
/// @param prefix the current line indentation prefix
///
/// @param node the node to format
static void
format_xml(const std::string& indentation, std::string prefix, xmlNodePtr node)
{
std::vector<xmlNodePtr> children = get_children(node);
if (children.empty())
return;
// The ordering of operations here is incidental. The outcomes we want
// are: 1. an extra newline after the opening tag and indentation of
// the closing tag to match, and 2. indentation and newline for each
// child.
add_text(children[0], false, "\n");
add_text(children[children.size() - 1], true, prefix);
prefix += indentation;
for (xmlNodePtr child : children)
{
add_text(child, false, prefix);
format_xml(indentation, prefix, child);
add_text(child, true, "\n");
}
}
/// Rewrite attributes using single quotes.
///
/// libxml uses double quotes but libabigail uses single quotes.
///
/// Note that libabigail does not emit attributes *containing* single
/// quotes and if it did it would escape them as &quot; which libxml
/// would in turn preserve. However, the code here will handle all forms
/// of quotes, conservatively.
///
/// Annotation comments can contain single quote characters so just
/// checking for any single quotes at all is insufficiently precise.
///
/// @param start a pointer to the start of the XML text
///
/// @param limit a pointer to just past the end of the XML text
static void
adjust_quotes(xmlChar* start, xmlChar* limit)
{
const std::string open{"<!--"};
const std::string close{"-->"};
while (start < limit)
{
// Look for a '<'
start = std::find(start, limit, '<');
if (start == limit)
break;
if (start + open.size() < limit
&& std::equal(open.begin(), open.end(), start))
{
// Have a comment, skip to the end.
start += open.size();
xmlChar* end = std::search(start, limit, close.begin(), close.end());
if (end == limit)
break;
start = end + close.size();
}
else
{
// Have some tag, search for the end.
start += 1;
xmlChar* end = std::find(start, limit, '>');
if (end == limit)
break;
// In general, inside a tag we could find either ' or " being
// used to quote attributes and the other quote character
// being used as part of the attribute data. However, libxml's
// xmlDocDump* functions use " to quote attributes and it's
// safe to substitute this quote character with ' so long as '
// does not appear within the attribute data.
if (std::find(start, end, '\'') == end)
for (xmlChar* c = start; c < end; ++c)
if (*c == '"')
*c = '\'';
start = end + 1;
}
}
}
static const std::set<std::string> DROP_IF_EMPTY = {
"elf-variable-symbols",
"elf-function-symbols",
"namespace-decl",
"abi-instr",
"abi-corpus",
"abi-corpus-group",
};
/// Drop empty elements, if safe to do so, recursively.
///
/// @param element node to process
static void
drop_empty(xmlNodePtr node)
{
if (node->type != XML_ELEMENT_NODE)
return;
for (xmlNodePtr child : get_children(node))
drop_empty(child);
// Do not drop the root element, even if empty.
if (node->parent->type == XML_DOCUMENT_NODE)
return;
if (!node->children && DROP_IF_EMPTY.count(from_libxml(node->name)))
remove_element(node);
}
/// Prune unreachable elements.
///
/// Reachability is defined to be union of contains, containing and
/// refers-to relationships for types, declarations and symbols. The
/// roots for reachability are the ELF elements in the ABI.
///
/// @param document the XML document to process
static void
prune_unreachable(xmlDocPtr document)
{
// ELF symbol names.
std::set<std::string> elf_symbols;
// Simple way of allowing two kinds of nodes: false=>type,
// true=>symbol.
using vertex_t = std::pair<bool, std::string>;
// Graph vertices.
std::set<vertex_t> vertices;
// Graph edges.
std::map<vertex_t, std::set<vertex_t>> edges;
// Keep track of type / symbol nesting so we can identify contains,
// containing and refers-to relationships.
std::vector<vertex_t> stack;
// Process an XML node, adding a vertex and possibly some edges.
std::function<void(xmlNodePtr)> process_node = [&](xmlNodePtr node) {
// We only care about elements and not comments, at this stage.
if (node->type != XML_ELEMENT_NODE)
return;
// Is this an ELF symbol?
if (!strcmp(from_libxml(node->name), "elf-symbol"))
{
const auto name = get_attribute(node, "name");
if (name)
elf_symbols.insert(name.value());
// Early return is safe, but not necessary.
return;
}
// Is this a type? Note that the same id may appear multiple times.
const auto id = get_attribute(node, "id");
if (id)
{
vertex_t type_vertex{false, id.value()};
vertices.insert(type_vertex);
const auto naming_typedef_id = get_attribute(node, "naming-typedef-id");
if (naming_typedef_id)
{
// This is an odd one, there can be a backwards link from an
// anonymous type to the typedef that refers to it. We could
// either remove these links when pruning a typedef but it's
// simpler just to pull in the typedef, even if nothing else
// refers to it.
vertex_t naming_typedef_vertex{false, naming_typedef_id.value()};
edges[type_vertex].insert(naming_typedef_vertex);
}
if (!stack.empty())
{
// Parent<->child dependencies; record dependencies both ways to
// avoid holes in XML types and declarations.
const auto& parent = stack.back();
edges[parent].insert(type_vertex);
edges[type_vertex].insert(parent);
}
// Record the type.
stack.push_back(type_vertex);
}
// Is this a (declaration expected to be linked to a) symbol?
const auto symbol = get_attribute(node, "mangled-name");
if (symbol)
{
vertex_t symbol_vertex{true, symbol.value()};
vertices.insert(symbol_vertex);
if (!stack.empty())
{
// Parent<->child dependencies; record dependencies both ways to
// avoid making holes in XML types and declarations.
//
// Symbols exist outside of the type hierarchy, so choosing to make
// them depend on a containing type scope and vice versa is
// conservative and probably not necessary.
const auto& parent = stack.back();
edges[parent].insert(symbol_vertex);
edges[symbol_vertex].insert(parent);
}
// Record the symbol.
stack.push_back(symbol_vertex);
// In practice there will be at most one symbol on the stack; we could
// verify this here, but it wouldn't achieve anything.
}
// Being both would make the stack ordering ambiguous.
if (id && symbol)
{
std::cerr << "cannot handle element which is both type and symbol\n";
exit(1);
}
// Is there a reference to another type?
const auto type_id = get_attribute(node, "type-id");
if (type_id && !stack.empty())
{
// The enclosing type or symbol refers to another type.
const auto& parent = stack.back();
vertex_t type_id_vertex{false, type_id.value()};
edges[parent].insert(type_id_vertex);
}
// Process recursively.
for (auto child : get_children(node))
process_node(child);
// Restore the stack.
if (symbol)
stack.pop_back();
if (id)
stack.pop_back();
};
// Traverse the whole XML document and build a graph.
for (xmlNodePtr node = document->children; node; node = node->next)
process_node(node);
// Simple DFS.
std::set<vertex_t> seen;
std::function<void(vertex_t)> dfs = [&](vertex_t vertex) {
if (!seen.insert(vertex).second)
return;
auto it = edges.find(vertex);
if (it != edges.end())
for (auto to : it->second)
dfs(to);
};
// Traverse the graph, starting from the ELF symbols.
for (const auto& symbol : elf_symbols)
{
vertex_t symbol_vertex{true, symbol};
if (vertices.count(symbol_vertex))
dfs(symbol_vertex);
else
std::cerr << "no declaration found for ELF symbol " << symbol << '\n';
}
// This is a DFS with early stopping.
std::function<void(xmlNodePtr)> remove_unseen = [&](xmlNodePtr node) {
if (node->type != XML_ELEMENT_NODE)
return;
// Return if we know that this is a type to keep or drop in its
// entirety.
const auto id = get_attribute(node, "id");
if (id)
{
if (!seen.count(vertex_t{false, id.value()}))
remove_element(node);
return;
}
// Return if we know that this is a declaration to keep or drop in
// its entirety. Note that var-decl and function-decl are the only
// elements that can have a mangled-name attribute.
const char* node_name = from_libxml(node->name);
if (!strcmp(node_name, "var-decl") || !strcmp(node_name, "function-decl"))
{
const auto symbol = get_attribute(node, "mangled-name");
if (!(symbol && seen.count(vertex_t{true, symbol.value()})))
remove_element(node);
return;
}
// Otherwise, this is not a type, declaration or part thereof, so
// process child elements.
for (auto child : get_children(node))
remove_unseen(child);
};
// Traverse the document, removing unseen elements.
for (xmlNodePtr node = document->children; node; node = node->next)
remove_unseen(node);
}
static const std::map<std::string, std::string> ANONYMOUS_TYPE_NAMES = {
{"enum-decl", "__anonymous_enum__"},
{"class-decl", "__anonymous_struct__"},
{"union-decl", "__anonymous_union__"},
};
/// Normalise anonymous type names by removing the numerical suffix.
///
/// Anonymous type names take the form __anonymous_foo__N where foo is
/// one of enum, struct or union and N is an optional numerical suffix.
/// The suffices are senstive to processing order and do not convey
/// useful ABI information. They can cause spurious harmless diffs and
/// make XML diffing and rebasing harder.
///
/// @param node the XML node to process
static void
normalise_anonymous_type_names(xmlNodePtr node)
{
if (node->type != XML_ELEMENT_NODE)
return;
const auto it = ANONYMOUS_TYPE_NAMES.find(from_libxml(node->name));
if (it != ANONYMOUS_TYPE_NAMES.end())
if (const auto attribute = get_attribute(node, "name"))
{
const auto& anon = it->second;
const auto& name = attribute.value();
// __anonymous_foo__123 -> __anonymous_foo__
if (!name.compare(0, anon.size(), anon) &&
name.find_first_not_of("0123456789", anon.size()) == name.npos)
set_attribute(node, "name", anon);
}
for (auto child : get_children(node))
normalise_anonymous_type_names(child);
}
/// Main program.
///
/// Read and write ABI XML, with optional extra processing passes.
///
/// @param argc argument count
///
/// @param argv atgument vector
///
/// @return exit status
int
main(int argc, char* argv[])
{
// Defaults.
const char* opt_input = NULL;
const char* opt_output = NULL;
int opt_indentation = 2;
bool opt_drop_empty = false;
bool opt_prune_unreachable = false;
bool opt_normalise_anonymous = false;
// Process command line.
auto usage = [&]() -> int {
std::cerr << "usage: " << argv[0]
<< " [-i|--input file]"
<< " [-o|--output file]"
<< " [-I|--indentation n]"
<< " [-a|--all]"
<< " [-d|--[no-]drop-empty]"
<< " [-p|--[no-]prune-unreachable]"
<< " [-n|--[no-]normalise-anonymous]"
<< '\n';
return 1;
};
int opt_index = 1;
auto get_arg = [&]() {
if (opt_index < argc)
return argv[opt_index++];
exit(usage());
};
while (opt_index < argc)
{
const char* arg = get_arg();
if (!strcmp(arg, "-i") || !strcmp(arg, "--input"))
opt_input = get_arg();
else if (!strcmp(arg, "-o") || !strcmp(arg, "--output"))
opt_output = get_arg();
else if (!strcmp(arg, "-I") || !strcmp(arg, "--indentation"))
{
std::istringstream is(get_arg());
is >> std::noskipws >> opt_indentation;
if (!is || !is.eof() || opt_indentation < 0)
exit(usage());
}
else if (!strcmp(arg, "-a") || !strcmp(arg, "--all"))
opt_drop_empty = opt_prune_unreachable = opt_normalise_anonymous = true;
else if (!strcmp(arg, "-d") || !strcmp(arg, "--drop-empty"))
opt_drop_empty = true;
else if (!strcmp(arg, "--no-drop-empty"))
opt_drop_empty = false;
else if (!strcmp(arg, "-p") || !strcmp(arg, "--prune-unreachable"))
opt_prune_unreachable = true;
else if (!strcmp(arg, "--no-prune-unreachable"))
opt_prune_unreachable = false;
else if (!strcmp(arg, "-n") || !strcmp(arg, "--normalise-anonymous"))
opt_normalise_anonymous = true;
else if (!strcmp(arg, "--no-normalise-anonymous"))
opt_normalise_anonymous = false;
else
exit(usage());
}
// Open input for reading.
int in_fd = STDIN_FILENO;
if (opt_input)
{
in_fd = open(opt_input, O_RDONLY);
if (in_fd < 0)
{
std::cerr << "could not open '" << opt_input << "' for reading: "
<< strerror(errno) << '\n';
exit(1);
}
}
// Read the XML.
xmlParserCtxtPtr parser_context = xmlNewParserCtxt();
xmlDocPtr document = xmlCtxtReadFd(parser_context, in_fd, NULL, NULL, 0);
if (!document)
{
std::cerr << "failed to parse input as XML\n";
exit(1);
}
xmlFreeParserCtxt(parser_context);
close(in_fd);
// Strip text nodes to simplify other operations.
for (xmlNodePtr node = document->children; node; node = node->next)
strip_text(node);
// Normalise anonymous type names.
if (opt_normalise_anonymous)
for (xmlNodePtr node = document->children; node; node = node->next)
normalise_anonymous_type_names(node);
// Prune unreachable elements.
if (opt_prune_unreachable)
prune_unreachable(document);
// Drop empty elements.
if (opt_drop_empty)
for (xmlNodePtr node = document->children; node; node = node->next)
drop_empty(node);
// Reformat XML for human consumption.
for (xmlNodePtr node = document->children; node; node = node->next)
format_xml(std::string(opt_indentation, ' '), std::string(), node);
// Open output for writing.
int out_fd = STDOUT_FILENO;
if (opt_output)
{
out_fd = open(opt_output, O_CREAT | O_TRUNC | O_WRONLY,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
if (out_fd < 0)
{
std::cerr << "could not open '" << opt_output << "' for writing: "
<< strerror(errno) << '\n';
exit(1);
}
}
// Write the XML.
//
// First to memory, as we need to do a little post-processing.
xmlChar* out_data;
int out_size;
xmlDocDumpMemory(document, &out_data, &out_size);
// Remove the XML declaration as this is not currently accepted by abidiff.
xmlChar* out_limit = out_data + out_size;
while (out_data < out_limit && *out_data != '\n')
++out_data;
if (out_data < out_limit)
++out_data;
// Adjust quotes to match abidw.
adjust_quotes(out_data, out_limit);
// And now to a file.
size_t count = out_limit - out_data;
if (write(out_fd, out_data, count) != count)
{
std::cerr << "could not write output: " << strerror(errno) << '\n';
exit(1);
}
if (close(out_fd) < 0)
{
std::cerr << "could not close output: " << strerror(errno) << '\n';
exit(1);
}
// Free libxml document.
xmlFreeDoc(document);
return 0;
}