//===--- GrammarBNF.cpp - build grammar from BNF files ----------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang-pseudo/grammar/Grammar.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/FormatVariadic.h"
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
namespace clang {
namespace pseudo {
namespace {
static const llvm::StringRef OptSuffix = "_opt";
static const llvm::StringRef StartSymbol = "_";
// Builds grammar from BNF files.
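// The accepted syntax, as implemented by parse() and parseLine() below, is
// one rule per line. The rules in this sketch are illustrative only, not
// taken from the shipped grammar:
//
//   # A '#' starts a comment running to the end of the line.
//   expression := expression + term
//   parameter := type identifier_opt     # _opt marks the symbol as optional
//   declarator := ( declarator [guard] ) # [key] or [key=value] attaches an
//                                        # attribute to the preceding symbol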
class GrammarBuilder {
public:
GrammarBuilder(std::vector<std::string> &Diagnostics)
: Diagnostics(Diagnostics) {}
Grammar build(llvm::StringRef BNF) {
auto Specs = eliminateOptional(parse(BNF));
assert(llvm::all_of(Specs,
[](const RuleSpec &R) {
if (R.Target.ends_with(OptSuffix))
return false;
return llvm::all_of(
R.Sequence, [](const RuleSpec::Element &E) {
return !E.Symbol.ends_with(OptSuffix);
});
}) &&
"Optional symbols should be eliminated!");
auto T = std::make_unique<GrammarTable>();
// Assemble the name->ID and ID->nonterminal name maps.
llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
llvm::DenseSet<llvm::StringRef> UniqueAttributeValues;
for (uint16_t I = 0; I < NumTerminals; ++I)
SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
auto Consider = [&](llvm::StringRef Name) {
if (!SymbolIds.count(Name))
UniqueNonterminals.insert(Name);
};
for (const auto &Spec : Specs) {
Consider(Spec.Target);
for (const RuleSpec::Element &Elt : Spec.Sequence) {
Consider(Elt.Symbol);
for (const auto& KV : Elt.Attributes)
UniqueAttributeValues.insert(KV.second);
}
}
for (llvm::StringRef Name : UniqueNonterminals) {
T->Nonterminals.emplace_back();
T->Nonterminals.back().Name = Name.str();
}
assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
"Too many nonterminals to fit in SymbolID bits!");
llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L,
const GrammarTable::Nonterminal &R) {
return L.Name < R.Name;
});
    // Add the empty string as the sentinel value corresponding to an unset
    // attribute.
T->AttributeValues.push_back("");
UniqueAttributeValues.erase("");
for (llvm::StringRef Name : UniqueAttributeValues) {
T->AttributeValues.emplace_back();
T->AttributeValues.back() = Name.str();
}
llvm::sort(T->AttributeValues);
assert(T->AttributeValues.front() == "");
// Build name -> ID maps for nonterminals.
for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
// Convert the rules.
T->Rules.reserve(Specs.size());
std::vector<SymbolID> Symbols;
auto Lookup = [SymbolIds](llvm::StringRef Name) {
auto It = SymbolIds.find(Name);
assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
return It->second;
};
for (const auto &Spec : Specs) {
assert(Spec.Sequence.size() <= Rule::MaxElements);
Symbols.clear();
for (const RuleSpec::Element &Elt : Spec.Sequence)
Symbols.push_back(Lookup(Elt.Symbol));
T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
applyAttributes(Spec, *T, T->Rules.back());
}
assert(T->Rules.size() < (1 << RuleBits) &&
"Too many rules to fit in RuleID bits!");
const auto &SymbolOrder = getTopologicalOrder(T.get());
llvm::stable_sort(
T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
// Sorted by the topological order of the nonterminal Target.
return SymbolOrder[Left.Target] < SymbolOrder[Right.Target];
});
for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) {
return SymbolOrder[R.Target] < SymbolOrder[SID];
});
RuleID Start = StartIt - T->Rules.begin();
RuleID End = Start;
while (End < T->Rules.size() && T->Rules[End].Target == SID)
++End;
T->Nonterminals[SID].RuleRange = {Start, End};
}
Grammar G(std::move(T));
diagnoseGrammar(G);
return G;
}
// Gets topological order for nonterminal symbols.
//
  // The topological order is defined as: if a nonterminal B produces a
  // nonterminal A through a chain of single-element rules (in the simplest
  // case, there is a production rule B := A), then A is less than B.
  //
  // It returns the sort key for each symbol; the returned array is indexed by
  // SymbolID.
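  //
  // Illustrative sketch (the rules here are not from the shipped grammar):
  // given the single-element rules `ptr-expr := expr` and `expr := id-expr`,
  // the computed keys satisfy id-expr < expr < ptr-expr, so rules for id-expr
  // are laid out before rules for expr. A dependency cycle (e.g. `a := b` and
  // `b := a`) is reported as a diagnostic and the offending edge is simply
  // not followed.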
std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
for (const auto &Rule : T->Rules) {
// if A := B, A depends on B.
if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0]))
Dependencies.push_back({Rule.Target, Rule.Sequence[0]});
}
llvm::sort(Dependencies);
std::vector<SymbolID> Order;
    // Each nonterminal's visit state flows: NotVisited -> Visiting -> Visited.
enum State {
NotVisited,
Visiting,
Visited,
};
std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
if (VisitStates[SID] == Visited)
return;
if (VisitStates[SID] == Visiting) {
Diagnostics.push_back(
llvm::formatv("The grammar contains a cycle involving symbol {0}",
T->Nonterminals[SID].Name));
return;
}
VisitStates[SID] = Visiting;
for (auto It = llvm::lower_bound(Dependencies,
std::pair<SymbolID, SymbolID>{SID, 0});
It != Dependencies.end() && It->first == SID; ++It)
DFS(It->second);
VisitStates[SID] = Visited;
Order.push_back(SID);
};
for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID)
DFS(ID);
std::vector<unsigned> Result(T->Nonterminals.size(), 0);
for (size_t I = 0; I < Order.size(); ++I)
Result[Order[I]] = I;
return Result;
}
private:
// Text representation of a BNF grammar rule.
struct RuleSpec {
llvm::StringRef Target;
struct Element {
llvm::StringRef Symbol; // Name of the symbol
      // Attributes associated with this sequence symbol or with the rule.
std::vector<std::pair<llvm::StringRef/*Key*/, llvm::StringRef/*Value*/>>
Attributes;
};
std::vector<Element> Sequence;
std::string toString() const {
std::vector<llvm::StringRef> Body;
for (const auto &E : Sequence)
Body.push_back(E.Symbol);
return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " "));
}
};
std::vector<RuleSpec> parse(llvm::StringRef Lines) {
std::vector<RuleSpec> Specs;
for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
Line = Line.trim();
// Strip anything coming after the '#' (comment).
Line = Line.take_while([](char C) { return C != '#'; });
if (Line.empty())
continue;
RuleSpec Rule;
if (parseLine(Line, Rule))
Specs.push_back(std::move(Rule));
}
return Specs;
}
bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
auto Parts = Line.split(":=");
if (Parts.first == Line) { // no separator in Line
Diagnostics.push_back(
llvm::formatv("Failed to parse '{0}': no separator :=", Line).str());
return false;
}
Out.Target = Parts.first.trim();
Out.Sequence.clear();
for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
Chunk = Chunk.trim();
if (Chunk.empty())
continue; // skip empty
if (Chunk.starts_with("[") && Chunk.ends_with("]")) {
if (Out.Sequence.empty())
continue;
parseAttributes(Chunk, Out.Sequence.back().Attributes);
continue;
}
Out.Sequence.push_back({Chunk, /*Attributes=*/{}});
}
return true;
}
bool parseAttributes(
llvm::StringRef Content,
std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) {
assert(Content.starts_with("[") && Content.ends_with("]"));
auto KV = Content.drop_front().drop_back().split('=');
Out.push_back({KV.first, KV.second.trim()});
return true;
}
// Apply the parsed extensions (stored in RuleSpec) to the grammar Rule.
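  //
  // For example (an illustrative spec, not necessarily from the shipped
  // grammar), given
  //   compound-statement := { statement-seq [recover=Brackets] }
  // the "recover" attribute attaches to statement-seq (element index 1), so
  // R.RecoveryIndex becomes 1 and R.Recovery stores the ID of the "Brackets"
  // attribute value; a [guard] attribute anywhere in the sequence simply sets
  // R.Guarded.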
void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) {
auto LookupExtensionID = [&T](llvm::StringRef Name) {
const auto It = llvm::partition_point(
T.AttributeValues, [&](llvm::StringRef X) { return X < Name; });
assert(It != T.AttributeValues.end() && *It == Name &&
"Didn't find the attribute in AttrValues!");
return It - T.AttributeValues.begin();
};
for (unsigned I = 0; I < Spec.Sequence.size(); ++I) {
for (const auto &KV : Spec.Sequence[I].Attributes) {
if (KV.first == "guard") {
R.Guarded = true;
} else if (KV.first == "recover") {
R.Recovery = LookupExtensionID(KV.second);
R.RecoveryIndex = I;
} else {
Diagnostics.push_back(
llvm::formatv("Unknown attribute '{0}'", KV.first).str());
}
}
}
}
// Inlines all _opt symbols.
  // For example, for a rule E := id +_opt id, elimination yields two
  // equivalent rules:
// 1) E := id + id
// 2) E := id id
std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
std::vector<RuleSpec> Results;
std::vector<RuleSpec::Element> Storage;
for (const auto &R : Input) {
eliminateOptionalTail(
R.Sequence, Storage, [&Results, &Storage, &R, this]() {
if (Storage.empty()) {
Diagnostics.push_back(
llvm::formatv("Rule '{0}' has a nullable RHS", R.toString()));
return;
}
Results.push_back({R.Target, Storage});
});
assert(Storage.empty());
}
return Results;
}
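  // Recursively enumerates every way of keeping or dropping the remaining
  // _opt symbols in Elements, accumulating the current choice in Result and
  // invoking CB once for each fully expanded sequence.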
void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
std::vector<RuleSpec::Element> &Result,
llvm::function_ref<void()> CB) {
if (Elements.empty())
return CB();
auto Front = Elements.front();
if (!Front.Symbol.ends_with(OptSuffix)) {
Result.push_back(std::move(Front));
eliminateOptionalTail(Elements.drop_front(1), Result, CB);
Result.pop_back();
return;
}
// Enumerate two options: skip the opt symbol, or inline the symbol.
eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
Front.Symbol = Front.Symbol.drop_back(OptSuffix.size()); // drop "_opt"
Result.push_back(std::move(Front));
eliminateOptionalTail(Elements.drop_front(1), Result, CB);
Result.pop_back();
}
  // Diagnoses the grammar and emits warnings, if any.
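  //
  // Current checks: nonterminals with no rules, nonterminal names made up
  // solely of uppercase letters (they look like terminals), duplicate rules,
  // and nonterminals (other than the start symbol "_") that are never
  // referenced by any rule.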
void diagnoseGrammar(const Grammar &G) {
const auto &T = G.table();
for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
auto Range = T.Nonterminals[SID].RuleRange;
if (Range.Start == Range.End)
Diagnostics.push_back(
llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID)));
llvm::StringRef NameRef = T.Nonterminals[SID].Name;
if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
Diagnostics.push_back(llvm::formatv(
"Token-like name {0} is used as a nonterminal", G.symbolName(SID)));
}
}
llvm::DenseSet<llvm::hash_code> VisitedRules;
for (RuleID RID = 0; RID < T.Rules.size(); ++RID) {
const auto &R = T.Rules[RID];
auto Code = llvm::hash_combine(
R.Target, llvm::hash_combine_range(R.seq().begin(), R.seq().end()));
auto [_, New] = VisitedRules.insert(Code);
if (!New)
Diagnostics.push_back(
llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID)));
}
    // Nonterminal symbol-id -> use count.
std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
for (const Rule &R : T.Rules)
for (SymbolID SID : R.seq())
if (isNonterminal(SID))
++UseCounts[SID];
for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
Diagnostics.push_back(
llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID)));
}
std::vector<std::string> &Diagnostics;
};
} // namespace
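// Typical usage (an illustrative sketch; BNFText is a hypothetical variable
// holding the grammar source):
//   std::vector<std::string> Diags;
//   Grammar G = Grammar::parseBNF(BNFText, Diags);
//   if (!Diags.empty())
//     ...; // report warnings/errors gathered while building the grammar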
Grammar Grammar::parseBNF(llvm::StringRef BNF,
std::vector<std::string> &Diagnostics) {
Diagnostics.clear();
return GrammarBuilder(Diagnostics).build(BNF);
}
} // namespace pseudo
} // namespace clang