#include <Python.h>
#include <iostream>
#include "torch/csrc/jit/fusion_compiler.h"
#include "torch/csrc/jit/code_template.h"
#include "torch/csrc/assertions.h"
#include "torch/csrc/jit/ir.h"
#include "torch/csrc/jit/attributes.h"
#include "torch/csrc/jit/interned_strings.h"
#include <vector>
#include "torch/csrc/jit/interpreter.h"
namespace torch { namespace jit {
// helper struct for building Graphs in tests
struct Var {
Var() : v(nullptr) {}
Var(Value * v) : v(v) {}
static Var Input(Graph & g, std::string name = "") {
return g.addInput(name);
}
void addAsOutput() {
v->owningGraph()->registerOutput(v);
}
static Var cat(ArrayRef<Var> inputs, int32_t dim) {
Node* n;
auto r = create(kcat, inputs, 1, &n)[0];
n->i_(kdim, dim);
return r;
}
static std::vector<Var> create(Symbol kind, ArrayRef<Var> inputs,
int num_outputs = 1,
Node** created_node = nullptr,
Graph * g = nullptr) {
if(g == nullptr) {
g = inputs.at(0).value()->owningGraph();
}
Node * n = g->appendNode(g->create(kind, num_outputs));
for(auto i : inputs) {
n->addInput(i.value());
}
if(created_node) {
*created_node = n;
}
std::vector<Var> out;
for(auto v : n->outputs()) {
out.emplace_back(v);
}
return out;
}
Var operator*(Var rhs) {
return create(kmul, {*this, rhs})[0];
}
Var operator+(Var rhs) {
Node * n;
auto r = create(kadd, {*this, rhs}, 1, &n)[0];
n->t_(kalpha, at::Scalar(1).toTensor());
return r;
}
Var mm(Var rhs) {
return create(s("mm"), {*this, rhs})[0];
}
Var sigmoid() {
return create(ksigmoid, {*this})[0];
}
Var tanh() {
return create(ktanh, {*this})[0];
}
std::vector<Var> chunk(int32_t chunks, uint32_t dim) {
Node * n;
auto r = create(s("chunk"), { *this }, chunks, &n);
n->i_(s("chunks"), chunks)
->i_(s("dim"), dim);
return r;
}
Value * value() const {
return v;
}
private:
static Symbol s(const char * s_) {
return Symbol(s_);
}
Value * v;
};
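// Illustrative sketch (not part of the tests): the Var helpers above let test
// code build graphs with ordinary expression syntax, e.g.
//   Graph g;
//   Var a = Var::Input(g);
//   Var b = Var::Input(g);
//   (a * b + a).addAsOutput();  // appends mul and add nodes, then registers the result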
template<typename T>
static std::ostream & operator<<(std::ostream & out, const std::vector<T> & list) {
size_t i = 0;
out << "{";
for(auto && e : list) {
if(i++ > 0)
out << ", ";
out << e;
}
out << "}";
return out;
}
static auto ct = CodeTemplate(R"(
int foo($args) {
$bar
$bar
$a+$b
}
int commatest(int a${,stuff})
int notest(int a${,empty,})
)");
static auto ct_expect = R"(
int foo(hi, 8) {
what
on many
lines...
7
what
on many
lines...
7
3+4
}
int commatest(int a, things..., others)
int notest(int a)
)";
static void codeTemplateTest() {
{
TemplateEnv e;
e.s("hi","foo");
e.v("what",{"is","this"});
TemplateEnv c(e);
c.s("hi","foo2");
JIT_ASSERT(e.s("hi") == "foo");
JIT_ASSERT(c.s("hi") == "foo2");
JIT_ASSERT(e.v("what")[0] == "is");
}
{
TemplateEnv e;
e.v("args",{"hi","8"});
e.v("bar",{"what\non many\nlines...","7"});
e.s("a","3");
e.s("b","4");
e.v("stuff",{"things...","others"});
e.v("empty",{});
auto s = ct.format(e);
//std::cout << "'" << s << "'\n";
//std::cout << "'" << ct_expect << "'\n";
JIT_ASSERT(s == ct_expect);
}
}
Value * appendNewNode(NodeKind kind, Graph& graph, ArrayRef<Value*> inputs) {
return graph.appendNode(graph.create(kind,inputs))->output();
}
static void fusionTests() {
FusionCompiler comp;
auto testSimple = [&] {
Graph graph;
Var i0 = Var::Input(graph);
Var i1 = Var::Input(graph);
auto o0 = i0 * i1;
o0.addAsOutput();
auto a = at::CUDA(at::kFloat).rand({3,4});
auto b = at::CUDA(at::kFloat).rand({4,3}).transpose(0,1);
auto o = at::CUDA(at::kFloat).zeros({3,4});
comp.debugLaunchGraph(graph, 0, {a,b}, {o});
auto o2 = a*b;
float max_diff = (o2 - o).abs().max().toCDouble();
//std::cout << "max diff: " << max_diff << "\n";
JIT_ASSERT(max_diff == 0);
};
testSimple();
auto testOne = [&](int ti, int tj, int toi, int toj) {
Graph graph;
Var i0 = Var::Input(graph);
Var i1 = Var::Input(graph);
Var i2 = Var::Input(graph);
Var i3 = Var::Input(graph);
Var i4 = Var::Input(graph);
auto p22 = i4.sigmoid();
auto p20 = i3.sigmoid();
auto p18 = i2.tanh();
auto p16 = i1.sigmoid();
auto p14 = p20 * i0;
auto p11 = p22 * p18;
auto o1 = p14 + p11;
auto p5 = o1.tanh();
auto o0 = p16 * p5;
o0.addAsOutput();
o1.addAsOutput();
graph.lint();
std::vector<at::Tensor> inputs;
std::vector<at::Tensor> outputs;
// We want to generate input/output tensors with dimensions 128x128x32, but
// with different internal strides. To do this, we generate a tensor
// with the "wrong" dimensions, and then use transpose to get an appropriately
// sized view.
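// For example, testOne(1, 2, ...) allocates inputs of size {128, 32, 128} and
// then transposes dims 1 and 2, producing non-contiguous 128x128x32 views.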
for(size_t i = 0; i < graph.inputs().size(); i++) {
std::vector<int64_t> dims = {128, 128, 32};
std::swap(dims[ti],dims[tj]);
inputs.push_back(at::CUDA(at::kFloat).rand(dims).transpose(ti, tj));
}
for(size_t i = 0; i < graph.outputs().size(); i++) {
std::vector<int64_t> dims = {128, 128, 32};
std::swap(dims[toi],dims[toj]);
outputs.push_back(at::CUDA(at::kFloat).zeros(dims).transpose(toi,toj));
}
auto t22 = inputs[4].sigmoid();
auto t20 = inputs[3].sigmoid();
auto t18 = inputs[2].tanh();
auto t16 = inputs[1].sigmoid();
auto t14 = t20*inputs[0];
auto t11 = t22*t18;
auto out1 = t14+t11;
auto t5 = out1.tanh();
auto out0 = t16*t5;
//auto out0 = inputs[0]*inputs[1];
comp.debugLaunchGraph(graph, 0, inputs, outputs);
JIT_ASSERT(out0.is_same_size(outputs.front()));
float max_diff = (outputs.front() - out0).abs().max().toCDouble();
JIT_ASSERT(max_diff < 1e-6);
};
testOne(0,0,0,0);
testOne(0,1,0,0);
testOne(1,2,0,0);
testOne(0,2,0,0);
testOne(0,0,0,1);
testOne(0,1,1,2);
testOne(1,2,0,2);
auto testConcat = [&](int dim) {
Graph graph;
Var i0 = Var::Input(graph);
Var i1 = Var::Input(graph);
auto o0 = i0 * i1;
o0.addAsOutput();
Var::cat({i0, o0}, dim).addAsOutput();
auto a = at::CUDA(at::kFloat).rand({3,4,5});
auto b = at::CUDA(at::kFloat).rand({4,3,5}).transpose(0,1);
auto o = at::CUDA(at::kFloat).zeros({3,4,5});
auto o_r = a*b;
auto o2_r = at::cat({a, o_r}, dim);
auto o2 = at::CUDA(at::kFloat).zeros(o2_r.sizes());
comp.debugLaunchGraph(graph, 0, {a,b}, {o, o2});
float max_diff = (o_r - o).abs().max().toCDouble();
JIT_ASSERT(max_diff == 0);
float max_diff2 = (o2_r - o2).abs().max().toCDouble();
JIT_ASSERT(max_diff2 == 0);
};
testConcat(0);
testConcat(1);
testConcat(2);
}
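// Minimal Attributes subclass used to exercise the attribute API in isolation.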
struct Attr : public Attributes<Attr> {
};
void attributesTest() {
auto one = kParam;
auto two = kReturn;
auto three = kConstant;
auto four = kSlice;
Attr attr;
attr.f_(one,3.4)->i_(two,5)->s_(three,"what");
JIT_ASSERT(attr.f(one) == 3.4);
JIT_ASSERT(attr.s(three) == "what");
JIT_ASSERT(attr.i(two) == 5);
attr.s_(one,"no");
JIT_ASSERT(attr.s(one) == "no");
JIT_ASSERT(attr.hasAttribute(three));
JIT_ASSERT(!attr.hasAttribute(four));
attr.ss_(two, {"hi", "now"});
JIT_ASSERT(attr.ss(two).at(1) == "now");
Attr attr2;
attr2.copyAttributes(attr);
JIT_ASSERT(attr2.s(one) == "no");
attr2.f_(one,5);
JIT_ASSERT(attr.s(one) == "no");
JIT_ASSERT(attr2.f(one) == 5);
}
void internedStringsTests() {
JIT_ASSERT(kParam == Symbol("Param"));
JIT_ASSERT(kReturn == Symbol("Return"));
JIT_ASSERT(Symbol(kReturn).toString() == std::string("Return"));
size_t symstart = Symbol("__NEW_SYMBOL");
JIT_ASSERT(Symbol("What") == symstart+1);
JIT_ASSERT(Symbol("What2") == symstart+2);
JIT_ASSERT(Symbol("What") == symstart+1);
JIT_ASSERT(Symbol("What2") == symstart+2);
JIT_ASSERT(Symbol(symstart+2).toString() == std::string("What2"));
}
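// t_use/t_def control where the transpose of the LSTM weights happens: the
// weights below are defined through t_def (stored as transposed,
// non-contiguous views) and consumed through t_use (a pass-through), so mm()
// operates on transposed arguments; swapping the two bodies would move the
// transpose to the use site instead.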
at::Tensor t_use(at::Tensor x) {
return x;
}
at::Tensor t_def(at::Tensor x) {
return x.t();
}
// Given the difference between the output and expected tensors, check whether
// the difference is within a relative tolerance range. This is a standard way
// of matching tensor values up to a certain precision.
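// For example, if the largest input magnitude is 100, the maximum elementwise
// difference must stay below 2e-6 * 100 = 2e-4.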
bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor>& inputs) {
double maxValue = 0.0;
for (auto& tensor : inputs) {
maxValue = fmax(tensor.abs().max().toCFloat(), maxValue);
}
return diff.abs().max().toCFloat() < 2e-6 * maxValue;
}
bool almostEqual(const at::Tensor & a, const at::Tensor & b) {
return checkRtol(a - b,{a, b});
}
bool exactlyEqual(const at::Tensor & a, const at::Tensor & b) {
return (a - b).abs().max().toCFloat() == 0.f;
}
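// Eager reference for a single LSTM cell, matching the graph built below:
//   i, f, g, o = chunk(x.mm(W_ih) + h.mm(W_hh), 4)
//   c' = sigmoid(f) * c + sigmoid(i) * tanh(g)
//   h' = sigmoid(o) * tanh(c')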
std::pair<at::Tensor, at::Tensor>
lstm(at::Tensor input,
at::Tensor hx,
at::Tensor cx,
at::Tensor w_ih,
at::Tensor w_hh) {
auto gates = input.mm(t_use(w_ih)) + hx.mm(t_use(w_hh));
auto chunked_gates = gates.chunk(4, 1);
auto ingate = chunked_gates[0];
auto forgetgate = chunked_gates[1];
auto cellgate = chunked_gates[2];
auto outgate = chunked_gates[3];
ingate = ingate.sigmoid();
outgate = outgate.sigmoid();
cellgate = cellgate.tanh();
forgetgate = forgetgate.sigmoid();
auto cy = (forgetgate * cx) + (ingate * cellgate);
auto hy = outgate * cy.tanh();
return {hy, cy};
}
std::tuple<Var, Var> build_lstm_body(
Graph & g,
Var input,
Var hx,
Var cx,
Var w_ih,
Var w_hh) {
auto gates = input.mm(w_ih) + hx.mm(w_hh);
auto outputs = gates.chunk(4, 1);
auto ingate = outputs[0];
auto forgetgate = outputs[1];
auto cellgate = outputs[2];
auto outgate = outputs[3];
ingate = ingate.sigmoid();
outgate = outgate.sigmoid();
cellgate = cellgate.tanh();
forgetgate = forgetgate.sigmoid();
auto cy = forgetgate*cx + ingate*cellgate;
auto hy = outgate*cy.tanh();
return std::make_tuple(hy,cy);
}
std::shared_ptr<Graph> build_lstm() {
auto r = std::make_shared<Graph>();
auto & g = *r;
Value * input = g.addInput();
Value * hx = g.addInput();
Value * cx = g.addInput();
Value * w_ih = g.addInput();
Value * w_hh = g.addInput();
Var hy;
Var cy;
std::tie(hy,cy) = build_lstm_body(g, input, hx, cx, w_ih, w_hh);
hy.addAsOutput();
cy.addAsOutput();
g.lint();
return r;
}
std::shared_ptr<Graph> build_lstm_stages() {
auto r = std::make_shared<Graph>();
auto & g = *r;
Var input = g.addInput();
Var hx = g.addInput();
Var cx = g.addInput();
Var w_ih = g.addInput();
Var w_hh = g.addInput();
Var hy;
Var cy;
std::tie(hy,cy) = build_lstm_body(g, input, hx, cx, w_ih, w_hh);
// reuse the hidden state from the previous stage as well
// as a new cell-state input
g.advanceStage();
hx = hy;
cy.addAsOutput();
cx = g.addInput();
std::tie(hy,cy) = build_lstm_body(g, input, hx, cx, w_ih, w_hh);
hy.addAsOutput();
cy.addAsOutput();
g.lint();
return r;
}
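// Runs the single-stage LSTM graph through the interpreter and checks the
// results bitwise against the eager lstm() reference above.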
void interpTest() {
constexpr int batch_size = 4;
constexpr int input_size = 256;
constexpr int seq_len = 32;
int hidden_size = 2*input_size;
auto input = at::CUDA(at::kFloat).randn({seq_len, batch_size, input_size});
auto hx = at::CUDA(at::kFloat).randn({batch_size, hidden_size});
auto cx = at::CUDA(at::kFloat).randn({batch_size, hidden_size});
auto w_ih = t_def(at::CUDA(at::kFloat).randn({4 * hidden_size, input_size}));
auto w_hh = t_def(at::CUDA(at::kFloat).randn({4 * hidden_size, hidden_size}));
auto lstm_g = build_lstm();
Code lstm_function(lstm_g);
std::vector<at::Tensor> outputs;
InterpreterState lstm_interp(lstm_function);
lstm_interp.runOneStage({input[0], hx, cx, w_ih, w_hh}, outputs);
std::tie(hx, cx) = lstm(input[0], hx, cx, w_ih, w_hh);
//std::cout << almostEqual(outputs[0],hx) << "\n";
JIT_ASSERT(exactlyEqual(outputs[0],hx));
JIT_ASSERT(exactlyEqual(outputs[1],cx));
}
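// Two-stage variant: the first runOneStage call consumes the full input set
// and yields the first cell state; the second takes only the fresh cell state
// cx1, reuses the carried hidden state, and yields the final (hy, cy), which
// are compared against two chained eager lstm() calls.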
void interpStageTest() {
constexpr int batch_size = 4;
constexpr int input_size = 256;
constexpr int seq_len = 32;
int hidden_size = 2*input_size;
auto input = at::CUDA(at::kFloat).randn({seq_len, batch_size, input_size});
auto hx = at::CUDA(at::kFloat).randn({batch_size, hidden_size});
auto cx = at::CUDA(at::kFloat).randn({batch_size, hidden_size});
auto cx1 = at::CUDA(at::kFloat).randn({batch_size, hidden_size});
auto w_ih = t_def(at::CUDA(at::kFloat).randn({4 * hidden_size, input_size}));
auto w_hh = t_def(at::CUDA(at::kFloat).randn({4 * hidden_size, hidden_size}));
auto lstm_g = build_lstm_stages();
Code lstm_function(lstm_g);
std::vector<at::Tensor> outputs;
InterpreterState lstm_interp(lstm_function);
lstm_interp.runOneStage({input[0], hx, cx, w_ih, w_hh}, outputs);
auto cy0 = outputs[0];
lstm_interp.runOneStage({cx1}, outputs);
at::Tensor ihx = outputs[0];
at::Tensor icx = outputs[1];
std::tie(hx, cx) = lstm(input[0], hx, cx, w_ih, w_hh);
std::tie(hx, cx) = lstm(input[0], hx, cx1, w_ih, w_hh);
//std::cout << almostEqual(outputs[0],hx) << "\n";
JIT_ASSERT(exactlyEqual(outputs[0],hx));
JIT_ASSERT(exactlyEqual(outputs[1],cx));
}
void runJITCPPTests() {
interpTest();
interpStageTest();
codeTemplateTest();
fusionTests();
attributesTest();
internedStringsTests();
}
}}