|  | // Note(jiayq): the import_array function is done inside | 
|  | // caffe2_python.cc. Read | 
|  | // http://docs.scipy.org/doc/numpy-1.10.1/reference/c-api.array.html#miscellaneous | 
|  | // for more details. | 
|  |  | 
|  | #define NO_IMPORT_ARRAY | 
|  |  | 
|  | #include "pybind_state.h" | 
|  |  | 
|  | #include <pybind11/pybind11.h> | 
|  | #include <pybind11/stl.h> | 
|  |  | 
|  | #ifdef CAFFE2_USE_CUDNN | 
|  | #include "caffe2/core/common_cudnn.h" | 
|  | #endif // CAFFE2_USE_CUDNN | 
|  | #include <c10/cuda/CUDAGuard.h> | 
|  | #include "caffe2/core/context_gpu.h" | 
|  | #include "caffe2/operators/operator_fallback_gpu.h" | 
|  | #include "caffe2/python/pybind_state_registry.h" | 
|  |  | 
|  | #ifdef CAFFE2_USE_TRT | 
|  | #include "caffe2/contrib/tensorrt/tensorrt_tranformer.h" | 
|  | #endif // CAFFE2_USE_TRT | 
|  |  | 
|  | namespace caffe2 { | 
|  | namespace python { | 
|  |  | 
// Route the Python-implemented ops to GPUFallbackOp (declared in
// operator_fallback_gpu.h), which runs the CPU implementation with
// device<->host copies around it, so Python ops work in CUDA nets.
REGISTER_CUDA_OPERATOR(Python, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(PythonGradient, GPUFallbackOp);

REGISTER_CUDA_OPERATOR(PythonDLPack, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(PythonDLPackGradient, GPUFallbackOp);

// Let Python-side workspace.FeedBlob target CUDA devices.
REGISTER_BLOB_FEEDER(CUDA, TensorFeeder<CUDAContext>);

namespace py = pybind11;
|  |  | 
|  | void addCUDAGlobalMethods(py::module& m) { | 
|  | m.def("num_cuda_devices", &NumCudaDevices); | 
|  | m.def("get_cuda_version", &CudaVersion); | 
|  | #ifdef CAFFE2_USE_CUDNN | 
|  | m.def("get_cudnn_version", &cudnnCompiledVersion); | 
|  | m.attr("cudnn_convolution_fwd_algo_count") = | 
|  | py::int_((int)CUDNN_CONVOLUTION_FWD_ALGO_COUNT); | 
|  | m.attr("cudnn_convolution_bwd_data_algo_count") = | 
|  | py::int_((int)CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT); | 
|  | m.attr("cudnn_convolution_bwd_filter_algo_count") = | 
|  | py::int_((int)CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT); | 
|  | #else | 
|  | m.def("get_cudnn_version", []() { return static_cast<size_t>(0); }); | 
|  | m.attr("cudnn_convolution_fwd_algo_count") = py::int_(0); | 
|  | m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_(0); | 
|  | m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_(0); | 
|  | #endif | 
|  | m.def("get_gpu_memory_info", [](int device_id) { | 
|  | CUDAGuard guard(device_id); | 
|  | size_t device_free, device_total; | 
|  | CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); | 
|  | return std::pair<size_t, size_t>{device_free, device_total}; | 
|  | }); | 
|  | m.def("get_cuda_peer_access_pattern", []() { | 
|  | std::vector<std::vector<bool>> pattern; | 
|  | CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern)); | 
|  | return pattern; | 
|  | }); | 
|  | m.def("get_device_properties", [](int deviceid) { | 
|  | auto& prop = GetDeviceProperty(deviceid); | 
|  | std::map<std::string, py::object> obj; | 
|  | obj["name"] = py::cast(prop.name); | 
|  | obj["major"] = py::cast(prop.major); | 
|  | obj["minor"] = py::cast(prop.minor); | 
|  | obj["totalGlobalMem"] = py::cast(prop.totalGlobalMem); | 
|  | return obj; | 
|  | }); | 
|  | m.def( | 
|  | "onnx_to_trt_op", | 
|  | [](const py::bytes& onnx_model_str, | 
|  | const std::unordered_map<std::string, std::vector<int>>& | 
|  | output_size_hints, | 
|  | int max_batch_size, | 
|  | int max_workspace_size, | 
|  | int verbosity, | 
|  | bool debug_builder) -> py::bytes { | 
|  | #ifdef CAFFE2_USE_TRT | 
|  | TensorRTTransformer t( | 
|  | max_batch_size, max_workspace_size, verbosity, debug_builder); | 
|  | auto op_def = | 
|  | t.BuildTrtOp(onnx_model_str.cast<std::string>(), output_size_hints); | 
|  | std::string out; | 
|  | op_def.SerializeToString(&out); | 
|  | return py::bytes(out); | 
|  | #else | 
|  | CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1"); | 
|  | #endif // CAFFE2_USE_TRT | 
|  | }); | 
|  | m.def( | 
|  | "transform_trt", | 
|  | [](const py::bytes& pred_net_str, | 
|  | const std::unordered_map<std::string, std::vector<int>>& shapes, | 
|  | int max_batch_size, | 
|  | int max_workspace_size, | 
|  | int verbosity, | 
|  | bool debug_builder, | 
|  | bool build_serializable_op) -> py::bytes { | 
|  | #ifdef CAFFE2_USE_TRT | 
|  | caffe2::NetDef pred_net; | 
|  | if (!ParseProtoFromLargeString( | 
|  | pred_net_str.cast<std::string>(), &pred_net)) { | 
|  | LOG(ERROR) << "broken pred_net protobuf"; | 
|  | } | 
|  | std::unordered_map<std::string, TensorShape> tensor_shapes; | 
|  | for (const auto& it : shapes) { | 
|  | tensor_shapes.emplace( | 
|  | it.first, CreateTensorShape(it.second, TensorProto::FLOAT)); | 
|  | } | 
|  | TensorRTTransformer ts( | 
|  | max_batch_size, | 
|  | max_workspace_size, | 
|  | verbosity, | 
|  | debug_builder, | 
|  | build_serializable_op); | 
|  | ts.Transform(GetCurrentWorkspace(), &pred_net, tensor_shapes); | 
|  | std::string pred_net_str2; | 
|  | pred_net.SerializeToString(&pred_net_str2); | 
|  | return py::bytes(pred_net_str2); | 
|  | #else | 
|  | CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1"); | 
|  | #endif // CAFFE2_USE_TRT | 
|  | }); | 
|  | }; | 
|  |  | 
|  | void addCUDAObjectMethods(py::module& m) { | 
|  | py::class_<DLPackWrapper<CUDAContext>>(m, "DLPackTensorCUDA") | 
|  | .def_property_readonly( | 
|  | "data", | 
|  | [](DLPackWrapper<CUDAContext>* t) -> py::object { | 
|  | CAFFE_ENFORCE_EQ( | 
|  | t->device_option.device_type(), | 
|  | PROTO_CUDA, | 
|  | "Expected CUDA device option for CUDA tensor"); | 
|  |  | 
|  | return t->data(); | 
|  | }, | 
|  | "Return DLPack tensor with tensor's data.") | 
|  | .def( | 
|  | "feed", | 
|  | [](DLPackWrapper<CUDAContext>* t, py::object obj) { | 
|  | CAFFE_ENFORCE_EQ( | 
|  | t->device_option.device_type(), | 
|  | PROTO_CUDA, | 
|  | "Expected CUDA device option for CUDA tensor"); | 
|  | t->feed(obj); | 
|  | }, | 
|  | "Copy data from given DLPack tensor into this tensor.") | 
|  | .def_property_readonly( | 
|  | "_shape", | 
|  | [](const DLPackWrapper<CUDAContext>& t) { return t.tensor->sizes(); }) | 
|  | .def( | 
|  | "_reshape", | 
|  | [](DLPackWrapper<CUDAContext>* t, std::vector<int64_t> dims) { | 
|  | t->tensor->Resize(dims); | 
|  | }); | 
|  | } | 
|  |  | 
// Module entry point for the GPU flavor of the Caffe2 pybind11 extension.
// Installs the shared CPU bindings first, then the CUDA-specific ones, and
// finally any bindings contributed through PybindAdditionRegistry.
PYBIND11_MODULE(caffe2_pybind11_state_gpu, m) {
  m.doc() = "pybind11 stateful interface to Caffe2 workspaces - GPU edition";

  addGlobalMethods(m);
  addCUDAGlobalMethods(m);
  addObjectMethods(m);
  addCUDAObjectMethods(m);
  // Give externally registered additions (e.g. contrib modules) a chance to
  // attach their own bindings to this module.
  for (const auto& addition : PybindAdditionRegistry()->Keys()) {
    PybindAdditionRegistry()->Create(addition, m);
  }
}
|  | } // namespace python | 
|  | } // namespace caffe2 |