#ifndef TENSORRT_BUFFERS_H
#define TENSORRT_BUFFERS_H
#include "/opt/MagAOX/vendor/TensorRT-10.0.0.6/include/NvInfer.h"

#include <cuda_runtime_api.h>

#include <cassert>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <memory>
#include <new>
#include <numeric>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <vector>
//! Report any CUDA runtime error returned by `status`.
#define CHECK(status)                                          \
    do                                                         \
    {                                                          \
        auto ret = (status);                                   \
        if (ret != 0)                                          \
            std::cerr << "Cuda failure: " << ret << std::endl; \
    } while (0)
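// Example use, wrapping any CUDA runtime call whose status should be checked
// (`stream` is a hypothetical cudaStream_t):
//     CHECK(cudaStreamSynchronize(stream));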
template <typename A, typename B>
A divUp(A x, B n)
{
    return (x + n - 1) / n;
}
//! Return m rounded up to nearest multiple of n.
template <typename T1, typename T2>
T1 roundUp(T1 m, T2 n)
{
    static_assert(std::is_integral<T1>::value && std::is_integral<T2>::value, "arguments must be integers");
    static_assert(std::is_signed<T1>::value == std::is_signed<T2>::value, "mixed signedness not allowed");
    static_assert(sizeof(T1) >= sizeof(T2), "first type must be at least as wide as second type");
    return ((m + n - 1) / n) * n;
}
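// Worked example (hypothetical values): divUp(10, 4) == 3, since 10 elements
// fill 3 groups of 4; roundUp(10, 4) == 12, the next multiple of 4.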
inline int64_t volume(nvinfer1::Dims const& d)
{
    return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>{});
}
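// Example: a tensor with dims [1, 3, 8] has volume(d) == 24 elements.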
//! Return the element size in bytes of a TensorRT data type.
inline uint32_t getElementSize(nvinfer1::DataType t) noexcept
{
    switch (t)
    {
    case nvinfer1::DataType::kINT64: return 8;
    case nvinfer1::DataType::kINT32:
    case nvinfer1::DataType::kFLOAT: return 4;
    case nvinfer1::DataType::kBF16:
    case nvinfer1::DataType::kHALF: return 2;
    case nvinfer1::DataType::kBOOL:
    case nvinfer1::DataType::kUINT8:
    case nvinfer1::DataType::kINT8:
    case nvinfer1::DataType::kFP8: return 1;
    case nvinfer1::DataType::kINT4: return 0; // sub-byte type: no whole-byte element size
    }
    return 0;
}
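// Example: getElementSize(nvinfer1::DataType::kHALF) == 2. kINT4 values are
// sub-byte, so their element size reports 0 and byte counts need special care.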
//! The GenericBuffer class is a templated class for buffers; the AllocFunc and
//! FreeFunc functors select where the memory lives (device or host). Members
//! are summarized in the member reference at the end of this file.
template <typename AllocFunc, typename FreeFunc>
class GenericBuffer
{
public:
    //! Construct an empty buffer.
    GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT) : mType(type) {}

    //! Construct a buffer holding `size` elements of `type`; throw if the allocation fails.
    GenericBuffer(size_t size, nvinfer1::DataType type) : mSize(size), mCapacity(size), mType(type)
    {
        if (!allocFn(&mBuffer, this->nbBytes()))
            throw std::bad_alloc();
    }

    //! Move construction transfers ownership and resets the source buffer.
    GenericBuffer(GenericBuffer&& buf) : mSize(buf.mSize), mCapacity(buf.mCapacity), mType(buf.mType), mBuffer(buf.mBuffer)
    {
        buf.mSize = 0;
        buf.mCapacity = 0;
        buf.mType = nvinfer1::DataType::kFLOAT;
        buf.mBuffer = nullptr;
    }

    //! Move assignment frees this buffer, then adopts buf's storage.
    GenericBuffer& operator=(GenericBuffer&& buf)
    {
        if (this != &buf)
        {
            freeFn(mBuffer);
            mSize = buf.mSize;
            mCapacity = buf.mCapacity;
            mType = buf.mType;
            mBuffer = buf.mBuffer;
            buf.mSize = 0;
            buf.mCapacity = 0;
            buf.mBuffer = nullptr;
        }
        return *this;
    }

    ~GenericBuffer() { freeFn(mBuffer); }

    void* data() { return mBuffer; }
    void const* data() const { return mBuffer; }
    size_t size() const { return mSize; }
    size_t nbBytes() const { return mSize * getElementSize(mType); }
    //! resize(size_t) reallocates when growing past mCapacity and throws
    //! std::bad_alloc{} if that allocation fails; resize(Dims) forwards the
    //! volume of dims.

private:
    size_t mSize{0};
    size_t mCapacity{0};
    nvinfer1::DataType mType{nvinfer1::DataType::kFLOAT};
    void* mBuffer{nullptr};
    AllocFunc allocFn;
    FreeFunc freeFn;
};

//! Allocation functors: device memory comes from cudaMalloc, host memory from malloc.
class DeviceAllocator
{
public:
    bool operator()(void** ptr, size_t size) const { return cudaMalloc(ptr, size) == cudaSuccess; }
};

class HostAllocator
{
public:
    bool operator()(void** ptr, size_t size) const { *ptr = malloc(size); return *ptr != nullptr; }
};
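//! The member reference below also names the matching free functors and the
//! buffer aliases. A minimal sketch consistent with it (the cudaFree/free
//! pairing is an assumption):
class DeviceFree
{
public:
    void operator()(void* ptr) const { cudaFree(ptr); }
};

class HostFree
{
public:
    void operator()(void* ptr) const { free(ptr); }
};

using DeviceBuffer = GenericBuffer<DeviceAllocator, DeviceFree>;
using HostBuffer = GenericBuffer<HostAllocator, HostFree>;

//! The ManagedBuffer class groups together a pair of corresponding device and
//! host buffers (per the member reference below).
class ManagedBuffer
{
public:
    DeviceBuffer deviceBuffer;
    HostBuffer hostBuffer;
};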
//! The BufferManager class handles host and device buffer allocation and
//! deallocation (see the member reference at the end of this file).
class BufferManager
{
public:
    static const size_t kINVALID_SIZE_VALUE = ~size_t(0);

    //! Create a BufferManager for handling buffer interactions with engine,
    //! when the I/O tensor volumes are provided.
    BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine, std::vector<int64_t> const& volumes, int32_t batchSize = 0)
        : mEngine(engine), mBatchSize(batchSize)
    {
        // Create a paired host/device buffer for every I/O tensor.
        for (int32_t i = 0; i < mEngine->getNbIOTensors(); i++)
        {
            auto const name = engine->getIOTensorName(i);
            mNames[name] = i;
            nvinfer1::DataType type = mEngine->getTensorDataType(name);
            std::unique_ptr<ManagedBuffer> manBuf{new ManagedBuffer()};
            manBuf->deviceBuffer = DeviceBuffer(volumes[i], type);
            manBuf->hostBuffer = HostBuffer(volumes[i], type);
            void* deviceBuffer = manBuf->deviceBuffer.data();
            mDeviceBindings.emplace_back(deviceBuffer);
            mManagedBuffers.emplace_back(std::move(manBuf));
        }
    }
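    // Hypothetical example, assuming an engine with one input of volume
    // 3*224*224 and one output of volume 1000:
    //     BufferManager buffers(engine, {3 * 224 * 224, 1000});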
    //! Create a BufferManager for handling buffer interactions with engine.
    BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine, int32_t const batchSize = 0,
        nvinfer1::IExecutionContext const* context = nullptr)
        : mEngine(engine), mBatchSize(batchSize)
    {
        for (int32_t i = 0, e = mEngine->getNbIOTensors(); i < e; i++)
        {
            auto const name = engine->getIOTensorName(i);
            mNames[name] = i;
            // Prefer the context's shape, which reflects any runtime dimensions.
            auto dims = context ? context->getTensorShape(name) : mEngine->getTensorShape(name);
            size_t vol = context || !mBatchSize ? 1 : static_cast<size_t>(mBatchSize);
            nvinfer1::DataType type = mEngine->getTensorDataType(name);
            int32_t vecDim = mEngine->getTensorVectorizedDim(name);
            if (-1 != vecDim) // the tensor uses a vectorized format
            {
                int32_t scalarsPerVec = mEngine->getTensorComponentsPerElement(name);
                // Round the vectorized dimension up to a whole number of vectors.
                dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec);
                vol *= scalarsPerVec;
            }
            vol *= volume(dims);
            std::unique_ptr<ManagedBuffer> manBuf{new ManagedBuffer()};
            manBuf->deviceBuffer = DeviceBuffer(vol, type);
            manBuf->hostBuffer = HostBuffer(vol, type);
            void* deviceBuffer = manBuf->deviceBuffer.data();
            mDeviceBindings.emplace_back(deviceBuffer);
            mManagedBuffers.emplace_back(std::move(manBuf));
        }
    }
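    // Worked example with hypothetical values: for dims [1, 10, 4, 4],
    // vecDim == 1, and scalarsPerVec == 4, dims.d[1] becomes divUp(10, 4) == 3,
    // so vol == 4 * (1 * 3 * 4 * 4) == 192 elements rather than the unpadded
    // 160: the channel dimension is padded to whole vectors.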
    //! Returns the size of the host and device buffers that correspond to
    //! tensorName, or kINVALID_SIZE_VALUE if no such tensor can be found.
    size_t size(std::string const& tensorName) const
    {
        auto record = mNames.find(tensorName);
        if (record == mNames.end())
            return kINVALID_SIZE_VALUE;
        return mManagedBuffers[record->second]->hostBuffer.nbBytes();
    }
    //! Templated print function that dumps buffers of arbitrary type to a
    //! std::ostream; rowCount controls the number of elements per line.
    template <typename T>
    void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount)
    {
        assert(rowCount != 0);
        assert(bufSize % sizeof(T) == 0);
        T* typedBuf = static_cast<T*>(buf);
        size_t numItems = bufSize / sizeof(T);
        for (int32_t i = 0; i < static_cast<int>(numItems); i++)
        {
            // rowCount == 1: one element per line, no trailing newline on the last.
            if (rowCount == 1 && i != static_cast<int>(numItems) - 1)
                os << typedBuf[i] << std::endl;
            else if (rowCount == 1)
                os << typedBuf[i];
            // rowCount > 1: separate elements with spaces, break after each row.
            else if (i % rowCount == 0)
                os << typedBuf[i];
            else if (i % rowCount == rowCount - 1)
                os << " " << typedBuf[i] << std::endl;
            else
                os << " " << typedBuf[i];
        }
    }
    //! Returns the host or device buffer corresponding to tensorName, or
    //! nullptr if no such tensor can be found.
    void* getBuffer(bool const isHost, std::string const& tensorName) const
    {
        auto record = mNames.find(tensorName);
        if (record == mNames.end())
            return nullptr;
        return (isHost ? mManagedBuffers[record->second]->hostBuffer.data()
                       : mManagedBuffers[record->second]->deviceBuffer.data());
    }
    //! Returns true if the named tensor is an input tensor.
    bool tensorIsInput(const std::string& tensorName) const
    {
        return mEngine->getTensorIOMode(tensorName.c_str()) == nvinfer1::TensorIOMode::kINPUT;
    }
    //! Copy buffers between host and device. copyInput selects input versus
    //! output tensors, deviceToHost selects the direction, and async selects
    //! cudaMemcpyAsync on stream versus a blocking cudaMemcpy.
    void memcpyBuffers(bool const copyInput, bool const deviceToHost, bool const async, cudaStream_t const& stream = 0)
    {
        for (auto const& n : mNames)
        {
            void* dstPtr = deviceToHost ? mManagedBuffers[n.second]->hostBuffer.data()
                                        : mManagedBuffers[n.second]->deviceBuffer.data();
            void const* srcPtr = deviceToHost ? mManagedBuffers[n.second]->deviceBuffer.data()
                                              : mManagedBuffers[n.second]->hostBuffer.data();
            size_t const byteSize = mManagedBuffers[n.second]->hostBuffer.nbBytes();
            const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice;
            if ((copyInput && tensorIsInput(n.first)) || (!copyInput && !tensorIsInput(n.first)))
            {
                if (async)
                    CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream));
                else
                    CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType));
            }
        }
    }
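    //! The bindings accessor and copy helpers documented in the member
    //! reference below are thin wrappers; a minimal sketch consistent with
    //! those docs:
    std::vector<void*>& getDeviceBindings() { return mDeviceBindings; }
    std::vector<void*> const& getDeviceBindings() const { return mDeviceBindings; }
    void* getHostBuffer(std::string const& tensorName) const { return getBuffer(true, tensorName); }
    void* getDeviceBuffer(std::string const& tensorName) const { return getBuffer(false, tensorName); }
    void copyInputToDevice() { memcpyBuffers(true, false, false); }
    void copyOutputToHost() { memcpyBuffers(false, true, false); }
    void copyInputToDeviceAsync(cudaStream_t const& stream = 0) { memcpyBuffers(true, false, true, stream); }
    void copyOutputToHostAsync(cudaStream_t const& stream = 0) { memcpyBuffers(false, true, true, stream); }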
private:
    std::shared_ptr<nvinfer1::ICudaEngine> mEngine;               //!< The pointer to the engine.
    int mBatchSize{0};                                            //!< The batch size for legacy networks, 0 otherwise.
    std::vector<std::unique_ptr<ManagedBuffer>> mManagedBuffers;  //!< The vector of pointers to managed buffers.
    std::vector<void*> mDeviceBindings;                           //!< The vector of device buffers needed for engine execution.
    std::unordered_map<std::string, int32_t> mNames;              //!< The map of tensor name and index pairs.
};
// Member reference (consolidated from the generated documentation):
//
// Free helpers and aliases:
//   int64_t volume(nvinfer1::Dims const& d);
//   uint32_t getElementSize(nvinfer1::DataType t) noexcept;
//   A divUp(A x, B n);
//   T1 roundUp(T1 m, T2 n);                  // return m rounded up to nearest multiple of n
//   using HostBuffer = GenericBuffer<HostAllocator, HostFree>;
//   using DeviceBuffer = GenericBuffer<DeviceAllocator, DeviceFree>;
//
// GenericBuffer -- the templated class for buffers:
//   GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT);  // construct an empty buffer
//   GenericBuffer(size_t size, nvinfer1::DataType type);  // construct a buffer with the specified allocation size, in elements
//   GenericBuffer(GenericBuffer&& buf);
//   GenericBuffer& operator=(GenericBuffer&& buf);
//   void* data();                    // returns pointer to underlying array
//   const void* data() const;        // returns pointer to underlying array
//   size_t size() const;             // returns the size (in number of elements) of the buffer
//   size_t nbBytes() const;          // returns the size (in bytes) of the buffer
//   void resize(size_t newSize);     // no-op if the new size is smaller than or equal to the current capacity
//   void resize(const nvinfer1::Dims& dims);  // overload of resize that accepts Dims
//
// Allocator and deleter functors (DeviceAllocator/DeviceFree, HostAllocator/HostFree):
//   bool operator()(void** ptr, size_t size) const;  // allocate; true on success
//   void operator()(void* ptr) const;                // free
//
// ManagedBuffer -- groups together a pair of corresponding device and host buffers:
//   DeviceBuffer deviceBuffer;
//   HostBuffer hostBuffer;
//
// BufferManager -- handles host and device buffer allocation and deallocation:
//   static const size_t kINVALID_SIZE_VALUE;
//   BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine, std::vector<int64_t> const& volumes,
//       int32_t batchSize = 0);      // create a BufferManager when the I/O tensor volumes are provided
//   BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine, int32_t const batchSize = 0,
//       nvinfer1::IExecutionContext const* context = nullptr);  // create a BufferManager for the engine
//   std::vector<void*>& getDeviceBindings();   // device buffers usable directly as bindings for the
//                                              // execute and enqueue methods of IExecutionContext
//   std::vector<void*> const& getDeviceBindings() const;  // returns a vector of device buffers
//   void* getHostBuffer(std::string const& tensorName) const;    // host buffer for tensorName, or nullptr
//   void* getDeviceBuffer(std::string const& tensorName) const;  // device buffer for tensorName, or nullptr
//   void* getBuffer(bool const isHost, std::string const& tensorName) const;
//   bool tensorIsInput(const std::string& tensorName) const;
//   size_t size(std::string const& tensorName) const;  // byte size of the buffers for tensorName, or
//                                                      // kINVALID_SIZE_VALUE if the tensor is not found
//   void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount);  // dump a typed buffer;
//                                                      // rowCount controls elements per line
//   void copyInputToDevice();   // copy input host buffers to input device buffers synchronously
//   void copyOutputToHost();    // copy output device buffers to output host buffers synchronously
//   void copyInputToDeviceAsync(cudaStream_t const& stream = 0);  // asynchronous variant
//   void copyOutputToHostAsync(cudaStream_t const& stream = 0);   // asynchronous variant
//   void memcpyBuffers(bool const copyInput, bool const deviceToHost, bool const async,
//       cudaStream_t const& stream = 0);
//   std::shared_ptr<nvinfer1::ICudaEngine> mEngine;   // the pointer to the engine
//   int mBatchSize;                                   // the batch size for legacy networks, 0 otherwise
//   std::vector<std::unique_ptr<ManagedBuffer>> mManagedBuffers;  // the vector of pointers to managed buffers
//   std::vector<void*> mDeviceBindings;               // the vector of device buffers needed for engine execution
//   std::unordered_map<std::string, int32_t> mNames;  // the map of tensor name and index pairs
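// A minimal end-to-end sketch of typical use. `engine`, `context`, and the
// tensor names are hypothetical; executeV2 is IExecutionContext's synchronous
// execution call.
//     BufferManager buffers(engine);
//     // ... fill buffers.getHostBuffer("input") with input data ...
//     buffers.copyInputToDevice();
//     context->executeV2(buffers.getDeviceBindings().data());
//     buffers.copyOutputToHost();
//     float const* out = static_cast<float const*>(buffers.getHostBuffer("output"));

#endif // TENSORRT_BUFFERS_H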