#ifndef TENSORRT_BUFFERS_H
#define TENSORRT_BUFFERS_H
#include "/opt/MagAOX/vendor/TensorRT-10.0.0.6/include/NvInfer.h"

#include <cuda_runtime_api.h>

#include <cassert>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <memory>
#include <new>
#include <numeric>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <vector>
//! Report any CUDA runtime error returned by `status`.
#define CHECK(status)                                          \
    do                                                         \
    {                                                          \
        auto ret = (status);                                   \
        if (ret != 0)                                          \
            std::cerr << "Cuda failure: " << ret << std::endl; \
    } while (0)
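// Example use, wrapping any CUDA runtime call whose status should be checked
// (`stream` is a hypothetical cudaStream_t):
//     CHECK(cudaStreamSynchronize(stream));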
template <typename A, typename B>
A divUp(A x, B n)
{
    return (x + n - 1) / n;
}
//! Return m rounded up to nearest multiple of n.
template <typename T1, typename T2>
T1 roundUp(T1 m, T2 n)
{
    static_assert(std::is_integral<T1>::value && std::is_integral<T2>::value, "arguments must be integers");
    static_assert(std::is_signed<T1>::value == std::is_signed<T2>::value, "mixed signedness not allowed");
    static_assert(sizeof(T1) >= sizeof(T2), "first type must be at least as wide as second type");
    return ((m + n - 1) / n) * n;
}
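// Worked example (hypothetical values): divUp(10, 4) == 3, since 10 elements
// fill 3 groups of 4; roundUp(10, 4) == 12, the next multiple of 4.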
inline int64_t volume(nvinfer1::Dims const& d)
{
    return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>{});
}
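// Example: a tensor with dims [1, 3, 8] has volume(d) == 24 elements.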
//! Return the element size in bytes of a TensorRT data type.
inline uint32_t getElementSize(nvinfer1::DataType t) noexcept
{
    switch (t)
    {
    case nvinfer1::DataType::kINT64: return 8;
    case nvinfer1::DataType::kINT32:
    case nvinfer1::DataType::kFLOAT: return 4;
    case nvinfer1::DataType::kBF16:
    case nvinfer1::DataType::kHALF: return 2;
    case nvinfer1::DataType::kBOOL:
    case nvinfer1::DataType::kUINT8:
    case nvinfer1::DataType::kINT8:
    case nvinfer1::DataType::kFP8: return 1;
    case nvinfer1::DataType::kINT4: return 0; // sub-byte type: no whole-byte element size
    }
    return 0;
}
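// Example: getElementSize(nvinfer1::DataType::kHALF) == 2. kINT4 values are
// sub-byte, so their element size reports 0 and byte counts need special care.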
//! The GenericBuffer class is a templated class for buffers; the AllocFunc and
//! FreeFunc functors select where the memory lives (device or host). Members
//! are summarized in the member reference at the end of this file.
template <typename AllocFunc, typename FreeFunc>
class GenericBuffer
{
public:
    //! Construct an empty buffer.
    GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT) : mType(type) {}

    //! Construct a buffer holding `size` elements of `type`; throw if the allocation fails.
    GenericBuffer(size_t size, nvinfer1::DataType type) : mSize(size), mCapacity(size), mType(type)
    {
        if (!allocFn(&mBuffer, this->nbBytes()))
            throw std::bad_alloc();
    }

    //! Move construction transfers ownership and resets the source buffer.
    GenericBuffer(GenericBuffer&& buf) : mSize(buf.mSize), mCapacity(buf.mCapacity), mType(buf.mType), mBuffer(buf.mBuffer)
    {
        buf.mSize = 0;
        buf.mCapacity = 0;
        buf.mType = nvinfer1::DataType::kFLOAT;
        buf.mBuffer = nullptr;
    }

    //! Move assignment frees this buffer, then adopts buf's storage.
    GenericBuffer& operator=(GenericBuffer&& buf)
    {
        if (this != &buf)
        {
            freeFn(mBuffer);
            mSize = buf.mSize;
            mCapacity = buf.mCapacity;
            mType = buf.mType;
            mBuffer = buf.mBuffer;
            buf.mSize = 0;
            buf.mCapacity = 0;
            buf.mBuffer = nullptr;
        }
        return *this;
    }

    ~GenericBuffer() { freeFn(mBuffer); }

    void* data() { return mBuffer; }
    void const* data() const { return mBuffer; }
    size_t size() const { return mSize; }
    size_t nbBytes() const { return mSize * getElementSize(mType); }
    //! resize(size_t) reallocates when growing past mCapacity and throws
    //! std::bad_alloc{} if that allocation fails; resize(Dims) forwards the
    //! volume of dims.

private:
    size_t mSize{0};
    size_t mCapacity{0};
    nvinfer1::DataType mType{nvinfer1::DataType::kFLOAT};
    void* mBuffer{nullptr};
    AllocFunc allocFn;
    FreeFunc freeFn;
};

//! Allocation functors: device memory comes from cudaMalloc, host memory from malloc.
class DeviceAllocator
{
public:
    bool operator()(void** ptr, size_t size) const { return cudaMalloc(ptr, size) == cudaSuccess; }
};

class HostAllocator
{
public:
    bool operator()(void** ptr, size_t size) const { *ptr = malloc(size); return *ptr != nullptr; }
};
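//! The member reference below also names the matching free functors and the
//! buffer aliases. A minimal sketch consistent with it (the cudaFree/free
//! pairing is an assumption):
class DeviceFree
{
public:
    void operator()(void* ptr) const { cudaFree(ptr); }
};

class HostFree
{
public:
    void operator()(void* ptr) const { free(ptr); }
};

using DeviceBuffer = GenericBuffer<DeviceAllocator, DeviceFree>;
using HostBuffer = GenericBuffer<HostAllocator, HostFree>;

//! The ManagedBuffer class groups together a pair of corresponding device and
//! host buffers (per the member reference below).
class ManagedBuffer
{
public:
    DeviceBuffer deviceBuffer;
    HostBuffer hostBuffer;
};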
//! The BufferManager class handles host and device buffer allocation and
//! deallocation (see the member reference at the end of this file).
class BufferManager
{
public:
    static const size_t kINVALID_SIZE_VALUE = ~size_t(0);

    //! Create a BufferManager for handling buffer interactions with engine,
    //! when the I/O tensor volumes are provided.
    BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine, std::vector<int64_t> const& volumes, int32_t batchSize = 0)
        : mEngine(engine), mBatchSize(batchSize)
    {
        // Create a paired host/device buffer for every I/O tensor.
        for (int32_t i = 0; i < mEngine->getNbIOTensors(); i++)
        {
            auto const name = engine->getIOTensorName(i);
            mNames[name] = i;
            nvinfer1::DataType type = mEngine->getTensorDataType(name);
            std::unique_ptr<ManagedBuffer> manBuf{new ManagedBuffer()};
            manBuf->deviceBuffer = DeviceBuffer(volumes[i], type);
            manBuf->hostBuffer = HostBuffer(volumes[i], type);
            void* deviceBuffer = manBuf->deviceBuffer.data();
            mDeviceBindings.emplace_back(deviceBuffer);
            mManagedBuffers.emplace_back(std::move(manBuf));
        }
    }
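    // Hypothetical example, assuming an engine with one input of volume
    // 3*224*224 and one output of volume 1000:
    //     BufferManager buffers(engine, {3 * 224 * 224, 1000});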
    //! Create a BufferManager for handling buffer interactions with engine.
    BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine, int32_t const batchSize = 0,
        nvinfer1::IExecutionContext const* context = nullptr)
        : mEngine(engine), mBatchSize(batchSize)
    {
        for (int32_t i = 0, e = mEngine->getNbIOTensors(); i < e; i++)
        {
            auto const name = engine->getIOTensorName(i);
            mNames[name] = i;
            // Prefer the context's shape, which reflects any runtime dimensions.
            auto dims = context ? context->getTensorShape(name) : mEngine->getTensorShape(name);
            size_t vol = context || !mBatchSize ? 1 : static_cast<size_t>(mBatchSize);
            nvinfer1::DataType type = mEngine->getTensorDataType(name);
            int32_t vecDim = mEngine->getTensorVectorizedDim(name);
            if (-1 != vecDim) // the tensor uses a vectorized format
            {
                int32_t scalarsPerVec = mEngine->getTensorComponentsPerElement(name);
                // Round the vectorized dimension up to a whole number of vectors.
                dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec);
                vol *= scalarsPerVec;
            }
            vol *= volume(dims);
            std::unique_ptr<ManagedBuffer> manBuf{new ManagedBuffer()};
            manBuf->deviceBuffer = DeviceBuffer(vol, type);
            manBuf->hostBuffer = HostBuffer(vol, type);
            void* deviceBuffer = manBuf->deviceBuffer.data();
            mDeviceBindings.emplace_back(deviceBuffer);
            mManagedBuffers.emplace_back(std::move(manBuf));
        }
    }
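    // Worked example with hypothetical values: for dims [1, 10, 4, 4],
    // vecDim == 1, and scalarsPerVec == 4, dims.d[1] becomes divUp(10, 4) == 3,
    // so vol == 4 * (1 * 3 * 4 * 4) == 192 elements rather than the unpadded
    // 160: the channel dimension is padded to whole vectors.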
    //! Returns the size of the host and device buffers that correspond to
    //! tensorName, or kINVALID_SIZE_VALUE if no such tensor can be found.
    size_t size(std::string const& tensorName) const
    {
        auto record = mNames.find(tensorName);
        if (record == mNames.end())
            return kINVALID_SIZE_VALUE;
        return mManagedBuffers[record->second]->hostBuffer.nbBytes();
    }
    //! Templated print function that dumps buffers of arbitrary type to a
    //! std::ostream; rowCount controls the number of elements per line.
    template <typename T>
    void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount)
    {
        assert(rowCount != 0);
        assert(bufSize % sizeof(T) == 0);
        T* typedBuf = static_cast<T*>(buf);
        size_t numItems = bufSize / sizeof(T);
        for (int32_t i = 0; i < static_cast<int>(numItems); i++)
        {
            // rowCount == 1: one element per line, no trailing newline on the last.
            if (rowCount == 1 && i != static_cast<int>(numItems) - 1)
                os << typedBuf[i] << std::endl;
            else if (rowCount == 1)
                os << typedBuf[i];
            // rowCount > 1: separate elements with spaces, break after each row.
            else if (i % rowCount == 0)
                os << typedBuf[i];
            else if (i % rowCount == rowCount - 1)
                os << " " << typedBuf[i] << std::endl;
            else
                os << " " << typedBuf[i];
        }
    }
    //! Returns the host or device buffer corresponding to tensorName, or
    //! nullptr if no such tensor can be found.
    void* getBuffer(bool const isHost, std::string const& tensorName) const
    {
        auto record = mNames.find(tensorName);
        if (record == mNames.end())
            return nullptr;
        return (isHost ? mManagedBuffers[record->second]->hostBuffer.data()
                       : mManagedBuffers[record->second]->deviceBuffer.data());
    }
    //! Returns true if the named tensor is an input tensor.
    bool tensorIsInput(const std::string& tensorName) const
    {
        return mEngine->getTensorIOMode(tensorName.c_str()) == nvinfer1::TensorIOMode::kINPUT;
    }
    //! Copy buffers between host and device. copyInput selects input versus
    //! output tensors, deviceToHost selects the direction, and async selects
    //! cudaMemcpyAsync on stream versus a blocking cudaMemcpy.
    void memcpyBuffers(bool const copyInput, bool const deviceToHost, bool const async, cudaStream_t const& stream = 0)
    {
        for (auto const& n : mNames)
        {
            void* dstPtr = deviceToHost ? mManagedBuffers[n.second]->hostBuffer.data()
                                        : mManagedBuffers[n.second]->deviceBuffer.data();
            void const* srcPtr = deviceToHost ? mManagedBuffers[n.second]->deviceBuffer.data()
                                              : mManagedBuffers[n.second]->hostBuffer.data();
            size_t const byteSize = mManagedBuffers[n.second]->hostBuffer.nbBytes();
            const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice;
            if ((copyInput && tensorIsInput(n.first)) || (!copyInput && !tensorIsInput(n.first)))
            {
                if (async)
                    CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream));
                else
                    CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType));
            }
        }
    }
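    //! The bindings accessor and copy helpers documented in the member
    //! reference below are thin wrappers; a minimal sketch consistent with
    //! those docs:
    std::vector<void*>& getDeviceBindings() { return mDeviceBindings; }
    std::vector<void*> const& getDeviceBindings() const { return mDeviceBindings; }
    void* getHostBuffer(std::string const& tensorName) const { return getBuffer(true, tensorName); }
    void* getDeviceBuffer(std::string const& tensorName) const { return getBuffer(false, tensorName); }
    void copyInputToDevice() { memcpyBuffers(true, false, false); }
    void copyOutputToHost() { memcpyBuffers(false, true, false); }
    void copyInputToDeviceAsync(cudaStream_t const& stream = 0) { memcpyBuffers(true, false, true, stream); }
    void copyOutputToHostAsync(cudaStream_t const& stream = 0) { memcpyBuffers(false, true, true, stream); }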
private:
    std::shared_ptr<nvinfer1::ICudaEngine> mEngine;               //!< The pointer to the engine.
    int mBatchSize{0};                                            //!< The batch size for legacy networks, 0 otherwise.
    std::vector<std::unique_ptr<ManagedBuffer>> mManagedBuffers;  //!< The vector of pointers to managed buffers.
    std::vector<void*> mDeviceBindings;                           //!< The vector of device buffers needed for engine execution.
    std::unordered_map<std::string, int32_t> mNames;              //!< The map of tensor name and index pairs.
};
// Member reference (consolidated from the generated documentation):
//
// Free helpers and aliases:
//   int64_t volume(nvinfer1::Dims const& d);
//   uint32_t getElementSize(nvinfer1::DataType t) noexcept;
//   A divUp(A x, B n);
//   T1 roundUp(T1 m, T2 n);                  // return m rounded up to nearest multiple of n
//   using HostBuffer = GenericBuffer<HostAllocator, HostFree>;
//   using DeviceBuffer = GenericBuffer<DeviceAllocator, DeviceFree>;
//
// GenericBuffer -- the templated class for buffers:
//   GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT);  // construct an empty buffer
//   GenericBuffer(size_t size, nvinfer1::DataType type);  // construct a buffer with the specified allocation size, in elements
//   GenericBuffer(GenericBuffer&& buf);
//   GenericBuffer& operator=(GenericBuffer&& buf);
//   void* data();                    // returns pointer to underlying array
//   const void* data() const;        // returns pointer to underlying array
//   size_t size() const;             // returns the size (in number of elements) of the buffer
//   size_t nbBytes() const;          // returns the size (in bytes) of the buffer
//   void resize(size_t newSize);     // no-op if the new size is smaller than or equal to the current capacity
//   void resize(const nvinfer1::Dims& dims);  // overload of resize that accepts Dims
//
// Allocator and deleter functors (DeviceAllocator/DeviceFree, HostAllocator/HostFree):
//   bool operator()(void** ptr, size_t size) const;  // allocate; true on success
//   void operator()(void* ptr) const;                // free
//
// ManagedBuffer -- groups together a pair of corresponding device and host buffers:
//   DeviceBuffer deviceBuffer;
//   HostBuffer hostBuffer;
//
// BufferManager -- handles host and device buffer allocation and deallocation:
//   static const size_t kINVALID_SIZE_VALUE;
//   BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine, std::vector<int64_t> const& volumes,
//       int32_t batchSize = 0);      // create a BufferManager when the I/O tensor volumes are provided
//   BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine, int32_t const batchSize = 0,
//       nvinfer1::IExecutionContext const* context = nullptr);  // create a BufferManager for the engine
//   std::vector<void*>& getDeviceBindings();   // device buffers usable directly as bindings for the
//                                              // execute and enqueue methods of IExecutionContext
//   std::vector<void*> const& getDeviceBindings() const;  // returns a vector of device buffers
//   void* getHostBuffer(std::string const& tensorName) const;    // host buffer for tensorName, or nullptr
//   void* getDeviceBuffer(std::string const& tensorName) const;  // device buffer for tensorName, or nullptr
//   void* getBuffer(bool const isHost, std::string const& tensorName) const;
//   bool tensorIsInput(const std::string& tensorName) const;
//   size_t size(std::string const& tensorName) const;  // byte size of the buffers for tensorName, or
//                                                      // kINVALID_SIZE_VALUE if the tensor is not found
//   void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount);  // dump a typed buffer;
//                                                      // rowCount controls elements per line
//   void copyInputToDevice();   // copy input host buffers to input device buffers synchronously
//   void copyOutputToHost();    // copy output device buffers to output host buffers synchronously
//   void copyInputToDeviceAsync(cudaStream_t const& stream = 0);  // asynchronous variant
//   void copyOutputToHostAsync(cudaStream_t const& stream = 0);   // asynchronous variant
//   void memcpyBuffers(bool const copyInput, bool const deviceToHost, bool const async,
//       cudaStream_t const& stream = 0);
//   std::shared_ptr<nvinfer1::ICudaEngine> mEngine;   // the pointer to the engine
//   int mBatchSize;                                   // the batch size for legacy networks, 0 otherwise
//   std::vector<std::unique_ptr<ManagedBuffer>> mManagedBuffers;  // the vector of pointers to managed buffers
//   std::vector<void*> mDeviceBindings;               // the vector of device buffers needed for engine execution
//   std::unordered_map<std::string, int32_t> mNames;  // the map of tensor name and index pairs
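// A minimal end-to-end sketch of typical use. `engine`, `context`, and the
// tensor names are hypothetical; executeV2 is IExecutionContext's synchronous
// execution call.
//     BufferManager buffers(engine);
//     // ... fill buffers.getHostBuffer("input") with input data ...
//     buffers.copyInputToDevice();
//     context->executeV2(buffers.getDeviceBindings().data());
//     buffers.copyOutputToHost();
//     float const* out = static_cast<float const*>(buffers.getHostBuffer("output"));

#endif // TENSORRT_BUFFERS_H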