#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include <unordered_map>
#include <algorithm>
#include <float.h>
#include <string.h>
#include <chrono>
#include <iterator>

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "common.h"
#include "BatchStream.h"
#include "LegacyCalibrator.h"

using namespace nvinfer1;
using namespace nvcaffeparser1;

static Logger gLogger;
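
// This sample scores a Caffe image-classification model with TensorRT ("GIE")
// at FP32, FP16 and INT8 precision and reports top-1/top-5 accuracy plus
// per-image and per-batch timing. Logger, CHECK and BatchStream come from the
// accompanying sample headers included above.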
// stuff we know about the network and the Caffe input/output blobs
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
const char* gNetworkName{nullptr};
std::string locateFile(const std::string& input)
{
    std::vector<std::string> dirs;
    dirs.push_back(std::string("data/int8/") + gNetworkName + std::string("/"));
    dirs.push_back(std::string("data/") + gNetworkName + std::string("/"));
    return locateFile(input, dirs);
}

bool caffeToGIEModel(const std::string& deployFile,            // name for Caffe prototxt
                     const std::string& modelFile,             // name for model
                     const std::vector<std::string>& outputs,  // network outputs
                     unsigned int maxBatchSize,                // batch size - NB must be at least as large as the batch we want to run with
                     DataType dataType,
                     IInt8Calibrator* calibrator,
                     nvinfer1::IHostMemory*& gieModelStream)
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger);

    // parse the caffe model to populate the network, then set the outputs
    INetworkDefinition* network = builder->createNetwork();
    ICaffeParser* parser = createCaffeParser();
    if ((dataType == DataType::kINT8 && !builder->platformHasFastInt8())
        || (dataType == DataType::kHALF && !builder->platformHasFastFp16()))
        return false;
    const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile).c_str(),
                                                              locateFile(modelFile).c_str(),
                                                              *network,
                                                              dataType == DataType::kINT8 ? DataType::kFLOAT : dataType);

    // specify which tensors are outputs
    for (auto& s : outputs)
        network->markOutput(*blobNameToTensor->find(s.c_str()));
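
    // note: for the INT8 path the weights above are still parsed as FP32;
    // TensorRT quantizes them at build time, and the calibrator set below
    // supplies activation ranges by streaming calibration batches through
    // the network during buildCudaEngine()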
    // build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 30);
    builder->setAverageFindIterations(1);
    builder->setMinFindIterations(1);
    builder->setDebugSync(true);
    builder->setInt8Mode(dataType == DataType::kINT8);
    builder->setHalf2Mode(dataType == DataType::kHALF);
    builder->setInt8Calibrator(calibrator);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    // we don't need the network any more, and we can destroy the parser
    network->destroy();
    parser->destroy();

    // serialize the engine, then close everything down
    gieModelStream = engine->serialize();
    engine->destroy();
    builder->destroy();
    return true;
}
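
// copies one input batch to the GPU, runs it through the engine, copies the
// results back, and returns the elapsed GPU time in milliseconds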
float doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine
    // requires exactly IEngine::getNbBindings() of these, but in this case we
    // know that there is exactly one input and one output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];
    float ms{0.0f};
    // in order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
        outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // create GPU buffers and a stream
    DimsCHW inputDims = static_cast<DimsCHW&&>(context.getEngine().getBindingDimensions(context.getEngine().getBindingIndex(INPUT_BLOB_NAME)));
    DimsCHW outputDims = static_cast<DimsCHW&&>(context.getEngine().getBindingDimensions(context.getEngine().getBindingIndex(OUTPUT_BLOB_NAME)));
    size_t inputSize = batchSize * inputDims.c() * inputDims.h() * inputDims.w() * sizeof(float),
           outputSize = batchSize * outputDims.c() * outputDims.h() * outputDims.w() * sizeof(float);
    CHECK(cudaMalloc(&buffers[inputIndex], inputSize));
    CHECK(cudaMalloc(&buffers[outputIndex], outputSize));

    CHECK(cudaMemcpy(buffers[inputIndex], input, inputSize, cudaMemcpyHostToDevice));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    cudaEvent_t start, end;
    CHECK(cudaEventCreateWithFlags(&start, cudaEventBlockingSync));
    CHECK(cudaEventCreateWithFlags(&end, cudaEventBlockingSync));
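
    // enqueue() is asynchronous; recording events on the same stream around it
    // measures GPU execution time only, and cudaEventBlockingSync lets the host
    // thread block instead of spinning when we synchronize on the end event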
    cudaEventRecord(start, stream);
    context.enqueue(batchSize, buffers, stream, nullptr);
    cudaEventRecord(end, stream);
    cudaEventSynchronize(end);
    cudaEventElapsedTime(&ms, start, end);
    cudaEventDestroy(start);
    cudaEventDestroy(end);

    CHECK(cudaMemcpy(output, buffers[outputIndex], outputSize, cudaMemcpyDeviceToHost));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
    CHECK(cudaStreamDestroy(stream));
    return ms;
}
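
// counts how many samples in the batch rank their correct label within the
// top 'threshold' scores (threshold = 1 gives top-1, threshold = 5 gives top-5)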
int calculateScore(float* batchProb, float* labels, int batchSize, int outputSize, int threshold)
{
    int success = 0;
    for (int i = 0; i < batchSize; i++)
    {
        float* prob = batchProb + outputSize * i, correct = prob[(int) labels[i]];

        int better = 0;
        for (int j = 0; j < outputSize; j++)
            if (prob[j] >= correct)
                better++;
        if (better <= threshold)
            success++;
    }
    return success;
}
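
// INT8 entropy calibrator: feeds calibration batches from a BatchStream to
// TensorRT through getBatch(), and reads/writes the resulting calibration
// table on disk so later runs can skip the calibration pass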
class Int8EntropyCalibrator : public IInt8EntropyCalibrator
{
public:
    Int8EntropyCalibrator(BatchStream& stream, int firstBatch, bool readCache = true)
        : mStream(stream), mReadCache(readCache)
    {
        DimsNCHW dims = mStream.getDims();
        mInputCount = mStream.getBatchSize() * dims.c() * dims.h() * dims.w();
        CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
        mStream.reset(firstBatch);
    }
    virtual ~Int8EntropyCalibrator()
    {
        CHECK(cudaFree(mDeviceInput));
    }

    int getBatchSize() const override { return mStream.getBatchSize(); }
    bool getBatch(void* bindings[], const char* names[], int nbBindings) override
    {
        if (!mStream.next())
            return false;

        CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
        assert(!strcmp(names[0], INPUT_BLOB_NAME));
        bindings[0] = mDeviceInput;
        return true;
    }

    const void* readCalibrationCache(size_t& length) override
    {
        mCalibrationCache.clear();
        std::ifstream input(calibrationTableName(), std::ios::binary);
        input >> std::noskipws;
        if (mReadCache && input.good())
            std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));

        length = mCalibrationCache.size();
        return length ? &mCalibrationCache[0] : nullptr;
    }

    void writeCalibrationCache(const void* cache, size_t length) override
    {
        std::ofstream output(calibrationTableName(), std::ios::binary);
        output.write(reinterpret_cast<const char*>(cache), length);
    }

private:
    static std::string calibrationTableName()
    {
        assert(gNetworkName);
        return std::string("CalibrationTable") + gNetworkName;
    }
    BatchStream mStream;
    bool mReadCache{true};

    size_t mInputCount;
    void* mDeviceInput{nullptr};
    std::vector<char> mCalibrationCache;
};
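
// builds an engine at the requested precision, scores nbScoreBatches batches
// against their labels, and returns the (top-1, top-5) accuracy pair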
std::pair<float, float> scoreModel(int batchSize, int firstBatch, int nbScoreBatches, DataType datatype, IInt8Calibrator* calibrator, bool quiet = false)
{
    IHostMemory* gieModelStream{nullptr};
    bool valid = false;
    if (gNetworkName == std::string("mnist"))
        valid = caffeToGIEModel("deploy.prototxt", "mnist_lenet.caffemodel", std::vector<std::string>{OUTPUT_BLOB_NAME}, batchSize, datatype, calibrator, gieModelStream);
    else
        valid = caffeToGIEModel("deploy.prototxt", std::string(gNetworkName) + ".caffemodel", std::vector<std::string>{OUTPUT_BLOB_NAME}, batchSize, datatype, calibrator, gieModelStream);
    if (!valid)
    {
        std::cout << "Engine could not be created at this precision" << std::endl;
        return std::pair<float, float>(0, 0);
    }

    // create engine and deserialize model
    IRuntime* infer = createInferRuntime(gLogger);
    ICudaEngine* engine = infer->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
    if (gieModelStream) gieModelStream->destroy();
    IExecutionContext* context = engine->createExecutionContext();

    BatchStream stream(batchSize, nbScoreBatches);
    stream.skip(firstBatch);

    DimsCHW outputDims = static_cast<DimsCHW&&>(context->getEngine().getBindingDimensions(context->getEngine().getBindingIndex(OUTPUT_BLOB_NAME)));
    int outputSize = outputDims.c() * outputDims.h() * outputDims.w();
    int top1{0}, top5{0};
    float totalTime{0.0f};
    std::vector<float> prob(batchSize * outputSize, 0);
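
    // score every batch: accumulate GPU time and running top-1/top-5 counts,
    // printing a dot every 10 batches and a newline every 800 as progress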
    while (stream.next())
    {
        totalTime += doInference(*context, stream.getBatch(), &prob[0], batchSize);

        top1 += calculateScore(&prob[0], stream.getLabels(), batchSize, outputSize, 1);
        top5 += calculateScore(&prob[0], stream.getLabels(), batchSize, outputSize, 5);

        std::cout << (!quiet && stream.getBatchesRead() % 10 == 0 ? "." : "") << (!quiet && stream.getBatchesRead() % 800 == 0 ? "\n" : "") << std::flush;
    }
    int imagesRead = stream.getBatchesRead() * batchSize;
    float t1 = float(top1) / float(imagesRead), t5 = float(top5) / float(imagesRead);

    if (!quiet)
    {
        std::cout << "\nTop1: " << t1 << ", Top5: " << t5 << std::endl;
        std::cout << "Processing " << imagesRead << " images averaged " << totalTime / imagesRead << " ms/image and " << totalTime / stream.getBatchesRead() << " ms/batch." << std::endl;
    }
    context->destroy();
    engine->destroy();
    infer->destroy();
    return std::make_pair(t1, t5);
}
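
// entry point: the first argument selects the network (e.g. "mnist");
// optional batch=/start=/score= arguments and the "search"/"legacy" flags
// are parsed in the loop below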
int main(int argc, char** argv)
{
    if (argc < 2)
    {
        std::cout << "Please provide the network as the argument" << std::endl;
        exit(0);
    }
    gNetworkName = argv[1];

    // by default we score over 40K images starting at 10000, so we don't score those used to search calibration
    int batchSize = 100, firstScoreBatch = 100, nbScoreBatches = 400;
    bool search = false;
    CalibrationAlgoType calibrationAlgo = CalibrationAlgoType::kENTROPY_CALIBRATION;

    for (int i = 2; i < argc; i++)
    {
        if (!strncmp(argv[i], "batch=", 6))
            batchSize = atoi(argv[i] + 6);
        else if (!strncmp(argv[i], "start=", 6))
            firstScoreBatch = atoi(argv[i] + 6);
        else if (!strncmp(argv[i], "score=", 6))
            nbScoreBatches = atoi(argv[i] + 6);
        else if (!strncmp(argv[i], "search", 6))
            search = true;
        else if (!strncmp(argv[i], "legacy", 6))
            calibrationAlgo = CalibrationAlgoType::kLEGACY_CALIBRATION;
        else
        {
            std::cout << "Unrecognized argument " << argv[i] << std::endl;
            exit(0);
        }
    }
    // the quantile/cutoff search only applies to the legacy calibrator
    if (calibrationAlgo == CalibrationAlgoType::kENTROPY_CALIBRATION)
    {
        search = false;
    }

    if (batchSize > 128)
    {
        std::cout << "Please provide batch size <= 128" << std::endl;
        exit(0);
    }

    if ((firstScoreBatch + nbScoreBatches) * batchSize > 500000)
    {
        std::cout << "Only 50000 images available" << std::endl;
        exit(0);
    }

    std::cout.precision(6);
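
    // CAL_BATCH_SIZE, NB_CAL_BATCHES and FIRST_CAL_BATCH are calibration
    // constants presumably defined in the sample headers included at the top
    // (LegacyCalibrator.h); the same stream feeds either calibrator below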
    BatchStream calibrationStream(CAL_BATCH_SIZE, NB_CAL_BATCHES);

    std::cout << "\nFP32 run:" << nbScoreBatches << " batches of size " << batchSize << " starting at " << firstScoreBatch << std::endl;
    scoreModel(batchSize, firstScoreBatch, nbScoreBatches, DataType::kFLOAT, nullptr);

    std::cout << "\nFP16 run:" << nbScoreBatches << " batches of size " << batchSize << " starting at " << firstScoreBatch << std::endl;
    scoreModel(batchSize, firstScoreBatch, nbScoreBatches, DataType::kHALF, nullptr);

    std::cout << "\nINT8 run:" << nbScoreBatches << " batches of size " << batchSize << " starting at " << firstScoreBatch << std::endl;
    if (calibrationAlgo == CalibrationAlgoType::kENTROPY_CALIBRATION)
    {
        Int8EntropyCalibrator calibrator(calibrationStream, FIRST_CAL_BATCH);
        scoreModel(batchSize, firstScoreBatch, nbScoreBatches, DataType::kINT8, &calibrator);
    }
    else
    {
        std::pair<double, double> parameters = getQuantileAndCutoff(gNetworkName, search);
        Int8LegacyCalibrator calibrator(calibrationStream, FIRST_CAL_BATCH, parameters.first, parameters.second);
        scoreModel(batchSize, firstScoreBatch, nbScoreBatches, DataType::kINT8, &calibrator);
    }
    shutdownProtobufLibrary();
    return 0;
}