// torchvision/csrc/io/decoder/defs.h
#pragma once
#include <array>
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <unordered_set>
#include <vector>
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
#include <libavutil/avutil.h>
#include <libavutil/imgutils.h>
#include <libswresample/swresample.h>
#include <libswscale/swscale.h>
}
namespace ffmpeg {
// bit mask of formats, keep them in the form 2^n
enum MediaType : size_t {
TYPE_AUDIO = 1,
TYPE_VIDEO = 2,
TYPE_SUBTITLE = 4,
TYPE_CC = 8, // closed captions from transport streams
};
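// Illustrative sketch (not part of the original header): because the
// enumerators are powers of two, several media types can be combined into one
// bit mask and tested individually.
//
//   size_t requested = TYPE_AUDIO | TYPE_VIDEO;       // == 3
//   bool wantsVideo = (requested & TYPE_VIDEO) != 0;  // true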
// audio
struct AudioFormat {
// fields are initialized for auto detection;
// the caller can specify some/all field values if a specific output is desired
bool operator==(const AudioFormat& x) const {
return x.format == format && x.samples == samples && x.channels == channels;
}
size_t samples{0}; // number of samples per second (frequency)
size_t channels{0}; // number of channels
long format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE
size_t padding[2];
// -- alignment 40 bytes
};
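// Illustrative sketch (not part of the original header): requesting 16-bit
// stereo PCM at 44.1 kHz instead of auto detection. AV_SAMPLE_FMT_S16 comes
// from libavutil and is stored in the `format` field as a long.
//
//   AudioFormat audio;
//   audio.samples = 44100;            // sample rate in Hz
//   audio.channels = 2;               // stereo
//   audio.format = AV_SAMPLE_FMT_S16; // 16-bit signed PCM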
// video
struct VideoFormat {
// fields are initialized for auto detection;
// the caller can specify some/all field values if a specific output is desired
bool operator==(const VideoFormat& x) const {
return x.format == format && x.width == width && x.height == height;
}
/*
When width = 0, height = 0, minDimension = 0, and maxDimension = 0,
keep the original frame resolution
When width = 0, height = 0, minDimension != 0, and maxDimension = 0,
keep the aspect ratio and resize the frame so that shorter edge size is
minDimension
When width = 0, height = 0, minDimension = 0, and maxDimension != 0,
keep the aspect ratio and resize the frame so that longer edge size is
maxDimension
When width = 0, height = 0, minDimension != 0, and maxDimension != 0,
resize the frame so that shorter edge size is minDimension, and
longer edge size is maxDimension. The aspect ratio may not be preserved
When width = 0, height != 0, minDimension = 0, and maxDimension = 0,
keep the aspect ratio and resize the frame so that frame height is $height
When width != 0, height = 0, minDimension = 0, and maxDimension = 0,
keep the aspect ratio and resize the frame so that frame width is $width
When width != 0, height != 0, minDimension = 0, and maxDimension = 0,
resize the frame so that frame width and height are set to $width and
$height, respectively
*/
size_t width{0}; // width in pixels
size_t height{0}; // height in pixels
long format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE
size_t minDimension{0}; // choose min dimension and rescale accordingly
size_t maxDimension{0}; // choose max dimension and rescale accordingly
size_t cropImage{0}; // request image crop
// -- alignment 40 bytes
};
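// Illustrative sketch (not part of the original header): two common ways to
// fill the resize fields described above. AV_PIX_FMT_RGB24 comes from
// libavutil.
//
//   VideoFormat video;
//   video.format = AV_PIX_FMT_RGB24; // request RGB output
//   video.minDimension = 256;        // keep aspect ratio, shorter edge -> 256
//
//   // or request an exact size (aspect ratio not preserved):
//   // video.width = 224;
//   // video.height = 224;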
// subtitle/cc
struct SubtitleFormat {
long type{0}; // AVSubtitleType, auto SUBTITLE_NONE
size_t padding[4];
// -- alignment 40 bytes
};
union FormatUnion {
FormatUnion() : audio() {}
explicit FormatUnion(int) : video() {}
explicit FormatUnion(char) : subtitle() {}
explicit FormatUnion(double) : subtitle() {}
AudioFormat audio;
VideoFormat video;
SubtitleFormat subtitle;
// -- alignment 40 bytes
};
/*
The MediaFormat data structure serves as an input/output parameter.
For input formats the caller assigns values
or leaves the default values for auto detection.
For output formats all fields will be set to specific values
*/
struct MediaFormat {
// for using map/set data structures
bool operator<(const MediaFormat& x) const {
return type < x.type;
}
bool operator==(const MediaFormat& x) const {
if (type != x.type) {
return false;
}
switch (type) {
case TYPE_AUDIO:
return format.audio == x.format.audio;
case TYPE_VIDEO:
return format.video == x.format.video;
case TYPE_SUBTITLE:
case TYPE_CC:
return true;
default:
return false;
}
}
explicit MediaFormat(long s = -1) : type(TYPE_AUDIO), stream(s), format() {}
explicit MediaFormat(int x, long s = -1)
: type(TYPE_VIDEO), stream(s), format(x) {}
explicit MediaFormat(char x, long s = -1)
: type(TYPE_SUBTITLE), stream(s), format(x) {}
explicit MediaFormat(double x, long s = -1)
: type(TYPE_CC), stream(s), format(x) {}
static MediaFormat makeMediaFormat(AudioFormat format, long stream) {
MediaFormat result(stream);
result.format.audio = format;
return result;
}
static MediaFormat makeMediaFormat(VideoFormat format, long stream) {
MediaFormat result(0, stream);
result.format.video = format;
return result;
}
static MediaFormat makeMediaFormat(SubtitleFormat format, long stream) {
MediaFormat result('0', stream);
result.format.subtitle = format;
return result;
}
// format type
MediaType type;
// stream index:
// set -1 for single-stream auto detection, -2 for all-streams auto detection,
// or >= 0 for a specific stream, if the caller knows the stream index (unlikely)
long stream;
// union keeps one of the possible formats, defined by MediaType
FormatUnion format;
};
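// Illustrative sketch (not part of the original header): the tag constructors
// pick the media type (long => audio, int => video, char => subtitle,
// double => cc), and stream = -1 asks for single-stream auto detection.
//
//   MediaFormat videoRequest(0, -1);               // int tag => TYPE_VIDEO
//   videoRequest.format.video.maxDimension = 512;  // longer edge -> 512
//
//   // or build one from an already-populated format struct:
//   VideoFormat vf;
//   vf.width = 224;
//   vf.height = 224;
//   MediaFormat alt = MediaFormat::makeMediaFormat(vf, -1);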
struct DecoderParameters {
// local file, remote file, http url, rtmp stream uri, etc. anything that
// ffmpeg can recognize
std::string uri{std::string()};
// timeout on getting bytes for decoding
size_t timeoutMs{1000};
// logging level, default AV_LOG_PANIC
long logLevel{0};
// number of packet errors before the decoder gives up, 0 means never
size_t maxPackageErrors{0};
// max allowed consecutive times no bytes are processed. 0 means infinite.
size_t maxProcessNoBytes{0};
// start offset (us)
long startOffset{0};
// end offset (us)
long endOffset{-1};
// logging id
int64_t loggingUuid{0};
// internal max seekable buffer size
size_t maxSeekableBytes{0};
// adjust header pts to the epoch time
bool convertPtsToWallTime{false};
// indicate if input stream is an encoded image
bool isImage{false};
// listen and wait for new rtmp stream
bool listen{false};
// don't copy frame body, only header
bool headerOnly{false};
// enable fast seek (seek only to keyframes)
bool fastSeek{false};
// interrupt init method on timeout
bool preventStaleness{true};
// seek tolerated accuracy (us)
double seekAccuracy{1000000.0};
// Allow multithreaded decoding for numThreads > 1;
// numThreads = 0 sets up sensible defaults
int numThreads{1};
// what media types should be processed, default none
std::set<MediaFormat> formats;
// can be used for asynchronous decoders
size_t cacheSize{8192}; // how many bytes to cache before we stop reading bytes
size_t cacheTimeoutMs{1000}; // timeout on bytes writing
bool enforceCacheSize{false}; // drop output frames if cache is full
bool mergeAudioMessages{false}; // combine collocated audio messages together
std::string tlsCertFile;
std::string tlsKeyFile;
// Skip packets that fail with EPERM errors and continue decoding.
bool skipOperationNotPermittedPackets{false};
// probing size in bytes, i.e. the size of the data to analyze to get stream
// information. A higher value will enable detecting more information in case
// it is dispersed throughout the stream, but will increase latency. Must be an
// integer not less than 32. It is 5000000 by default.
int64_t probeSize{5000000};
};
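// Illustrative sketch (not part of the original header): a typical
// configuration for decoding the video stream of a local file;
// "/tmp/clip.mp4" is a hypothetical path. Note that MediaFormat::operator<
// compares only the type, so `formats` holds at most one entry per media type.
//
//   DecoderParameters params;
//   params.uri = "/tmp/clip.mp4";
//   params.timeoutMs = 5000;                   // give up on reads after 5 s
//   params.startOffset = 0;                    // microseconds
//   params.endOffset = 10 * 1000 * 1000;       // stop after the first 10 s
//   params.formats.insert(MediaFormat(0, -1)); // video, auto-detected stream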
struct DecoderHeader {
// message id, from 0 till ...
size_t seqno{0};
// decoded timestamp in microseconds, from either the beginning of the stream
// or from epoch time, see DecoderParameters::convertPtsToWallTime
long pts{0};
// non-zero if the decoded frame is a key frame
size_t keyFrame{0};
// frames per second, valid only for video streams
double fps{0};
// format specifies what kind of frame is in the payload
MediaFormat format;
};
// Abstract interface ByteStorage class
class ByteStorage {
public:
virtual ~ByteStorage() = default;
// makes sure that the buffer has at least n bytes available for writing;
// if not, the storage must reallocate memory.
virtual void ensure(size_t n) = 0;
// caller must not write more than the available bytes
virtual uint8_t* writableTail() = 0;
// caller confirms that n bytes were written to the writable tail
virtual void append(size_t n) = 0;
// caller confirms that n bytes were read from the read buffer
virtual void trim(size_t n) = 0;
// gives access to the beginning of the read buffer
virtual const uint8_t* data() const = 0;
// returns the stored size in bytes
virtual size_t length() const = 0;
// returns available capacity for writable tail
virtual size_t tail() const = 0;
// clears content, keeps capacity
virtual void clear() = 0;
};
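// Illustrative sketch (not part of the original header): the intended
// write/read cycle on a ByteStorage instance obtained from
// MediaDecoder::createByteStorage(); produceBytes/consumeBytes are
// hypothetical helpers.
//
//   storage->ensure(1024);                       // guarantee writable room
//   size_t n = produceBytes(storage->writableTail(), storage->tail());
//   storage->append(n);                          // commit the bytes written
//   consumeBytes(storage->data(), storage->length());
//   storage->trim(storage->length());            // drop the consumed bytes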
struct DecoderOutputMessage {
DecoderHeader header;
std::unique_ptr<ByteStorage> payload;
};
/*
* External provider of the encoded bytes; the specific implementation is left
* for different use cases, like file, memory, external network end-points, etc.
* Normally the input/output parameter @out is set to a valid, non-null buffer
* pointer, which indicates a "read" call; however, there are "seek" modes as well.
* @out != nullptr => read from the current offset, @whence is ignored,
* @size bytes to read => returns the number of bytes read, 0 if no more bytes
* are available, < 0 on error.
* @out == nullptr, @timeoutMs == 0 => does the provider support the "seek"
* capability in the first place? @size & @whence are ignored, returns 0 on
* success, < 0 if "seek" mode is not supported.
* @out == nullptr, @timeoutMs != 0 => normal seek call,
* offset == @size, i.e. @whence = [SEEK_SET, SEEK_CUR, SEEK_END, AVSEEK_SIZE],
* returns < 0 on error, the position if @whence = [SEEK_SET, SEEK_CUR, SEEK_END],
* or the length of the buffer if @whence = [AVSEEK_SIZE].
*/
using DecoderInCallback =
std::function<int(uint8_t* out, int size, int whence, uint64_t timeoutMs)>;
using DecoderOutCallback = std::function<void(DecoderOutputMessage&&)>;
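// Illustrative sketch (not part of the original header): a memory-backed
// DecoderInCallback that follows the read/seek contract above. `blob` is a
// hypothetical std::string holding the encoded bytes; <algorithm> and
// <cstring> are assumed for std::min/memcpy, and only SEEK_SET is handled
// for brevity.
//
//   size_t pos = 0;
//   DecoderInCallback in = [&](uint8_t* out, int size, int whence,
//                              uint64_t timeoutMs) -> int {
//     if (out == nullptr) {
//       if (timeoutMs == 0) {
//         return 0;                              // yes, seeking is supported
//       }
//       if (whence == AVSEEK_SIZE) {
//         return static_cast<int>(blob.size());  // total length
//       }
//       pos = static_cast<size_t>(size);         // SEEK_SET
//       return static_cast<int>(pos);
//     }
//     size_t left = blob.size() - pos;
//     int n = static_cast<int>(std::min<size_t>(size, left));
//     memcpy(out, blob.data() + pos, n);
//     pos += n;
//     return n;                                  // 0 signals EOF
//   };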
struct DecoderMetadata {
// time base numerator
long num{0};
// time base denominator
long den{1};
// duration of the stream, in microseconds, if available
long duration{-1};
// frames per second, valid only for video streams
double fps{0};
// format specifies what kind of frame is in the payload
MediaFormat format;
};
/**
* Abstract class for decoding media bytes
* It has two different modes: internal media bytes retrieval for a given uri,
* and an external media bytes provider in the case of memory streams
*/
class MediaDecoder {
public:
virtual ~MediaDecoder() = default;
/**
* Initializes media decoder with parameters,
* calls callback when media bytes are available.
* Media bytes get fetched internally from provided URI
* or invokes provided input callback to get media bytes.
* The input callback must be empty for the internal media provider.
* The caller can provide a non-null pointer for the metadata container
* to obtain the streams' metadata from the headers (optional)
*/
virtual bool init(
const DecoderParameters& params,
DecoderInCallback&& in,
std::vector<DecoderMetadata>* metadata) = 0;
/**
* Polls one available decoded frame from the decoder.
* Returns an error code, 0 for success
*/
virtual int decode(DecoderOutputMessage* out, uint64_t timeoutMs) = 0;
/**
* Polls available decoded bytes from the decoder, until EOF or error
*/
virtual int decode_all(const DecoderOutCallback& callback) = 0;
/**
* Stops calling callback, releases resources
*/
virtual void shutdown() = 0;
/**
* Interrupts whatever decoder is doing at any time
*/
virtual void interrupt() = 0;
/**
* Factory to create ByteStorage class instances; the particular implementation
* is left to the derived class. The caller provides the initially allocated size
*/
virtual std::unique_ptr<ByteStorage> createByteStorage(size_t n) = 0;
};
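// Illustrative sketch (not part of the original header): the basic init/decode
// loop. `MyDecoder` is a hypothetical concrete implementation of MediaDecoder,
// and `params` is a DecoderParameters instance like the one sketched above;
// an empty input callback means the bytes are fetched internally from the uri.
//
//   MyDecoder decoder;
//   std::vector<DecoderMetadata> metadata;
//   if (decoder.init(params, nullptr, &metadata)) {
//     DecoderOutputMessage msg;
//     while (decoder.decode(&msg, /*timeoutMs=*/1000) == 0) {
//       // msg.header describes the frame, msg.payload holds its bytes
//     }
//     decoder.shutdown();
//   }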
struct SamplerParameters {
MediaType type{TYPE_AUDIO};
FormatUnion in;
FormatUnion out;
int64_t loggingUuid{0};
};
/**
* Abstract class for sampling media bytes
*/
class MediaSampler {
public:
virtual ~MediaSampler() = default;
/**
* Initializes media sampler with parameters
*/
virtual bool init(const SamplerParameters& params) = 0;
/**
* Samples media bytes
* Returns an error code (< 0), or on success (>= 0) the number of bytes
* processed.
* Set @in to null to flush buffered data
*/
virtual int sample(const ByteStorage* in, ByteStorage* out) = 0;
/**
* Releases resources
*/
virtual void shutdown() = 0;
/*
* Returns media type
*/
MediaType getMediaType() const {
return params_.type;
}
/*
* Returns formats
*/
FormatUnion getInputFormat() const {
return params_.in;
}
FormatUnion getOutFormat() const {
return params_.out;
}
protected:
SamplerParameters params_;
};
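// Illustrative sketch (not part of the original header): converting decoded
// audio to 16 kHz mono float. `MySampler` is a hypothetical concrete
// implementation of MediaSampler, and `decodedAudio`, `inStorage`,
// `outStorage` are assumed to come from the decoder side.
//
//   SamplerParameters sp;
//   sp.type = TYPE_AUDIO;
//   sp.in.audio = decodedAudio;              // format produced by the decoder
//   sp.out.audio.samples = 16000;            // resample to 16 kHz
//   sp.out.audio.channels = 1;               // downmix to mono
//   sp.out.audio.format = AV_SAMPLE_FMT_FLT; // float samples
//   MySampler sampler;
//   if (sampler.init(sp)) {
//     sampler.sample(inStorage.get(), outStorage.get()); // bytes processed
//     sampler.sample(nullptr, outStorage.get());         // flush
//   }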
} // namespace ffmpeg