#pragma once #include #include #include #include #include #include #include extern "C" { #include #include #include #include #include #include #include "libswscale/swscale.h" } namespace ffmpeg { // bit mask of formats, keep them in form 2^n enum MediaType : size_t { TYPE_AUDIO = 1, TYPE_VIDEO = 2, TYPE_SUBTITLE = 4, TYPE_CC = 8, // closed captions from transport streams }; // audio struct AudioFormat { // fields are initialized for the auto detection // caller can specify some/all of field values if specific output is desirable bool operator==(const AudioFormat& x) const { return x.format == format && x.samples == samples && x.channels == channels; } size_t samples{0}; // number samples per second (frequency) size_t channels{0}; // number of channels long format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE size_t padding[2]; // -- alignment 40 bytes }; // video struct VideoFormat { // fields are initialized for the auto detection // caller can specify some/all of field values if specific output is desirable bool operator==(const VideoFormat& x) const { return x.format == format && x.width == width && x.height == height; } /* When width = 0, height = 0, minDimension = 0, and maxDimension = 0, keep the original frame resolution When width = 0, height = 0, minDimension != 0, and maxDimension = 0, keep the aspect ratio and resize the frame so that shorter edge size is minDimension When width = 0, height = 0, minDimension = 0, and maxDimension != 0, keep the aspect ratio and resize the frame so that longer edge size is maxDimension When width = 0, height = 0, minDimension != 0, and maxDimension != 0, resize the frame so that shorter edge size is minDimension, and longer edge size is maxDimension. The aspect ratio may not be preserved When width = 0, height != 0, minDimension = 0, and maxDimension = 0, keep the aspect ratio and resize the frame so that frame height is $height When width != 0, height = 0, minDimension = 0, and maxDimension = 0, keep the aspect ratio and resize the frame so that frame width is $width When width != 0, height != 0, minDimension = 0, and maxDimension = 0, resize the frame so that frame width and height are set to $width and $height, respectively */ size_t width{0}; // width in pixels size_t height{0}; // height in pixels long format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE size_t minDimension{0}; // choose min dimension and rescale accordingly size_t maxDimension{0}; // choose max dimension and rescale accordingly size_t cropImage{0}; // request image crop // -- alignment 40 bytes }; // subtitle/cc struct SubtitleFormat { long type{0}; // AVSubtitleType, auto SUBTITLE_NONE size_t padding[4]; // -- alignment 40 bytes }; union FormatUnion { FormatUnion() : audio() {} explicit FormatUnion(int) : video() {} explicit FormatUnion(char) : subtitle() {} explicit FormatUnion(double) : subtitle() {} AudioFormat audio; VideoFormat video; SubtitleFormat subtitle; // -- alignment 40 bytes }; /* MediaFormat data structure serves as input/output parameter. Caller assigns values for input formats or leave default values for auto detection For output formats all fields will be set to the specific values */ struct MediaFormat { // for using map/set data structures bool operator<(const MediaFormat& x) const { return type < x.type; } bool operator==(const MediaFormat& x) const { if (type != x.type) { return false; } switch (type) { case TYPE_AUDIO: return format.audio == x.format.audio; case TYPE_VIDEO: return format.video == x.format.video; case TYPE_SUBTITLE: case TYPE_CC: return true; default: return false; } } explicit MediaFormat(long s = -1) : type(TYPE_AUDIO), stream(s), format() {} explicit MediaFormat(int x, long s = -1) : type(TYPE_VIDEO), stream(s), format(x) {} explicit MediaFormat(char x, long s = -1) : type(TYPE_SUBTITLE), stream(s), format(x) {} explicit MediaFormat(double x, long s = -1) : type(TYPE_CC), stream(s), format(x) {} static MediaFormat makeMediaFormat(AudioFormat format, long stream) { MediaFormat result(stream); result.format.audio = format; return result; } static MediaFormat makeMediaFormat(VideoFormat format, long stream) { MediaFormat result(0, stream); result.format.video = format; return result; } static MediaFormat makeMediaFormat(SubtitleFormat format, long stream) { MediaFormat result('0', stream); result.format.subtitle = format; return result; } // format type MediaType type; // stream index: // set -1 for one stream auto detection, -2 for all streams auto detection, // >= 0, specified stream, if caller knows the stream index (unlikely) long stream; // union keeps one of the possible formats, defined by MediaType FormatUnion format; }; struct DecoderParameters { // local file, remote file, http url, rtmp stream uri, etc. anything that // ffmpeg can recognize std::string uri{std::string()}; // timeout on getting bytes for decoding size_t timeoutMs{1000}; // logging level, default AV_LOG_PANIC long logLevel{0}; // when decoder would give up, 0 means never size_t maxPackageErrors{0}; // max allowed consecutive times no bytes are processed. 0 means for infinite. size_t maxProcessNoBytes{0}; // start offset (us) long startOffset{0}; // end offset (us) long endOffset{-1}; // logging id int64_t loggingUuid{0}; // internal max seekable buffer size size_t maxSeekableBytes{0}; // adjust header pts to the epoch time bool convertPtsToWallTime{false}; // indicate if input stream is an encoded image bool isImage{false}; // listen and wait for new rtmp stream bool listen{false}; // don't copy frame body, only header bool headerOnly{false}; // enable fast seek (seek only to keyframes) bool fastSeek{false}; // interrupt init method on timeout bool preventStaleness{true}; // seek tolerated accuracy (us) double seekAccuracy{1000000.0}; // Allow multithreaded decoding for numThreads > 1; // 0 numThreads=0 sets up sensible defaults int numThreads{1}; // what media types should be processed, default none std::set formats; // can be used for asynchronous decoders size_t cacheSize{8192}; // mow many bytes to cache before stop reading bytes size_t cacheTimeoutMs{1000}; // timeout on bytes writing bool enforceCacheSize{false}; // drop output frames if cache is full bool mergeAudioMessages{false}; // combine collocated audio messages together std::string tlsCertFile; std::string tlsKeyFile; // Skip packets that fail with EPERM errors and continue decoding. bool skipOperationNotPermittedPackets{false}; // probing size in bytes, i.e. the size of the data to analyze to get stream // information. A higher value will enable detecting more information in case // it is dispersed into the stream, but will increase latency. Must be an // integer not lesser than 32. It is 5000000 by default. int64_t probeSize{5000000}; }; struct DecoderHeader { // message id, from 0 till ... size_t seqno{0}; // decoded timestamp in microseconds from either beginning of the stream or // from epoch time, see DecoderParameters::convertPtsToWallTime long pts{0}; // decoded key frame size_t keyFrame{0}; // frames per second, valid only for video streams double fps{0}; // format specifies what kind frame is in a payload MediaFormat format; }; // Abstract interface ByteStorage class class ByteStorage { public: virtual ~ByteStorage() = default; // makes sure that buffer has at least n bytes available for writing, if not // storage must reallocate memory. virtual void ensure(size_t n) = 0; // caller must not to write more than available bytes virtual uint8_t* writableTail() = 0; // caller confirms that n bytes were written to the writable tail virtual void append(size_t n) = 0; // caller confirms that n bytes were read from the read buffer virtual void trim(size_t n) = 0; // gives an access to the beginning of the read buffer virtual const uint8_t* data() const = 0; // returns the stored size in bytes virtual size_t length() const = 0; // returns available capacity for writable tail virtual size_t tail() const = 0; // clears content, keeps capacity virtual void clear() = 0; }; struct DecoderOutputMessage { DecoderHeader header; std::unique_ptr payload; }; /* * External provider of the ecnoded bytes, specific implementation is left for * different use cases, like file, memory, external network end-points, etc. * Normally input/output parameter @out set to valid, not null buffer pointer, * which indicates "read" call, however there are "seek" modes as well. * @out != nullptr => read from the current offset, @whence got ignored, * @size bytes to read => return number bytes got read, 0 if no more bytes * available, < 0 on error. * @out == nullptr, @timeoutMs == 0 => does provider support "seek" * capability in a first place? @size & @whence got ignored, return 0 on * success, < 0 if "seek" mode is not supported. * @out == nullptr, @timeoutMs != 0 => normal seek call * offset == @size, i.e. @whence = [SEEK_SET, SEEK_CUR, SEEK_END, AVSEEK_SIZE) * return < 0 on error, position if @whence = [SEEK_SET, SEEK_CUR, SEEK_END], * length of buffer if @whence = [AVSEEK_SIZE]. */ using DecoderInCallback = std::function; using DecoderOutCallback = std::function; struct DecoderMetadata { // time base numerator long num{0}; // time base denominator long den{1}; // duration of the stream, in miscroseconds, if available long duration{-1}; // frames per second, valid only for video streams double fps{0}; // format specifies what kind frame is in a payload MediaFormat format; }; /** * Abstract class for decoding media bytes * It has two different modes. Internal media bytes retrieval for given uri and * external media bytes provider in case of memory streams */ class MediaDecoder { public: virtual ~MediaDecoder() = default; /** * Initializes media decoder with parameters, * calls callback when media bytes are available. * Media bytes get fetched internally from provided URI * or invokes provided input callback to get media bytes. * Input callback must be empty for the internal media provider * Caller can provide non-null pointer for the input container * if headers to obtain the streams metadata (optional) */ virtual bool init( const DecoderParameters& params, DecoderInCallback&& in, std::vector* metadata) = 0; /** * Polls available decoded one frame from decoder * Returns error code, 0 - for success */ virtual int decode(DecoderOutputMessage* out, uint64_t timeoutMs) = 0; /** * Polls available decoded bytes from decoder, till EOF or error */ virtual int decode_all(const DecoderOutCallback& callback) = 0; /** * Stops calling callback, releases resources */ virtual void shutdown() = 0; /** * Interrupts whatever decoder is doing at any time */ virtual void interrupt() = 0; /** * Factory to create ByteStorage class instances, particular implementation is * left to the derived class. Caller provides the initially allocated size */ virtual std::unique_ptr createByteStorage(size_t n) = 0; }; struct SamplerParameters { MediaType type{TYPE_AUDIO}; FormatUnion in; FormatUnion out; int64_t loggingUuid{0}; }; /** * Abstract class for sampling media bytes */ class MediaSampler { public: virtual ~MediaSampler() = default; /** * Initializes media sampler with parameters */ virtual bool init(const SamplerParameters& params) = 0; /** * Samples media bytes * Returns error code < 0, or >=0 - for success, indicating number of bytes * processed. * set @in to null for flushing data */ virtual int sample(const ByteStorage* in, ByteStorage* out) = 0; /** * Releases resources */ virtual void shutdown() = 0; /* * Returns media type */ MediaType getMediaType() const { return params_.type; } /* * Returns formats */ FormatUnion getInputFormat() const { return params_.in; } FormatUnion getOutFormat() const { return params_.out; } protected: SamplerParameters params_; }; } // namespace ffmpeg