diff options
-rw-r--r-- | DOCS/tech/mncf.txt | 661 |
1 files changed, 661 insertions, 0 deletions
diff --git a/DOCS/tech/mncf.txt b/DOCS/tech/mncf.txt new file mode 100644 index 0000000000..5afb1045be --- /dev/null +++ b/DOCS/tech/mncf.txt @@ -0,0 +1,661 @@ + NUT Open Container Format DRAFT 20041005 (Michael's experimental fork) + ---------------------------------------- + + + + Intro: + +Features / goals: + (supported by the format, not necessary by a specific implementation) + +Simple + use the same encoding for nearly all fields + simple decoding, so slow cpus (and embedded systems) can handle it +Extendible + no limit for the possible values for all fields (using universal vlc) + allow adding of new headers in the future + allow adding more fields at the end of headers +Compact + ~0.2% overhead, for normal bitrates + index is <10kb per hour (1 keyframe every 3sec) + a usual header for a file is about 100bytes (audio + video headers together) + a packet header is about ~1-8 bytes +Error resistant + seeking / playback without an index + headers & index can be repeated + damaged files can be played back with minimal data lost and fast + resyncing times + + + + Definitions: + +MUST the specific part must be done to conform to this standard +SHOULD its recommanded to be done that way but its not strictly required + + + + Syntax: + + Type definitions: + +f(x) n fixed bits in big-endian order +u(x) unsigned number encoded in x bits in MSB first order + +v + value=0 + do{ + more_data u(1) + data u(7) + value= 128*value + data + }while(more_data) + +s + temp v + temp++ + if(temp&1) value= -(temp>>1) + else value= (temp>>1) + +b (binary data or string) + for(i=0; i<length; i++){ + data[i] u(8) + } + Note: strings MUST be encoded in utf8 + +vb + length v + value b + + + Bitstream syntax: +packet header + forward ptr v + +align_byte + while(not byte aligned) + one f(1) + +reserved_bytes + for(i=0; i<forward_ptr - length_of_non_reserved; i++) + reserved u(8) + a demuxer MUST ignore any reserved bytes + a muxer MUST NOT write any reserved bytes, as this would make it + inpossible to add new fields at the end of packets in the future in + a compatible way + +main header: + main_startcode f(64) + packet header + version v + stream_count v + max_distance v + max_short_distance v + global_time_base_nom v + global_time_base_denom v + short_startcode v + for(i=0; i<256; ){ + tmp_flag v + tmp_fields v + if(tmp_fields>0) tmp_timestamp s + if(tmp_fields>1) tmp_mul v + if(tmp_fields>2) tmp_stream v + if(tmp_fields>3) tmp_size v + else tmp_size=0 + if(tmp_fields>4) tmp_res v + else tmp_res=0 + if(tmp_fields>5) count v + else count= tmp_mul - tmp_size + for(j=6; j<tmp_fields; j++){ + tmp_reserved[i] v + } + for(j=0; j<count && i<256; j++, i++){ + flags[i]= tmp_flag; + stream_id_plus1[i]= tmp_stream; + data_size_mul[i]= tmp_mul; + data_size_lsb[i]= tmp_size + j; + timestamp_delta[i]= tmp_timestamp; + reserved_count[i]= tmp_res; + } + } + reserved_bytes + checksum u(32) + +stream_header: + stream_startcode f(64) + packet_header + stream_id v + stream_class v + fourcc vb + average_bitrate v + time_base_nom v + time_base_denom v + msb_timestamp_shift v + decode_delay v + fixed_fps u(1) + reserved u(6) + codec_specific_data vb + +video_stream_header: + stream_header + width v + height v + sample_width v + sample_height v + colorspace_type v + reserved_bytes + checksum u(32) + +audio_stream_header: + stream_header + samplerate_nom v + samplerate_denom v + channel_count v + reserved_bytes + checksum u(32) + + +frame + frame_code f(8) + if(stream_id_plus1[frame_code]==0){ + stream_id v + } + if(timestamp_delta[frame_code]==0){ + coded_timestamp v + } + if(flags[frame_code]&1){ + data_size_msb v + } + for(i=0; i<reserved_count[frame_code]; i++) + reserved v + data + +Index: + index_startcode f(64) + packet header + index_length v + for(i=0; i<index_length; i++){ + index_timestamp v + index_position v + } + reserved_bytes + checksum u(32) + +info_packet: (optional) + info_startcode f(64) + packet header + for(;;){ + id v + if(id==0) break + name= info_table[id][0] + type= info_table[id][1] + if(type==NULL) + type vb + if(name==NULL) + name vb + if(type=="v") + value v + else + value vb + } + reserved_bytes + checksum u(32) + +sync_point + frame_startcode f(64) + global_timestamp v + +file + file_id_string + while(!eof && next_code != index_startcode){ + main_header + for(i=0; i<stream_count; i++){ + if(next_packet==video_stream_header) + video_stream_header + else + audio_stream_header + } + while(next_code != main_startcode){ + if(next_code == info_startcode) + info_packet + else{ + if(next_code == short_startcode) + short_startcode u(24) + else if(next_code == frame_startcode) + sync_point + frame + } + } + } + index + +forward_ptr + size of the packet (exactly the distance from the first byte of the + startcode of the current packet to the first byte of the following packet + +file_id_string + "nut/multimedia container\0" + +*_startcode + all startcodes start with 'N' + +main_startcode + 0x7A561F5F04ADULL + (((uint64_t)('N'<<8) + 'M')<<48) +stream_starcode + 0x11405BF2F9DBULL + (((uint64_t)('N'<<8) + 'S')<<48) +frame_startcode + 0xE4ADEECA4569ULL + (((uint64_t)('N'<<8) + 'K')<<48) + frame_startcodes SHOULD be placed immedeatly before a keyframe if the + previous frame of the same stream was a non-keyframe, unless such + non-keyframe - keyframe tansitions are very frequent + +index_startcode + 0xDD672F23E64EULL + (((uint64_t)('N'<<8) + 'X')<<48) +info_startcode + 0xAB68B596BA78ULL + (((uint64_t)('N'<<8) + 'I')<<48) + +version + 2 for now + +max_distance + max distance of frame_startcodes, the distance may only be larger if + there is only a single frame between the 2 frame_startcodes + this can be used by the demuxer to detect damaged frame headers if the + damage results in a too long chain + SHOULD be set to <=32768 or at least <=65536 unless there is a very good + reason to set it higher otherwise reasonable error recovery will be + impossible + +max_short_distance + max distance of short startcodes or frame_startcodes, the distance may + only be larger if there is only a single frame between the 2 + frame_startcodes/short startcodes this can be used by the demuxer to + detect damaged frame headers if the damage results in a too long chain + SHOULD be set to <=4096 or at least <=8192 unless there is a very good + reason to set it higher otherwise reasonable error recovery will be + impossible + + +short_startcode + MUST be 3 bytes long and MUST have 'N' as first byte, the second byte + MUST not be a printable uppercase letter / must not be within 65..90, + default is 0x4EFE79 + +stream_id + Note: streams with a lower relative class MUST have a lower relative id + so a stream with class 0 MUST allways have a id which is lower then any + stream with class > 0 + stream_id MUST be < stream_count + +stream_class + 0 video + 32 audio + 64 subtiles + Note the remaining values are reserved and MUST NOT be used + a demuxer MUST ignore streams with reserved classes + +fourcc + identification for the codec + example: "H264" + MUST contain 2 or 4 bytes, note, this might be increased in the future + if needed + +time_base_nom / time_base_denom = time_base + the number of timer ticks per second, this MUST be equal to the fps + if the fixed_fps is 1 + time_base_denom MUST not be 0 + time_base_nom and time_base_denom MUST be relative prime + time_base_nom MUST be < 2^31 + examples: + fps time_base_nom time_base_denom + 30 30 1 + 29.97 30000 1001 + 23.976 24000 1001 + sample_rate sample_rate_mul time_base_nom time_base_denom + 44100 1 44100 1 + 44100 64 11025 16 + 48000 1024 375 8 + Note: the advantage to using a large sample_rate_mul is that the + timestamps need fewer bits + +global_time_base_nom / global_time_base_denom = global_time_base + the number of timer ticks per second + global_time_base_denom MUST not be 0 + global_time_base_nom and global_time_base_denom MUST be relative prime + global_time_base_nom MUST be < 2^31 + +global_timestamp + timestamp in global_time_base units + when a global_timestamp is encountered the last_timestamp of all streams + is set to the following: + ln= global_time_base_denom*time_base_nom + sn= global_timestamp + d1= global_time_base_nom + d2= time_base_denom + last_timestamp= (ln/d1*sn + ln%d1*sn/d1)/d2 + Note, this calculation MUST be done with unsigned 64 bit integers, and + is equivalent to (ln*sn)/(d1*d2) but this would require a 96bit integer + +msb_timestamp_shift + amount of bits in lsb_timestamp + MUST be <16 + +decode_delay + maximum time between input and output for a codec, used to generate dts + from pts + is 0 for streams without b frames, and 1 for streams with b frames, may + be larger for future codecs + +fixed_fps + 1 indicates that the fps is fixed + +codec_specific_data + private global data for a codec (could be huffman tables or ...) + +frame_code + the meaning of this byte is stored in the main header + the value 78 ('N') is forbidden to ensure that the byte is always + different from the first byte of any startcode + +flags[frame_code] + the bits of the flags from MSB to LSB are KD + if D is 1 then data_size_msb is coded, otherwise data_size_msb is 0 + K is the keyframe_type + 0-> no keyframe, + 1-> keyframe, + flags=4 can be used to mark illegal frame_code bytes + frame_code=78 must have flags=4 + * frames MUST not depend(1) upon frames prior to the last + frame_startcode + depend(1) means dependancy on the container level (NUT) not dependancy + on the codec level + +stream_id_plus1[frame_code] + must be <250 + if its 0 then the stream_id is coded in the frame + +data_size_mul[frame_code] + must be <16384 + +data_size_lsb[frame_code] + must be <16384 + +timestamp_delta[frame_code] + must be <16384 and >-16384 + +data_size + data_size= data_size_lsb + data_size_msb*data_size_mul; + +coded_timestamp + if coded_timestamp < (1<<msb_timestamp_shift) then its a + lsb timestamp, otherwise its a full timestamp + (1<<msb_timestamp_shift) + lsb timestamps are converted to full timesamps by: + mask = (1<<msb_timestamp_shift)-1; + delta= last_timestamp - mask/2 + timestamp= ((timestamp_lsb-delta)&mask) + delta + a full timestamp must be used if there is no reference timestamp + available after the last frame_startcode with the current stream_id + +lsb_timestamp + least significant bits of the timestamp in time_base precission + Example: IBBP display order + keyframe timestamp=0 -> timestamp=0 + frame lsb_timestamp=3 -> timestamp=3 + frame lsb_timestamp=1 -> timestamp=1 + frame lsb_timestamp=2 -> timestamp=2 + ... + keyframe msb_timestamp=257 -> timestamp=257 + frame lsb_timestamp=255->timestamp=255 + frame lsb_timestamp=0 -> timestamp=256 + frame lsb_timestamp=4 -> timestamp=260 + frame lsb_timestamp=2 -> timestamp=258 + frame lsb_timestamp=3 -> timestamp=259 + all timestamps of keyframes of a single stream MUST be monotone + +dts + dts are calculated by using a decode_delay+1 sized buffer for each + stream, into which the current pts is inserted and the element with + the smallest value is removed, this is then the current dts + this buffer is initalized with decode_delay -1 elements + all frames with dts == timestamp must be monotone, that means a frame + which occures later in the stream must have a larger or equal dts + then an earlier frame + FIXME rename timestamp* to pts* ? + +width/height + MUST be set to the coded width/height + +sample_width/sample_height (aspect ratio) + sample_width is the horizontal distance between samples + sample_width and sample_height MUST be relative prime if not zero + MUST be 0 if unknown + +colorspace_type + 0 unknown + 1 ITU Rec 624 / ITU Rec 601 Y range: 16..235 Cb/Cr range: 16..240 + 2 ITU Rec 709 Y range: 16..235 Cb/Cr range: 16..240 + 17 ITU Rec 624 / ITU Rec 601 Y range: 0..255 Cb/Cr range: 0..255 + 18 ITU Rec 709 Y range: 0..255 Cb/Cr range: 0..255 + +samplerate_nom / samplerate_denom = samplerate + the number of samples per second + +checksum + adler32 checksum + +index_timestamp + value of the timetamp in a sync point relative to the last sync-point + +index_position + position in bytes of the first byte of a sync-point, relative to the + last sync_point + +id + the id of the type/name pair, so its more compact + 0 means end + +type + for example: "UTF8" -> String or "JPEG" -> jpeg image + Note: nonstandard fields should be prefixed by "X-" + Note: MUST be less than 6 byte long (might be increased to 64 later) + +name + the name of the info entry, valid names are + "TotalTime" total length of the stream in msecs + "StreamId" the stream(s) to which the info packet applies + "StartTimestamp" + "EndTimestamp" the time range in msecs to which the info applies + "SegmentId" a unique id for the streams + time specified + "Author" + "Description" + "Copyright" + "Encoder" the name & version of the software used for encoding + "Title" + "Cover" an image of the (cd,dvd,vhs,..) cover (preferable PNG or JPEG) + "Source" "DVD", "VCD", "CD", "MD", "FM radio", "VHS", "TV", + "LD" + Optional: appended PAL,NTSC,SECAM, ... in parentheses + "CaptureDevice" "BT878", "BT848", "webcam", ... (more exact names are fine too) + "CreationTime" "2003-01-20 20:13:15Z", ... + (ISO 8601 format, see http://www.cl.cam.ac.uk/~mgk25/iso-time.html) + Note: dont forget the timezone + "ReplayGain" + "Keywords" + "Language" ISO 639 and ISO 3166 for language/country code + something like "usen" (US english), can be 0 if unknown + and "multi" if several languages + see http://www.loc.gov/standards/iso639-2/englangn.html + and http://www.din.de/gremien/nas/nabd/iso3166ma/codlstp1/en_listp1.htmlthe language code + "Disposition" "original", "dub" (translated), "comment", "lyrics", "karaoke" + Note: if someone needs some others, please tell us about them, so we can + add them to the official standard (if they are sane) + Note: nonstandard fields should be prefixed by "X-" + Note: MUST be less than 64 bytes long + +value + value of this name/type pair + +stuffing + 0x80 can be placed infront of any type v entry for stuffing + purposes + +info_table[][2]={ + {NULL , NULL }, // end + {NULL , NULL }, + {NULL , "UTF8"}, + {NULL , "v"}, + {NULL , "s"}, + {"StreamId" , "v"}, + {"SegmentId" , "v"}, + {"StartTimestamp" , "v"}, + {"EndTimestamp" , "v"}, + {"Author" , "UTF8"}, + {"Titel" , "UTF8"}, + {"Language" , "UTF8"}, + {"Description" , "UTF8"}, + {"Copyright" , "UTF8"}, + {"Encoder" , "UTF8"}, + {"Keyword" , "UTF8"}, + {"Cover" , "JPEG"}, + {"Cover" , "PNG"}, + {"Disposition" , "UTF8"}, +}; + + Structure: + +the headers MUST be in exactly the following order (to simplify demuxer design) +main header +stream_header (id=0) +stream_header (id=1) +... +stream_header (id=n) + +headers may be repated, but if they are then they MUST all be repeated together +and repeated headers MUST be identical +headers MAY only repeated at the closest possible positions after 2^x where x is +an integer and the file end, so the headers may be repeated at 4102 if thats the +closest possition after 2^12=4096 at which the headers can be placed + +headers MUST be placed at least at the begin of the file and immedeatly before +the index or at the file end if there is no index +headers MUST be repeated at least twice (so they exist 3 times in a file) + +a demuxer MUST not demux a stream which contains more than one stream, or which +is wrapped in a structure to facilitate more than one stream or otherwise +duplicate the role of a container. any such file is to be considered invalid + +info packets which describe the whole file or individual streams/tracks must be +placed before any video/audio/... frames + + Index +every sync-point must be exacty once in the index +Note: in case of realtime streaming there is no end, so no index there either + + Info packets +the info_packet can be repeated, it can also contain different names & values +each time but only if allso the time is different +Info packets can be used to describe the file or some part of it (chapters) + +info packets, SHOULD be placed at the begin of the file at least +for realtime streaming info packets will normally be transmitted when they apply +for example, the current song title & artist of the currently shown music video + + Unknown packets +MUST be ignored by the demuxer + + demuxer (non-normative) + +in the absence of valid header at beginning, players SHOULD search for backup +headers starting at offset 2^x for each x players SHOULD end their search from a +particular offset when any startcode is found (including syncpoint) + + + Sample code (GPL, & untested) + +typedef BufferContext{ + uint8_t *buf; + uint8_t *buf_ptr; +}BufferContext; + +static inline uint64_t get_bytes(BufferContext *bc, int count){ + uint64_t val=0; + + assert(count>0 && count<9) + + for(i=0; i<count; i++){ + val <<=8; + val += *(bc->buf_ptr++); + } + + return val; +} + +static inline void put_bytes(BufferContext *bc, int count, uint64_t val){ + uint64_t val=0; + + assert(count>0 && count<9) + + for(i=count-1; i>=0; i--){ + *(bc->buf_ptr++)= val >> (8*i); + } + + return val; +} + +static inline uint64_t get_v(BufferContext *bc){ + uint64_t val= 0; + + for(; space_left(bc) > 0; ){ + int tmp= *(bc->buf_ptr++); + if(tmp&0x80) + val= (val<<7) + tmp - 0x80; + else + return (val<<7) + tmp; + } + + return -1; +} + +static inline int put_v(BufferContext *bc, uint64_t val){ + int i; + + if(space_left(bc) < 9) return -1; + + val &= 0x7FFFFFFFFFFFFFFFULL; // FIXME can only encode upto 63 bits currently + for(i=7; ; i+=7){ + if(val>>i == 0) break; + } + + for(i-=7; i>0; i-=7){ + *(bc->buf_ptr++)= 0x80 | (val>>i); + } + *(bc->buf_ptr++)= val&0x7F; + + return 0; +} + +static int64_t get_dts(int64_t pts, int64_t *pts_cache, int delay, int reset){ + if(reset) memset(pts_cache, -1, delay*sizeof(int64_t)); + + while(delay--){ + int64_t t= pts_cache[delay]; + if(t < pts){ + pts_cache[delay]= pts; + pts= t; + } + } + + return pts; +} + + Authors + +Folks from MPlayer Developers Mailinglist (http://www.mplayehrq.hu/). +Authors in ABC-order: (FIXME! Tell us if we left you out) + Beregszaszi, Alex (alex@fsn.hu) + Bunkus, Moritz (moritz@bunkus.org) + Diedrich, Tobias (td@sim.uni-hannover.de) + Felker, Rich (dalias@aerifal.cx) + Franz, Fabian (FabianFranz@gmx.de) + Gereoffy, Arpad (arpi@thot.banki.hu) + Hess, Andreas (jaska@gmx.net) + Niedermayer, Michael (michaelni@gmx.at) |