aboutsummaryrefslogtreecommitdiffhomepage
path: root/TOOLS
diff options
context:
space:
mode:
authorGravatar wm4 <wm4@nowhere>2016-12-17 13:24:05 +0100
committerGravatar wm4 <wm4@nowhere>2016-12-17 15:43:15 +0100
commitff9f5e06ff203c055d968087956026ef9204218b (patch)
tree883725191398913cc97bd441b7110bcf68237e28 /TOOLS
parent2b8b17402ed59815019b309051b277ba4de82f3b (diff)
Revert "Port several python scripts to Perl"
This reverts commit fae73079310eef9dce9737f2e37ff4b80c8830ee. Before the waf build system was used, we had a configure script written in shell. To drop the build dependency on Python, someone rewrote the Python scripts we had to Perl. Now the shell configure script is gone, and it makes no sense to have a build dependency on both Perl and Python. This isn't just a straight revert. It adds the new Matroska EBML elements to the old Python scripts, adjusts the waf build system, and of course doesn't add anything back needed by the old build system. It would be better if this used matroska.py/file2string.py directly by importing them as modules, instead of calling them via "python". But for now this is simpler.
Diffstat (limited to 'TOOLS')
-rwxr-xr-xTOOLS/file2string.pl24
-rwxr-xr-xTOOLS/file2string.py27
-rw-r--r--TOOLS/lib/Parse/Matroska.pm30
-rw-r--r--TOOLS/lib/Parse/Matroska/Definitions.pm384
-rw-r--r--TOOLS/lib/Parse/Matroska/Element.pm331
-rw-r--r--TOOLS/lib/Parse/Matroska/Reader.pm426
-rw-r--r--TOOLS/lib/Parse/Matroska/Utils.pm37
-rwxr-xr-xTOOLS/matroska.pl169
-rwxr-xr-xTOOLS/matroska.py463
9 files changed, 490 insertions, 1401 deletions
diff --git a/TOOLS/file2string.pl b/TOOLS/file2string.pl
deleted file mode 100755
index 341bb06fd6..0000000000
--- a/TOOLS/file2string.pl
+++ /dev/null
@@ -1,24 +0,0 @@
-#! /usr/bin/env perl
-
-use strict;
-use warnings;
-
-# Convert the contents of a file into a C string constant.
-# Note that the compiler will implicitly add an extra 0 byte at the end
-# of every string, so code using the string may need to remove that to get
-# the exact contents of the original file.
-# FIXME: why not a char array?
-
-# treat only alphanumeric and punctuations (excluding " and ?) as safe
-my $unsafe_chars = qr{[^][A-Za-z0-9!#%&'()*+,./:;<=>^_{|}~ -]};
-
-for my $file (@ARGV) {
- open my $fh, '<:raw', $file or next;
- print "/* Generated from $file */\n";
- while (<$fh>) {
- # replace unsafe chars with their equivalent octal escapes
- s/($unsafe_chars)/\\@{[sprintf '%03o', ord($1)]}/gos;
- print "\"$_\"\n"
- }
- close $fh;
-}
diff --git a/TOOLS/file2string.py b/TOOLS/file2string.py
new file mode 100755
index 0000000000..6cdd1a72ae
--- /dev/null
+++ b/TOOLS/file2string.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+
+# Convert the contents of a file into a C string constant.
+# Note that the compiler will implicitly add an extra 0 byte at the end
+# of every string, so code using the string may need to remove that to get
+# the exact contents of the original file.
+
+import sys
+
+# Indexing a byte string yields int on Python 3.x, and a str on Python 2.x
+def pord(c):
+ return ord(c) if type(c) == str else c
+
+def main(infile):
+ conv = ['\\' + ("%03o" % c) for c in range(256)]
+ safe_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" \
+ "0123456789!#%&'()*+,-./:;<=>?[]^_{|}~ "
+ for c in safe_chars:
+ conv[ord(c)] = c
+ for c, esc in ("\nn", "\tt", r"\\", '""'):
+ conv[ord(c)] = '\\' + esc
+ for line in infile:
+ sys.stdout.write('"' + ''.join(conv[pord(c)] for c in line) + '"\n')
+
+with open(sys.argv[1], 'rb') as infile:
+ sys.stdout.write("// Generated from %s\n\n" % sys.argv[1])
+ main(infile)
diff --git a/TOOLS/lib/Parse/Matroska.pm b/TOOLS/lib/Parse/Matroska.pm
deleted file mode 100644
index e1c08c9814..0000000000
--- a/TOOLS/lib/Parse/Matroska.pm
+++ /dev/null
@@ -1,30 +0,0 @@
-use 5.008;
-use strict;
-use warnings;
-
-# ABSTRACT: Module collection to parse Matroska files.
-package Parse::Matroska;
-
-=head1 DESCRIPTION
-
-C<use>s L<Parse::Matroska::Reader>. See the documentation
-of the modules mentioned in L</"SEE ALSO"> for more information
-in how to use this module.
-
-It's intended for this module to contain high-level interfaces
-to the other modules in the distribution.
-
-=head1 SOURCE CODE
-
-L<https://github.com/Kovensky/Parse-Matroska>
-
-=head1 SEE ALSO
-
-L<Parse::Matroska::Reader>, L<Parse::Matroska::Element>,
-L<Parse::Matroska::Definitions>.
-
-=cut
-
-use Parse::Matroska::Reader;
-
-1;
diff --git a/TOOLS/lib/Parse/Matroska/Definitions.pm b/TOOLS/lib/Parse/Matroska/Definitions.pm
deleted file mode 100644
index 5a5adcd6de..0000000000
--- a/TOOLS/lib/Parse/Matroska/Definitions.pm
+++ /dev/null
@@ -1,384 +0,0 @@
-use 5.008;
-use strict;
-use warnings;
-
-# ABSTRACT: internal EBML grammar definitions
-package Parse::Matroska::Definitions;
-
-use Parse::Matroska::Utils qw{uniq uncamelize};
-
-use Exporter;
-our @ISA = qw{Exporter};
-our @EXPORT_OK = qw{elem_by_hexid %EBML_DEFINITION %MATROSKA_DEFINITION};
-
-=head1 SYNOPSIS
-
- use Parse::Matroska::Definitions qw{elem_by_hexid};
- my $ebml_id = elem_by_hexid('1a45dfa3');
- print "EBML ID $ebml_id->{elid}'s name: $ebml_id->{name}";
-
-=head1 DESCRIPTION
-
-Contains the definition of the EBML grammar as expected in
-Matroska files. This module is meant mostly for internal use.
-
-As this was extended from a script in mpv-player, some data
-generated is apparently useless for regular module users
-but is still relevant to the mpv-player script. Such data
-is annotated as being for mpv compatibility.
-
-=head1 NOTE
-
-The API of this module is not yet considered stable.
-
-=head1 GLOBALS
-
-These global variables are considered B<immutable>.
-
-=head2 @Parse::Matroska::Definitions::global_elem_list
-
-A global list of known matroska elements. Useful for
-mpv's matroska script, used for generating C headers
-that parse matroska.
-
-=head2 %Parse::Matroska::Definitions::global_elem_dict
-
-A global hash of known matroska elements. Used internally
-by L</elem_by_hexid($id)>.
-
-=cut
-
-@Parse::Matroska::Definitions::global_elem_list = ();
-%Parse::Matroska::Definitions::global_elem_dict = ();
-
-=head2 %EBML_DEFINITION
-
-Optionally-importable hash of known EBML IDs belonging
-to the EBML generic grammar.
-
-=head2 %MATROSKA_DEFINITION
-
-Optionally-importable hash of known EBML IDs belonging
-to the Matroska-specific grammar.
-
-=cut
-
-our %EBML_DEFINITION = define_ebml();
-our %MATROSKA_DEFINITION = define_matroska();
-
-=method elem_by_hexid($id)
-
-Returns an EBML Element Definition corresponding to the provided
-hexadecimal string. Returns C<undef> if the element is unknown.
-
-=cut
-sub elem_by_hexid {
- my ($elid) = @_;
- return $Parse::Matroska::Definitions::global_elem_dict{$elid};
-}
-
-################################################
-### Helper functions for document definition ###
-################################################
-
-# used by elem when setting the 'valname' key
-use constant TYPE_MAP => {
- uint => 'uint64_t',
- str => 'char *',
- binary => 'struct bstr',
- ebml_id => 'uint32_t',
- float => 'double',
- sint => 'int64_t',
-};
-
-# this will be localized to "MATROSKA" or "EBML" on the elem declarations
-our $ELEM_DEFINE_TYPE = undef;
-
-=method elem($name,$elid,$valtype)
-
-NOTE: never call this function yourself; it changes data structures
-that are considered immutable outside of this package.
-
-Internal API function that generates the EBML Element Definitions.
-
-This API function returns an array which first element is C<$elid>
-and the second is a generated hash. The generated hash is stored
-in the @global_elem_list and %global_elem_dict.
-
-The generated hash contains:
-
-=for :list
-= name
-The EBML Element's name, given through C<$name>.
-= elid
-The EBML Element's hex id, given through C<$elid>. Used for lookups by L</elem_by_hexid($id)>.
-= valtype
-The EBML Element's type, given through C<$valtype>, except when C<$valtype> is an arrayref.
-= multiple
-If C<$name> ends with a C<*>, this is set as true and strips the C<*> from L</name>. Used to
-mark elements that may be repeated.
-= subelements
-An arrayref of elements that may be children of this element, given through C<$valtype> if it
-is an arrayref. Sets L</valtype> to C<sub> if there are subelements.
-= subids
-An arrayref listing all the L</elid>s of subelements, C<uniq>ified.
-
-The following elements are for mpv compatibility:
-
-=for :list
-= definename
-Name used for generating C #defines.
-= fieldname
-Name used for generating C struct fields.
-= structname
-Name used for generating C struct names.
-= ebmltype
-A pre-#defined constant to describe the element's type.
-= valname
-Typename used when declaring a struct field referring to this element.
-
-=cut
-sub elem {
- my %e = (name => shift, elid => shift, valtype => shift);
-
- # strip * from name, set 'multiple' if there was one
- $e{multiple} = scalar $e{name} =~ s/\*$//;
-
- # ELEM_DEFINE_TYPE is either MATROSKA or EBML
- $e{definename} = "${ELEM_DEFINE_TYPE}_ID_".uc($e{name});
- $e{fieldname} = uncamelize $e{name};
- $e{structname} = "ebml_$e{fieldname}";
-
- if (ref $e{valtype} eq 'HASH') {
- $e{subelements} = $e{valtype};
- $e{subids} = uniq map { $_->{elid} } values %{$e{subelements}};
- $e{valtype} = 'sub';
- $e{ebmltype} = 'EBML_TYPE_SUBELEMENTS';
- $e{valname} = "struct $e{structname}";
- } else {
- $e{ebmltype} = "EBML_TYPE_\U$e{valtype}";
- die "Unrecognized value type $e{valtype}" unless
- defined ($e{valname} = TYPE_MAP->{$e{valtype}});
- }
- my $e = \%e;
- push @Parse::Matroska::Definitions::global_elem_list, $e;
- $Parse::Matroska::Definitions::global_elem_dict{$e{elid}} = $e;
- return ($e{elid}, $e);
-}
-
-#############################################
-### EBML and Matroska document definitons ###
-#############################################
-
-=method define_ebml
-
-Internal function that defines the EBML generic grammar.
-
-Must not be called from outside the package.
-
-=cut
-sub define_ebml {
- local $ELEM_DEFINE_TYPE = 'EBML';
- return (
- elem('EBML', '1a45dfa3', {
- elem('EBMLVersion', '4286', 'uint'),
- elem('EBMLReadVersion', '42f7', 'uint'),
- elem('EBMLMaxIDLength', '42f2', 'uint'),
- elem('EBMLMaxSizeLength', '42f3', 'uint'),
- elem('DocType', '4282', 'str'),
- elem('DocTypeVersion', '4287', 'uint'),
- elem('DocTypeReadVersion', '4285', 'uint'),
- }),
-
- elem('CRC32', 'bf', 'binary'),
- elem('Void', 'ec', 'binary'),
- );
-}
-
-
-=method define_matroska
-
-Internal function that defines the Matroska-specific EBML grammar.
-
-Must not be called from outside the package.
-
-=cut
-sub define_matroska {
- local $ELEM_DEFINE_TYPE = 'MATROSKA';
- return (
- elem('Segment', '18538067', {
- elem('SeekHead*', '114d9b74', {
- elem('Seek*', '4dbb', {
- elem('SeekID', '53ab', 'ebml_id'),
- elem('SeekPosition', '53ac', 'uint'),
- }),
- }),
-
- elem('Info*', '1549a966', {
- elem('SegmentUID', '73a4', 'binary'),
- elem('PrevUID', '3cb923', 'binary'),
- elem('NextUID', '3eb923', 'binary'),
- elem('TimecodeScale', '2ad7b1', 'uint'),
- elem('DateUTC', '4461', 'sint'),
- elem('Title', '7ba9', 'str'),
- elem('MuxingApp', '4d80', 'str'),
- elem('WritingApp', '5741', 'str'),
- elem('Duration', '4489', 'float'),
- }),
-
- elem('Cluster*', '1f43b675', {
- elem('Timecode', 'e7', 'uint'),
- elem('BlockGroup*', 'a0', {
- elem('Block', 'a1', 'binary'),
- elem('BlockDuration', '9b', 'uint'),
- elem('ReferenceBlock*', 'fb', 'sint'),
- elem('DiscardPadding', '75A2', 'sint'),
- }),
- elem('SimpleBlock*', 'a3', 'binary'),
- }),
-
- elem('Tracks*', '1654ae6b', {
- elem('TrackEntry*', 'ae', {
- elem('TrackNumber', 'd7', 'uint'),
- elem('TrackUID', '73c5', 'uint'),
- elem('TrackType', '83', 'uint'),
- elem('FlagEnabled', 'b9', 'uint'),
- elem('FlagDefault', '88', 'uint'),
- elem('FlagForced', '55aa', 'uint'),
- elem('FlagLacing', '9c', 'uint'),
- elem('MinCache', '6de7', 'uint'),
- elem('MaxCache', '6df8', 'uint'),
- elem('DefaultDuration', '23e383', 'uint'),
- elem('TrackTimecodeScale', '23314f', 'float'),
- elem('MaxBlockAdditionID', '55ee', 'uint'),
- elem('Name', '536e', 'str'),
- elem('Language', '22b59c', 'str'),
- elem('CodecID', '86', 'str'),
- elem('CodecPrivate', '63a2', 'binary'),
- elem('CodecName', '258688', 'str'),
- elem('CodecDecodeAll', 'aa', 'uint'),
- elem('CodecDelay', '56AA', 'uint'),
- elem('SeekPreRoll', '56BB', 'uint'),
- elem('Video', 'e0', {
- elem('FlagInterlaced', '9a', 'uint'),
- elem('PixelWidth', 'b0', 'uint'),
- elem('PixelHeight', 'ba', 'uint'),
- elem('DisplayWidth', '54b0', 'uint'),
- elem('DisplayHeight', '54ba', 'uint'),
- elem('DisplayUnit', '54b2', 'uint'),
- elem('FrameRate', '2383e3', 'float'),
- elem('ColourSpace', '2eb524', 'binary'),
- elem('StereoMode', '53b8', 'uint'),
- elem('Colour', '55B0', {
- elem('MatrixCoefficients', '55B1', 'uint'),
- elem('BitsPerChannel', '55B2', 'uint'),
- elem('ChromaSubsamplingHorz', '55B3', 'uint'),
- elem('ChromaSubsamplingVert', '55B4', 'uint'),
- elem('CbSubsamplingHorz', '55B5', 'uint'),
- elem('CbSubsamplingVert', '55B6', 'uint'),
- elem('ChromaSitingHorz', '55B7', 'uint'),
- elem('ChromaSitingVert', '55B8', 'uint'),
- elem('Range', '55B9', 'uint'),
- elem('TransferCharacteristics', '55BA', 'uint'),
- elem('Primaries', '55BB', 'uint'),
- elem('MaxCLL', '55BC', 'uint'),
- elem('MaxFALL', '55BD', 'uint'),
- elem('MasteringMetadata', '55D0', {
- elem('PrimaryRChromaticityX', '55D1', 'float'),
- elem('PrimaryRChromaticityY', '55D2', 'float'),
- elem('PrimaryGChromaticityX', '55D3', 'float'),
- elem('PrimaryGChromaticityY', '55D4', 'float'),
- elem('PrimaryBChromaticityX', '55D5', 'float'),
- elem('PrimaryBChromaticityY', '55D6', 'float'),
- elem('WhitePointChromaticityX', '55D7', 'float'),
- elem('WhitePointChromaticityY', '55D8', 'float'),
- elem('LuminanceMax', '55D9', 'float'),
- elem('LuminanceMin', '55DA', 'float'),
- }),
- }),
- }),
- elem('Audio', 'e1', {
- elem('SamplingFrequency', 'b5', 'float'),
- elem('OutputSamplingFrequency', '78b5', 'float'),
- elem('Channels', '9f', 'uint'),
- elem('BitDepth', '6264', 'uint'),
- }),
- elem('ContentEncodings', '6d80', {
- elem('ContentEncoding*', '6240', {
- elem('ContentEncodingOrder', '5031', 'uint'),
- elem('ContentEncodingScope', '5032', 'uint'),
- elem('ContentEncodingType', '5033', 'uint'),
- elem('ContentCompression', '5034', {
- elem('ContentCompAlgo', '4254', 'uint'),
- elem('ContentCompSettings', '4255', 'binary'),
- }),
- }),
- }),
- }),
- }),
-
- elem('Cues', '1c53bb6b', {
- elem('CuePoint*', 'bb', {
- elem('CueTime', 'b3', 'uint'),
- elem('CueTrackPositions*', 'b7', {
- elem('CueTrack', 'f7', 'uint'),
- elem('CueClusterPosition', 'f1', 'uint'),
- elem('CueRelativePosition','f0', 'uint'),
- elem('CueDuration', 'b2', 'uint'),
- }),
- }),
- }),
-
- elem('Attachments', '1941a469', {
- elem('AttachedFile*', '61a7', {
- elem('FileDescription', '467e', 'str'),
- elem('FileName', '466e', 'str'),
- elem('FileMimeType', '4660', 'str'),
- elem('FileData', '465c', 'binary'),
- elem('FileUID', '46ae', 'uint'),
- }),
- }),
-
- elem('Chapters', '1043a770', {
- elem('EditionEntry*', '45b9', {
- elem('EditionUID', '45bc', 'uint'),
- elem('EditionFlagHidden', '45bd', 'uint'),
- elem('EditionFlagDefault', '45db', 'uint'),
- elem('EditionFlagOrdered', '45dd', 'uint'),
- elem('ChapterAtom*', 'b6', {
- elem('ChapterUID', '73c4', 'uint'),
- elem('ChapterTimeStart', '91', 'uint'),
- elem('ChapterTimeEnd', '92', 'uint'),
- elem('ChapterFlagHidden', '98', 'uint'),
- elem('ChapterFlagEnabled', '4598', 'uint'),
- elem('ChapterSegmentUID', '6e67', 'binary'),
- elem('ChapterSegmentEditionUID', '6ebc', 'uint'),
- elem('ChapterDisplay*', '80', {
- elem('ChapString', '85', 'str'),
- elem('ChapLanguage*', '437c', 'str'),
- elem('ChapCountry*', '437e', 'str'),
- }),
- }),
- }),
- }),
- elem('Tags*', '1254c367', {
- elem('Tag*', '7373', {
- elem('Targets', '63c0', {
- elem('TargetTypeValue', '68ca', 'uint'),
- elem('TargetTrackUID', '63c5', 'uint'),
- elem('TargetEditionUID', '63c9', 'uint'),
- elem('TargetChapterUID', '63c4', 'uint'),
- elem('TargetAttachmentUID', '63c6', 'uint'),
- }),
- elem('SimpleTag*', '67c8', {
- elem('TagName', '45a3', 'str'),
- elem('TagLanguage', '447a', 'str'),
- elem('TagString', '4487', 'str'),
- }),
- }),
- }),
- }),
- );
-}
-
-1;
diff --git a/TOOLS/lib/Parse/Matroska/Element.pm b/TOOLS/lib/Parse/Matroska/Element.pm
deleted file mode 100644
index fa0830c11e..0000000000
--- a/TOOLS/lib/Parse/Matroska/Element.pm
+++ /dev/null
@@ -1,331 +0,0 @@
-use 5.008;
-use strict;
-use warnings;
-
-# ABSTRACT: a mid-level representation of an EBML element
-package Parse::Matroska::Element;
-
-use Carp;
-use List::Util qw{first};
-
-=head1 SYNOPSIS
-
- use Parse::Matroska::Reader;
- my $reader = Parse::Matroska::Reader->new($path);
- my $elem = $reader->read_element;
-
- print "ID: $elem->{elid}\n";
- print "Name: $elem->{name}\n";
- print "Length: $elem->{content_len}\n";
- print "Type: $elem->{type}\n";
- print "Child count: ", scalar(@{$elem->all_children}), "\n";
- if ($elem->{type} eq 'sub') {
- while (my $chld = $elem->next_child) {
- print "Child Name: $chld->{name}\n";
- }
- } else {
- print "Value: ", $elem->get_value, "\n";
- }
-
-=head1 DESCRIPTION
-
-Represents a single Matroska element as decoded by
-L<Parse::Matroska::Reader>. This is essentially a hash
-augmented with functions for delay-loading of binary
-values and children elements.
-
-=head1 NOTE
-
-The API of this module is not yet considered stable.
-
-=attr elid
-
-The EBML Element ID, suitable for passing to
-L<Parse::Matroska::Definitions/elem_by_hexid>.
-
-=attr name
-
-The EBML Element's name.
-
-=attr type
-
-The EBML Element's type. Can be C<uint>, C<sint>,
-C<float>, C<ebml_id>, C<str> or C<binary>. See L</value>
-for details.
-
-Equivalent to
-C<elem_by_hexid($elem-E<gt>{value})-E<gt>{valtype}>.
-
-=attr value
-
-The EBML Element's value. Should be obtained through
-L</get_value>.
-
-Is an unicode string if the L</type> is C<str>, that is,
-the string has already been decoded by L<Encode/decode>.
-
-Is C<undef> if the L</type> is C<binary> and the contents
-were delay-loaded and not yet read. L</get_value> will
-do the delayed load if needed.
-
-Is an arrayref if the L</type> is C<sub>, containing
-the children nodes that were already loaded.
-
-Is a hashref if the L</type> is C<ebml_id>, containing
-the referred element's information as defined in
-L<Parse::Matroska::Definitions>. Calling
-C<elem_by_hexid($elem-E<gt>{value}-E<gt>{elid})> will
-return the same object as $elem->{value}.
-
-=attr full_len
-
-The entire length of this EBML Element, including
-the header's.
-
-=attr size_len
-
-The length of the size marker. Used when calculating
-L</full_len> from L</content_len>
-
-=attr content_len
-
-The length of the contents of this EBML Element,
-which excludes the header.
-
-=attr reader
-
-A weakened reference to the associated
-L<Parse::Matroska::Reader>.
-
-=method new(%hash)
-
-Creates a new Element initialized with the hash
-given as argument.
-
-=cut
-sub new {
- my $class = shift;
- my $self = {};
- bless $self, $class;
-
- $self->initialize(@_);
- return $self;
-}
-
-=method initialize(%hash)
-
-Called by L</new> on initialization.
-
-=cut
-sub initialize {
- my ($self, %args) = @_;
- for (keys %args) {
- $self->{$_} = $args{$_};
- }
- $self->{depth} = 0 unless $self->{depth};
-}
-
-=method skip
-
-Called by the user to ignore the contents of this EBML node.
-Needed when ignoring the children of a node.
-
-=cut
-sub skip {
- my ($self) = @_;
- my $reader = $self->{reader};
- return unless $reader; # we don't have to skip if there's no reader
- my $pos = $reader->getpos;
- croak "Too late to skip, reads were already done"
- if $pos ne $self->{data_pos};
- $reader->skip($self->{content_len});
-}
-
-=method get_value($keep_bin)
-
-Returns the value contained by this EBML element.
-
-If the element has children, returns an arrayref to
-the children elements that were already encountered.
-
-If the element's type is C<binary> and the value was
-delay-loaded, does the reading now.
-
-If $keep_bin is true, the delay-loaded data is kept
-as the L</value>, otherwise, further calls to
-C<get_value> will reread the data from the L</reader>.
-
-=cut
-sub get_value {
- my ($self, $keep_bin) = @_;
-
- return undef if $self->{type} eq 'skip';
- return $self->{value} if $self->{value};
-
- my $reader = $self->{reader} or
- croak "The associated Reader has been deleted";
-
- # delay-loaded 'binary'
- if ($self->{type} eq 'binary') {
- croak "Cannot seek in the current Reader" unless $self->{data_pos};
- # seek to the data position...
- $reader->setpos($self->{data_pos});
- # read the data, keeping it in value if requested
- if ($keep_bin) {
- $self->{value} = $reader->readlen($self->{content_len});
- return $self->{value};
- } else {
- return $reader->readlen($self->{content_len});
- }
- }
-}
-
-=method next_child($read_bin)
-
-Builtin iterator; reads and returns the next child element.
-Always returns undef if the type isn't C<sub>.
-
-Returns undef at the end of the iterator and resets itself to
-point to the first element; so calling L</next_child($read_bin)>
-after the iterator returned C<undef> will return the first child.
-
-The optional C<$read_bin> parameter has the children elements
-not delay-load their value if their type is C<binary>.
-
-If all children elements have already been read, return
-each element in-order as would be given by
-L</all_children($recurse,$read_bin)>.
-
-=cut
-sub next_child {
- my ($self, $read_bin) = @_;
- return unless $self->{type} eq 'sub';
-
- if ($self->{_all_children_read}) {
- my $idx = $self->{_last_child} ||= 0;
- if ($idx == @{$self->{value}}) {
- # reset the iterator, returning undef once
- $self->{_last_child} = 0;
- return;
- }
- my $ret = $self->{value}->[$idx];
-
- ++$idx;
- $self->{_last_child} = $idx;
- return $ret;
- }
-
- my $len = defined $self->{remaining_len}
- ? $self->{remaining_len}
- : $self->{content_len};
-
- if ($len == 0) {
- # we've read all children; switch into $self->{value} iteration mode
- $self->{_all_children_read} = 1;
- # return undef since the iterator will reset
- return;
- }
-
- $self->{pos_offset} ||= 0;
- my $pos = $self->{data_pos};
- my $reader = $self->{reader} or croak "The associated reader has been deleted";
- $reader->setpos($pos);
- $reader->{fh}->seek($self->{pos_offset}, 1) if $pos;
-
- my $chld = $reader->read_element($read_bin);
- return undef unless defined $chld;
- $self->{pos_offset} += $chld->{full_len};
-
- $self->{remaining_len} = $len - $chld->{full_len};
-
- if ($self->{remaining_len} < 0) {
- croak "Child elements consumed $self->{remaining_len} more bytes than parent $self->{name} contained";
- }
-
- $chld->{depth} = $self->{depth} + 1;
- $self->{value} ||= [];
-
- push @{$self->{value}}, $chld;
-
- return $chld;
-}
-
-=method all_children($recurse,$read_bin)
-
-Calls L</populate_children($recurse,$read_bin)> on self
-and returns an arrayref with the children nodes.
-
-Both C<$recurse> and C<$read_bin> are optional and default
-to false.
-
-=cut
-sub all_children {
- my ($self, $recurse, $read_bin) = @_;
- $self->populate_children($recurse, $read_bin);
- return $self->{value};
-}
-
-=method children_by_name($name)
-
-Searches in the already read children elements for all
-elements with the EBML name C<$name>. Returns an array
-containing all found elements. On scalar context,
-returns only the first element found.
-
-Croaks if the element's C<type> isn't C<sub>.
-
-=cut
-sub children_by_name {
- my ($self, $name) = @_;
- return unless defined wantarray; # don't do work if work isn't wanted
- croak "Element can't have children" unless $self->{type} eq 'sub';
-
- my @found = grep { $_->{name} eq $name } @{$self->{value}};
- return @found if wantarray; # list
- return shift @found if defined wantarray; # scalar
-}
-
-=method populate_children($recurse,$read_bin)
-
-Populates the internal array of children elements, that is,
-requests that the associated L<Matroska::Parser::Reader> reads
-all children elements. Returns itself.
-
-Returns false if the element's C<type> isn't C<sub>.
-
-If C<$recurse> is provided and is true, the method will call
-itself in the children elements with the same parameters it
-received; this will build a full EBML tree.
-
-If C<$read_bin> is provided and is true, disables delay-loading
-of the contents of C<binary>-type nodes, reading the contents
-to memory.
-
-If both C<$recurse> and C<$read_bin> are true, entire EBML trees
-can be loaded without requiring seeks, thus behaving correctly
-on unseekable streams. If C<$read_bin> is false, the entire EBML
-tree is still loaded, but calling L</get_value> on C<binary>-type
-nodes will produce an error on unseekable streams.
-
-=cut
-sub populate_children {
- my ($self, $recurse, $read_bin) = @_;
-
- return unless $self->{type} eq 'sub';
-
- if (@{$self->{value}} && $recurse) {
- # only recurse
- foreach (@{$self->{value}}) {
- $_->populate_children($recurse, $read_bin);
- }
- return $self;
- }
-
- while (my $chld = $self->next_child($read_bin)) {
- $chld->populate_children($recurse, $read_bin) if $recurse;
- }
-
- return $self;
-}
-
-1;
diff --git a/TOOLS/lib/Parse/Matroska/Reader.pm b/TOOLS/lib/Parse/Matroska/Reader.pm
deleted file mode 100644
index 614b7b12c0..0000000000
--- a/TOOLS/lib/Parse/Matroska/Reader.pm
+++ /dev/null
@@ -1,426 +0,0 @@
-use 5.008;
-use strict;
-use warnings;
-
-# ABSTRACT: a low-level reader for EBML files
-package Parse::Matroska::Reader;
-
-use Parse::Matroska::Definitions qw{elem_by_hexid};
-use Parse::Matroska::Element;
-
-use Carp;
-use Scalar::Util qw{openhandle weaken};
-use IO::Handle;
-use IO::File;
-use List::Util qw{first};
-use Encode;
-
-use constant BIGINT_TRY => 'Pari,GMP,FastCalc';
-use Math::BigInt try => BIGINT_TRY;
-use Math::BigRat try => BIGINT_TRY;
-
-=head1 SYNOPSIS
-
- use Parse::Matroska::Reader;
- my $reader = Parse::Matroska::Reader->new($path);
- $reader->close;
- $reader->open(\$string_with_matroska_data);
-
- my $elem = $reader->read_element;
- print "Element ID: $elem->{elid}\n";
- print "Element name: $elem->{name}\n";
- if ($elem->{type} ne 'sub') {
- print "Element value: $elem->get_value\n";
- } else {
- while (my $child = $elem->next_child) {
- print "Child element: $child->{name}\n";
- }
- }
- $reader->close;
-
-=head1 DESCRIPTION
-
-Reads EBML data, which is used in Matroska files.
-This is a low-level reader which is meant to be used as a backend
-for higher level readers. TODO: write the high level readers :)
-
-=head1 NOTE
-
-The API of this module is not yet considered stable.
-
-=method new
-
-Creates a new reader.
-Calls L</open($arg)> with its arguments if provided.
-
-=cut
-sub new {
- my $class = shift;
- my $self = {};
- bless $self, $class;
-
- $self->open(@_) if @_;
- return $self;
-}
-
-=method open($arg)
-
-Creates the internal filehandle. The argument can be:
-
-=for :list
-* An open filehandle or L<IO::Handle> object.
-The filehandle is not C<dup()>ed, so calling L</close> in this
-object will close the given filehandle as well.
-* A scalar containing a path to a file.
-* On perl v5.14 or newer, a scalarref pointing to EBML data.
-For similar functionality in older perls, give an L<IO::String> object
-or the handle to an already C<open>ed scalarref.
-
-=cut
-sub open {
- my ($self, $arg) = @_;
- $self->{fh} = openhandle($arg) || IO::File->new($arg, "<:raw")
- or croak "Can't open $arg: $!";
-}
-
-=method close
-
-Closes the internal filehandle.
-
-=cut
-sub close {
- my ($self) = @_;
- $self->{fh}->close;
- delete $self->{fh};
-}
-
-# equivalent to $self->readlen(1), possibly faster
-sub _getc {
- my ($self) = @_;
- my $c = $self->{fh}->getc;
- croak "Can't do read of length 1: $!" if !defined $c && $!;
- return $c;
-}
-
-=method readlen($length)
-
-Reads C<$length> bytes from the internal filehandle.
-
-=cut
-sub readlen {
- my ($self, $len) = @_;
- my $data;
- my $readlen = $self->{fh}->read($data, $len);
- croak "Can't do read of length $len: $!"
- unless defined $readlen;
- return $data;
-}
-
-# converts a byte string into an integer
-# we do so by converting the integer into a hex string (big-endian)
-# and then reading the hex-string into an integer
-sub _bin2int($) {
- my ($bin) = @_;
- # if the length is larger than 3
- # the resulting integer might be larger than INT_MAX
- if (length($bin) > 3) {
- return Math::BigInt->from_hex(unpack("H*", $bin));
- }
- return hex(unpack("H*", $bin));
-}
-
-# creates a floating-point number with the given mantissa and exponent
-sub _ldexp {
- my ($mantissa, $exponent) = @_;
- my $r = new Math::BigRat($mantissa);
- return $r * Math::BigRat->new(2)**$exponent;
-}
-
-# NOTE: the read_* functions are hard to read because they're ports
-# of even harder to read python functions.
-# TODO: make them readable
-
-=method read_id
-
-Reads an EBML ID atom in hexadecimal string format, suitable
-for passing to L<Parse::Matroska::Definitions/elem_by_hexid($id)>.
-
-=cut
-sub read_id {
- my ($self) = @_;
- my $t = $self->_getc;
- return undef unless defined $t;
- my $i = 0;
- my $mask = 1<<7;
-
- if (ord($t) == 0) {
- croak "Matroska Syntax error: first byte of ID was \\0"
- }
- until (ord($t) & $mask) {
- ++$i;
- $mask >>= 1;
- }
- # return hex string of the bytes we just read
- return unpack "H*", ($t . $self->readlen($i));
-}
-
-=method read_size
-
-Reads an EBML Data Size atom, which immediately follows
-an EBML ID atom.
-
-This returns an array consisting of:
-
-=for :list
-0. The length of the Data Size atom.
-1. The value encoded in the Data Size atom, which is the length of all the data following it.
-
-=cut
-sub read_size {
- my ($self) = @_;
- my $t = $self->_getc;
- my $i = 0;
- my $mask = 1<<7;
-
- if (ord($t) == 0) {
- croak "Matroska Syntax error: first byte of data size was \\0"
- }
- until (ord($t) & $mask) {
- ++$i;
- $mask >>= 1;
- }
- $t = $t & chr($mask-1); # strip length bits (keep only significant bits)
- return ($i+1, _bin2int $t . $self->readlen($i));
-}
-
-=method read_str($length)
-
-Reads a string of length C<$length> bytes from the internal filehandle.
-The string is already L<Encode/decode>d from C<UTF-8>, which is the
-standard Matroska string encoding.
-
-=cut
-{
- my $utf8 = find_encoding("UTF-8");
- sub read_str {
- my ($self, $length) = @_;
- return $utf8->decode($self->readlen($length));
- }
-}
-
-=method read_uint($length)
-
-Reads an unsigned integer of length C<$length> bytes
-from the internal filehandle.
-
-Returns a L<Math::BigInt> object if C<$length> is greater
-than 4.
-
-=cut
-sub read_uint {
- my ($self, $length) = @_;
- return _bin2int $self->readlen($length);
-}
-
-=method read_sint($length)
-
-Reads a signed integer of length C<$length> bytes
-from the internal filehandle.
-
-Returns a L<Math::BigInt> object if C<$length> is greater
-than 4.
-
-=cut
-sub read_sint {
- my ($self, $length) = @_;
- my $i = $self->read_uint($length);
-
- # Apply 2's complement to the unsigned int
- my $mask = int(2 ** ($length * 8 - 1));
- # if the most significant bit is set...
- if ($i & $mask) {
- # subtract the MSB twice
- $i -= 2 * $mask;
- }
- return $i;
-}
-
-=method read_float($length)
-
-Reads an IEEE floating point number of length C<$length>
-bytes from the internal filehandle.
-
-Only lengths C<4> and C<8> are supported (C C<float> and C<double>).
-
-=cut
-{
- my $b1 = new Math::BigInt 1;
-
- sub read_float {
- my ($self, $length) = @_;
- my $i = new Math::BigInt $self->read_uint($length)->bstr;
- my $f;
-
- # These evil expressions reinterpret an unsigned int as IEEE binary floats
- if ($length == 4) {
- $f = _ldexp(($i & ((1<<23) - 1)) + (1<<23), ($i>>23 & ((1<<8) - 1)) - 150);
- $f = -$f if $i & ($b1<<31);
- } elsif ($length == 8) {
- $f = _ldexp(($i & (($b1<<52) - 1)) + ($b1<<52), ($i>>52 & ((1<<12) - 1)) - 1075);
- $f = -$f if $i & ($b1<<63);
- } else {
- croak "Matroska Syntax error: unsupported IEEE float byte size $length";
- }
-
- return $f;
- }
-}
-
-=method read_ebml_id($length)
-
-Reads an EBML ID when it's encoded as the data inside another
-EBML element, that is, when the enclosing element's C<type> is
-C<ebml_id>.
-
-This returns a hashref with the EBML element description as
-defined in L<Parse::Matroska::Definitions>.
-
-=cut
-sub read_ebml_id {
- my ($self, $length) = @_;
- return elem_by_hexid(unpack("H*", $self->readlen($length)));
-}
-
-=method skip($length)
-
-Skips C<$length> bytes in the internal filehandle.
-
-=cut
-sub skip {
- my ($self, $len) = @_;
- return if $self->{fh}->can('seek') && $self->{fh}->seek($len, 1);
- $self->readlen($len);
- return;
-}
-
-=method getpos
-
-Wrapper for L<IO::Seekable/$io-E<gt>getpos> in the internal filehandle.
-
-Returns undef if the internal filehandle can't C<getpos>.
-
-=cut
-sub getpos {
- my ($self) = @_;
- return undef unless $self->{fh}->can('getpos');
- return $self->{fh}->getpos;
-}
-
-=method setpos($pos)
-
-Wrapper for L<IO::Seekable/$io-E<gt>setpos> in the internal filehandle.
-
-Returns C<undef> if the internal filehandle can't C<setpos>.
-
-Croaks if C<setpos> does not seek to the requested position,
-that is, if calling C<getpos> does not yield the same object
-as the C<$pos> argument.
-
-=cut
-sub setpos {
- my ($self, $pos) = @_;
- return undef unless $pos && $self->{fh}->can('setpos');
-
- my $ret = $self->{fh}->setpos($pos);
- croak "Cannot seek to correct position"
- unless $self->getpos eq $pos;
- return $ret;
-}
-
-=method read_element($read_bin)
-
-Reads a full EBML element from the internal filehandle.
-
-Returns a L<Parse::Matroska::Element> object initialized with
-the read data. If C<read_bin> is not present or is false, will
-delay-load the contents of C<binary> type elements, that is,
-they will only be loaded when calling C<get_value> on the
-returned L<Parse::Matroska::Element> object.
-
-Does not read the children of the element if its type is
-C<sub>. Look into the L<Parse::Matroska::Element> interface
-for details in how to read children elements.
-
-Pass a true C<$read_bin> if the stream being read is not
-seekable (C<getpos> is undef) and the contents of C<binary>
-elements is desired, otherwise seeking errors or internal
-filehandle corruption might occur.
-
-=cut
-sub read_element {
- my ($self, $read_bin) = @_;
- return undef if $self->{fh}->eof;
-
- my $elem_pos = $self->getpos;
-
- my $elid = $self->read_id;
- my $elem_def = elem_by_hexid($elid);
- my ($size_len, $content_len) = $self->read_size;
- my $full_len = length($elid)/2 + $size_len + $content_len;
-
- my $elem = Parse::Matroska::Element->new(
- elid => $elid,
- name => $elem_def && $elem_def->{name},
- type => $elem_def && $elem_def->{valtype},
- size_len => $size_len,
- content_len => $content_len,
- full_len => $full_len,
- reader => $self,
- elem_pos => $elem_pos,
- data_pos => $self->getpos,
- );
- weaken($elem->{reader});
-
- if (defined $elem_def) {
- if ($elem->{type} eq 'sub') {
- $elem->{value} = [];
- } elsif ($elem->{type} eq 'str') {
- $elem->{value} = $self->read_str($content_len);
- } elsif ($elem->{type} eq 'ebml_id') {
- $elem->{value} = $self->read_ebml_id($content_len);
- } elsif ($elem->{type} eq 'uint') {
- $elem->{value} = $self->read_uint($content_len);
- } elsif ($elem->{type} eq 'sint') {
- $elem->{value} = $self->read_sint($content_len);
- } elsif ($elem->{type} eq 'float') {
- $elem->{value} = $self->read_float($content_len);
- } elsif ($elem->{type} eq 'skip') {
- $self->skip($content_len);
- } elsif ($elem->{type} eq 'binary') {
- if ($read_bin) {
- $elem->{value} = $self->readlen($content_len);
- } else {
- $self->skip($content_len);
- }
- } else {
- die "Matroska Definition error: type $elem->{valtype} unknown"
- }
- } else {
- $self->skip($content_len);
- }
- return $elem;
-}
-
-1;
-
-=head1 CAVEATS
-
-Children elements have to be processed as soon as an element
-with children is found, or their children ignored with
-L<Parse::Matroska::Element/skip>. Not doing so doesn't cause
-errors but results in an invalid structure, with constant C<0>
-depth.
-
-To work correctly in unseekable streams, either the contents
-of C<binary>-type elements has to be ignored or the C<read_bin>
-flag to C<read_element> has to be true.
diff --git a/TOOLS/lib/Parse/Matroska/Utils.pm b/TOOLS/lib/Parse/Matroska/Utils.pm
deleted file mode 100644
index 127d626cb1..0000000000
--- a/TOOLS/lib/Parse/Matroska/Utils.pm
+++ /dev/null
@@ -1,37 +0,0 @@
-use strict;
-use warnings;
-
-# ABSTRACT: internally-used helper functions
-package Parse::Matroska::Utils;
-
-use Exporter;
-our @ISA = qw{Exporter};
-our @EXPORT_OK = qw{uniq uncamelize};
-
-=method uniq(@array)
-
-The same as L<List::MoreUtils/"uniq LIST">.
-Included to avoid depending on it since it's
-not a core module.
-
-=cut
-sub uniq(@) {
- my %seen;
- return grep { !$seen{$_}++ } @_;
-}
-
-=method uncamelize($string)
-
-Converts a "StringLikeTHIS" into a
-"string_like_this".
-
-=cut
-sub uncamelize($) {
- local $_ = shift;
- # lc followed by UC: lc_UC
- s/(?<=[a-z])([A-Z])/_\L$1/g;
- # UC followed by two lc: _UClclc
- s/([A-Z])(?=[a-z]{2})/_\L$1/g;
- # strip leading _ that the second regexp might add; lowercase all
- s/^_//; lc
-}
diff --git a/TOOLS/matroska.pl b/TOOLS/matroska.pl
deleted file mode 100755
index 41e4f6aa81..0000000000
--- a/TOOLS/matroska.pl
+++ /dev/null
@@ -1,169 +0,0 @@
-#! /usr/bin/env perl
-
-# Generate C definitions for parsing Matroska files.
-
-use strict;
-use warnings;
-
-use FindBin;
-use lib "$FindBin::Bin/lib";
-use Parse::Matroska::Definitions;
-use Parse::Matroska::Reader;
-
-use Getopt::Long;
-use List::Util qw{max};
-
-my @global_elem_list = @Parse::Matroska::Definitions::global_elem_list;
-
-Getopt::Long::Configure(qw{auto_version auto_help});
-my %opt;
-GetOptions(\%opt,
- "generate-header",
- "generate-definitions",
- "full",
- );
-
-if ($opt{"generate-header"}) {
- generate_c_header();
-} elsif ($opt{"generate-definitions"}) {
- generate_c_definitions();
-} else {
- for (@ARGV) {
- my $reader = Parse::Matroska::Reader->new($_ eq '-' ? \*STDIN : $_) or die $!;
- while (my $elem = $reader->read_element($_ eq '-')) {
- process_elem($elem, $_ eq '-');
- }
- }
-}
-
-# Generate declarations for libmpdemux/ebml_types.h
-sub generate_c_header {
- print "/* Generated by TOOLS/matroska.pl, do not edit manually */\n\n";
-
- # Write a #define for the ElementID of each known element
- for my $el (@global_elem_list) {
- printf "#define %-40s 0x%s\n", $el->{definename}, $el->{elid};
- }
- print "\n";
-
- # Define a struct for each ElementID that has child elements
- for my $el (@global_elem_list) {
- next unless $el->{subelements};
- print "\nstruct $el->{structname} {\n";
-
- # Figure out the length of the longest variable name
- # Used for pretty-printing in the next step
- my $l = max(map { length $_->{valname} } values %{$el->{subelements}});
-
- # Output each variable, with pointers for array (multiple) elements
- for my $subel (sort { $a->{definename} cmp $b->{definename} } values %{$el->{subelements}}) {
- printf " %-${l}s %s%s;\n",
- $subel->{valname}, $subel->{multiple}?'*':' ', $subel->{fieldname};
- }
- print "\n";
-
- # Output a counter variable for each element
- # (presence/absence for scalars, item count for arrays)
- for my $subel (sort values %{$el->{subelements}}) {
- print " int n_$subel->{fieldname};\n"
- }
- print "};\n";
- }
- print "\n";
-
- # Output extern references for ebml_elem_desc structs for each of the elements
- # These are defined by generate_c_definitions
- for my $el (@global_elem_list) {
- next unless $el->{subelements};
- print "extern const struct ebml_elem_desc $el->{structname}_desc;\n";
- }
- print "\n";
-
- # Output the max number of sub-elements a known element might have
- printf "#define MAX_EBML_SUBELEMENTS %d\n",
- max(map { scalar keys %{$_->{subelements}} }
- grep { $_->{subelements} } @global_elem_list);
-}
-
-# Generate definitions for libmpdemux/ebml_defs.c
-sub generate_c_definitions {
- print "/* Generated by TOOLS/matroska.pl, do not edit manually */\n\n";
- # ebml_defs.c uses macros declared in ebml.c
- for my $el (@global_elem_list) {
- print "\n";
- if ($el->{subelements}) {
- # set N for the next macros
- print "#define N $el->{fieldname}\n";
-
- # define a struct ebml_$N_desc and gets ready to define fields
- # this secretly opens two scopes; hence the }}; at the end
- print "E_S(\"$el->{name}\", ".scalar(keys %{$el->{subelements}}).")\n";
-
- # define a field for each subelement
- # also does lots of macro magic, but doesn't open a scope
- for my $subel (sort { $a->{definename} cmp $b->{definename} } values %{$el->{subelements}}) {
- print "F($subel->{definename}, $subel->{fieldname}, ".
- ($subel->{multiple}?'1':'0').")\n";
- }
- # close the struct
- print "}};\n";
-
- # unset N since we've used it
- print "#undef N\n";
- } else {
- print "E(\"$el->{name}\", $el->{fieldname}, $el->{ebmltype})\n";
- }
- }
-}
-
-sub repr {
- my @ret;
- foreach (@_) {
- if (/'/) {
- s/"/\\"/g;
- push @ret, "\"$_\"";
- } else {
- push @ret, "'$_'";
- }
- }
- return @ret if wantarray;
- return pop @ret if defined wantarray;
- return;
-}
-
-sub process_elem {
- my ($elem, $read_bin) = @_;
- unless ($opt{full}) {
- if ($elem->{name} eq 'Cluster' || $elem->{name} eq 'Cues') {
- $elem->skip;
- return;
- }
- }
- die unless $elem;
-
- if ($elem->{type} ne 'skip') {
- print "$elem->{depth} $elem->{elid} $elem->{name} size: $elem->{content_len} value: ";
- }
-
- if ($elem->{type} eq 'sub') {
- print "subelements:\n";
- while (my $chld = $elem->next_child($read_bin)) {
- process_elem($chld);
- }
- } elsif ($elem->{type} eq 'binary') {
- my $t = "<skipped $elem->{content_len} bytes>";
- if ($elem->{content_len} < 20) {
- $t = unpack "H*", $elem->get_value;
- }
- print "binary $t\n";
- delete $elem->{value};
- } elsif ($elem->{type} eq 'ebml_id') {
- print "binary $elem->{value}->{elid} (".($elem->{value}->{name}||"UNKNOWN").")\n";
- } elsif ($elem->{type} eq 'skip') {
- # skip
- } elsif ($elem->{type} eq 'str') {
- print "string ". repr($elem->get_value) . "\n";
- } else {
- print "$elem->{type} ". $elem->get_value ."\n";
- }
-}
diff --git a/TOOLS/matroska.py b/TOOLS/matroska.py
new file mode 100755
index 0000000000..91e65a26b3
--- /dev/null
+++ b/TOOLS/matroska.py
@@ -0,0 +1,463 @@
+#!/usr/bin/env python
+"""
+Generate C definitions for parsing Matroska files.
+Can also be used to directly parse Matroska files and display their contents.
+"""
+
+#
+# This file is part of MPlayer.
+#
+# MPlayer is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# MPlayer is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with MPlayer; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+# for compatibility with Python 2.x
+from __future__ import print_function
+
+elements_ebml = (
+ 'EBML, 1a45dfa3, sub', (
+ 'EBMLVersion, 4286, uint',
+ 'EBMLReadVersion, 42f7, uint',
+ 'EBMLMaxIDLength, 42f2, uint',
+ 'EBMLMaxSizeLength, 42f3, uint',
+ 'DocType, 4282, str',
+ 'DocTypeVersion, 4287, uint',
+ 'DocTypeReadVersion, 4285, uint',
+ ),
+
+ 'CRC32, bf, binary',
+ 'Void, ec, binary',
+)
+
+elements_matroska = (
+ 'Segment, 18538067, sub', (
+
+ 'SeekHead*, 114d9b74, sub', (
+ 'Seek*, 4dbb, sub', (
+ 'SeekID, 53ab, ebml_id',
+ 'SeekPosition, 53ac, uint',
+ ),
+ ),
+
+ 'Info*, 1549a966, sub', (
+ 'SegmentUID, 73a4, binary',
+ 'PrevUID, 3cb923, binary',
+ 'NextUID, 3eb923, binary',
+ 'TimecodeScale, 2ad7b1, uint',
+ 'DateUTC, 4461, sint',
+ 'Title, 7ba9, str',
+ 'MuxingApp, 4d80, str',
+ 'WritingApp, 5741, str',
+ 'Duration, 4489, float',
+ ),
+
+ 'Cluster*, 1f43b675, sub', (
+ 'Timecode, e7, uint',
+ 'BlockGroup*, a0, sub', (
+ 'Block, a1, binary',
+ 'BlockDuration, 9b, uint',
+ 'ReferenceBlock*, fb, sint',
+ 'DiscardPadding, 75A2, sint',
+ ),
+ 'SimpleBlock*, a3, binary',
+ ),
+
+ 'Tracks*, 1654ae6b, sub', (
+ 'TrackEntry*, ae, sub', (
+ 'TrackNumber, d7, uint',
+ 'TrackUID, 73c5, uint',
+ 'TrackType, 83, uint',
+ 'FlagEnabled, b9, uint',
+ 'FlagDefault, 88, uint',
+ 'FlagForced, 55aa, uint',
+ 'FlagLacing, 9c, uint',
+ 'MinCache, 6de7, uint',
+ 'MaxCache, 6df8, uint',
+ 'DefaultDuration, 23e383, uint',
+ 'TrackTimecodeScale, 23314f, float',
+ 'MaxBlockAdditionID, 55ee, uint',
+ 'Name, 536e, str',
+ 'Language, 22b59c, str',
+ 'CodecID, 86, str',
+ 'CodecPrivate, 63a2, binary',
+ 'CodecName, 258688, str',
+ 'CodecDecodeAll, aa, uint',
+ 'CodecDelay, 56aa, uint',
+ 'SeekPreRoll, 56bb, uint',
+ 'Video, e0, sub', (
+ 'FlagInterlaced, 9a, uint',
+ 'PixelWidth, b0, uint',
+ 'PixelHeight, ba, uint',
+ 'DisplayWidth, 54b0, uint',
+ 'DisplayHeight, 54ba, uint',
+ 'DisplayUnit, 54b2, uint',
+ 'FrameRate, 2383e3, float',
+ 'ColourSpace, 2eb524, binary',
+ 'StereoMode, 53b8, uint',
+ 'Colour, 55b0, sub', (
+ 'MatrixCoefficients, 55B1, uint',
+ 'BitsPerChannel, 55B2, uint',
+ 'ChromaSubsamplingHorz, 55B3, uint',
+ 'ChromaSubsamplingVert, 55B4, uint',
+ 'CbSubsamplingHorz, 55B5, uint',
+ 'CbSubsamplingVert, 55B6, uint',
+ 'ChromaSitingHorz, 55B7, uint',
+ 'ChromaSitingVert, 55B8, uint',
+ 'Range, 55B9, uint',
+ 'TransferCharacteristics, 55BA, uint',
+ 'Primaries, 55BB, uint',
+ 'MaxCLL, 55BC, uint',
+ 'MaxFALL, 55BD, uint',
+ 'MasteringMetadata, 55D0, sub', (
+ 'PrimaryRChromaticityX, 55D1, float',
+ 'PrimaryRChromaticityY, 55D2, float',
+ 'PrimaryGChromaticityX, 55D3, float',
+ 'PrimaryGChromaticityY, 55D4, float',
+ 'PrimaryBChromaticityX, 55D5, float',
+ 'PrimaryBChromaticityY, 55D6, float',
+ 'WhitePointChromaticityX, 55D7, float',
+ 'WhitePointChromaticityY, 55D8, float',
+ 'LuminanceMax, 55D9, float',
+ 'LuminanceMin, 55DA, float',
+ ),
+ ),
+ ),
+ 'Audio, e1, sub', (
+ 'SamplingFrequency, b5, float',
+ 'OutputSamplingFrequency, 78b5, float',
+ 'Channels, 9f, uint',
+ 'BitDepth, 6264, uint',
+ ),
+ 'ContentEncodings, 6d80, sub', (
+ 'ContentEncoding*, 6240, sub', (
+ 'ContentEncodingOrder, 5031, uint',
+ 'ContentEncodingScope, 5032, uint',
+ 'ContentEncodingType, 5033, uint',
+ 'ContentCompression, 5034, sub', (
+ 'ContentCompAlgo, 4254, uint',
+ 'ContentCompSettings, 4255, binary',
+ ),
+ ),
+ ),
+ ),
+ ),
+
+ 'Cues, 1c53bb6b, sub', (
+ 'CuePoint*, bb, sub', (
+ 'CueTime, b3, uint',
+ 'CueTrackPositions*, b7, sub', (
+ 'CueTrack, f7, uint',
+ 'CueClusterPosition, f1, uint',
+ 'CueRelativePosition, f0, uint',
+ 'CueDuration, b2, uint',
+ ),
+ ),
+ ),
+
+ 'Attachments, 1941a469, sub', (
+ 'AttachedFile*, 61a7, sub', (
+ 'FileDescription, 467e, str',
+ 'FileName, 466e, str',
+ 'FileMimeType, 4660, str',
+ 'FileData, 465c, binary',
+ 'FileUID, 46ae, uint',
+ ),
+ ),
+
+ 'Chapters, 1043a770, sub', (
+ 'EditionEntry*, 45b9, sub', (
+ 'EditionUID, 45bc, uint',
+ 'EditionFlagHidden, 45bd, uint',
+ 'EditionFlagDefault, 45db, uint',
+ 'EditionFlagOrdered, 45dd, uint',
+ 'ChapterAtom*, b6, sub', (
+ 'ChapterUID, 73c4, uint',
+ 'ChapterTimeStart, 91, uint',
+ 'ChapterTimeEnd, 92, uint',
+ 'ChapterFlagHidden, 98, uint',
+ 'ChapterFlagEnabled, 4598, uint',
+ 'ChapterSegmentUID, 6e67, binary',
+ 'ChapterSegmentEditionUID, 6ebc, uint',
+ 'ChapterDisplay*, 80, sub', (
+ 'ChapString, 85, str',
+ 'ChapLanguage*, 437c, str',
+ 'ChapCountry*, 437e, str',
+ ),
+ ),
+ ),
+ ),
+ 'Tags*, 1254c367, sub', (
+ 'Tag*, 7373, sub', (
+ 'Targets, 63c0, sub', (
+ 'TargetTypeValue, 68ca, uint',
+ 'TargetTrackUID, 63c5, uint',
+ 'TargetEditionUID, 63c9, uint',
+ 'TargetChapterUID, 63c4, uint',
+ 'TargetAttachmentUID, 63c6, uint',
+ ),
+ 'SimpleTag*, 67c8, sub', (
+ 'TagName, 45a3, str',
+ 'TagLanguage, 447a, str',
+ 'TagString, 4487, str'
+ ),
+ ),
+ ),
+ ),
+)
+
+
+import sys
+from math import ldexp
+from binascii import hexlify
+
+def byte2num(s):
+ return int(hexlify(s), 16)
+
+class EOF(Exception): pass
+
+def camelcase_to_words(name):
+ parts = []
+ start = 0
+ for i in range(1, len(name)):
+ if name[i].isupper() and (name[i-1].islower() or
+ name[i+1:i+2].islower()):
+ parts.append(name[start:i])
+ start = i
+ parts.append(name[start:])
+ return '_'.join(parts).lower()
+
+class MatroskaElement(object):
+
+ def __init__(self, name, elid, valtype, namespace):
+ self.name = name
+ self.definename = '{0}_ID_{1}'.format(namespace, name.upper())
+ self.fieldname = camelcase_to_words(name)
+ self.structname = 'ebml_' + self.fieldname
+ self.elid = elid
+ self.valtype = valtype
+ if valtype == 'sub':
+ self.ebmltype = 'EBML_TYPE_SUBELEMENTS'
+ self.valname = 'struct ' + self.structname
+ else:
+ self.ebmltype = 'EBML_TYPE_' + valtype.upper()
+ try:
+ self.valname = {'uint': 'uint64_t', 'str': 'char *',
+ 'binary': 'bstr', 'ebml_id': 'uint32_t',
+ 'float': 'double', 'sint': 'int64_t',
+ }[valtype]
+ except KeyError:
+ raise SyntaxError('Unrecognized value type ' + valtype)
+ self.subelements = ()
+
+ def add_subelements(self, subelements):
+ self.subelements = subelements
+ self.subids = set(x[0].elid for x in subelements)
+
+elementd = {}
+elementlist = []
+def parse_elems(l, namespace):
+ subelements = []
+ for el in l:
+ if isinstance(el, str):
+ name, hexid, eltype = [x.strip() for x in el.split(',')]
+ multiple = name.endswith('*')
+ name = name.strip('*')
+ new = MatroskaElement(name, hexid, eltype, namespace)
+ elementd[hexid] = new
+ elementlist.append(new)
+ subelements.append((new, multiple))
+ else:
+ new.add_subelements(parse_elems(el, namespace))
+ return subelements
+
+parse_elems(elements_ebml, 'EBML')
+parse_elems(elements_matroska, 'MATROSKA')
+
+def generate_C_header():
+ print('// Generated by TOOLS/matroska.py, do not edit manually')
+ print()
+
+ for el in elementlist:
+ print('#define {0.definename:40} 0x{0.elid}'.format(el))
+
+ print()
+
+ for el in reversed(elementlist):
+ if not el.subelements:
+ continue
+ print()
+ print('struct {0.structname} {{'.format(el))
+ l = max(len(subel.valname) for subel, multiple in el.subelements)+1
+ for subel, multiple in el.subelements:
+ print(' {e.valname:{l}} {star}{e.fieldname};'.format(
+ e=subel, l=l, star=' *'[multiple]))
+ print()
+ for subel, multiple in el.subelements:
+ print(' int n_{0.fieldname};'.format(subel))
+ print('};')
+
+ for el in elementlist:
+ if not el.subelements:
+ continue
+ print('extern const struct ebml_elem_desc {0.structname}_desc;'.format(
+ el))
+
+ print()
+ print('#define MAX_EBML_SUBELEMENTS', max(len(el.subelements)
+ for el in elementlist))
+
+
+
+def generate_C_definitions():
+ print('// Generated by TOOLS/matroska.py, do not edit manually')
+ print()
+ for el in reversed(elementlist):
+ print()
+ if el.subelements:
+ print('#define N', el.fieldname)
+ print('E_S("{0}", {1})'.format(el.name, len(el.subelements)))
+ for subel, multiple in el.subelements:
+ print('F({0.definename}, {0.fieldname}, {1})'.format(
+ subel, int(multiple)))
+ print('}};')
+ print('#undef N')
+ else:
+ print('E("{0.name}", {0.fieldname}, {0.ebmltype})'.format(el))
+
+def read(s, length):
+ t = s.read(length)
+ if len(t) != length:
+ raise EOF
+ return t
+
+def read_id(s):
+ t = read(s, 1)
+ i = 0
+ mask = 128
+ if ord(t) == 0:
+ raise SyntaxError
+ while not ord(t) & mask:
+ i += 1
+ mask >>= 1
+ t += read(s, i)
+ return t
+
+def read_vint(s):
+ t = read(s, 1)
+ i = 0
+ mask = 128
+ if ord(t) == 0:
+ raise SyntaxError
+ while not ord(t) & mask:
+ i += 1
+ mask >>= 1
+ t = bytes((ord(t) & (mask - 1),))
+ t += read(s, i)
+ return i+1, byte2num(t)
+
+def read_str(s, length):
+ return read(s, length)
+
+def read_uint(s, length):
+ t = read(s, length)
+ return byte2num(t)
+
+def read_sint(s, length):
+ i = read_uint(s, length)
+ mask = 1 << (length * 8 - 1)
+ if i & mask:
+ i -= 2 * mask
+ return i
+
+def read_float(s, length):
+ t = read(s, length)
+ i = byte2num(t)
+ if length == 4:
+ f = ldexp((i & 0x7fffff) + (1 << 23), (i >> 23 & 0xff) - 150)
+ if i & (1 << 31):
+ f = -f
+ elif length == 8:
+ f = ldexp((i & ((1 << 52) - 1)) + (1 << 52), (i >> 52 & 0x7ff) - 1075)
+ if i & (1 << 63):
+ f = -f
+ else:
+ raise SyntaxError
+ return f
+
+def parse_one(s, depth, parent, maxlen):
+ elid = hexlify(read_id(s)).decode('ascii')
+ elem = elementd.get(elid)
+ if parent is not None and elid not in parent.subids and elid not in ('ec', 'bf'):
+ print('Unexpected:', elid)
+ if 1:
+ raise NotImplementedError
+ size, length = read_vint(s)
+ this_length = len(elid) / 2 + size + length
+ if elem is not None:
+ if elem.valtype != 'skip':
+ print(depth, elid, elem.name, 'size:', length, 'value:', end=' ')
+ if elem.valtype == 'sub':
+ print('subelements:')
+ while length > 0:
+ length -= parse_one(s, depth + 1, elem, length)
+ if length < 0:
+ raise SyntaxError
+ elif elem.valtype == 'str':
+ print('string', repr(read_str(s, length).decode('utf8', 'replace')))
+ elif elem.valtype in ('binary', 'ebml_id'):
+ t = read_str(s, length)
+ dec = ''
+ if elem.valtype == 'ebml_id':
+ idelem = elementd.get(hexlify(t).decode('ascii'))
+ if idelem is None:
+ dec = '(UNKNOWN)'
+ else:
+ dec = '({0.name})'.format(idelem)
+ if len(t) < 20:
+ t = hexlify(t).decode('ascii')
+ else:
+ t = '<skipped {0} bytes>'.format(len(t))
+ print('binary', t, dec)
+ elif elem.valtype == 'uint':
+ print('uint', read_uint(s, length))
+ elif elem.valtype == 'sint':
+ print('sint', read_sint(s, length))
+ elif elem.valtype == 'float':
+ print('float', read_float(s, length))
+ elif elem.valtype == 'skip':
+ read(s, length)
+ else:
+ raise NotImplementedError
+ else:
+ print(depth, 'Unknown element:', elid, 'size:', length)
+ read(s, length)
+ return this_length
+
+def parse_toplevel(s):
+ parse_one(s, 0, None, 1 << 63)
+
+if sys.argv[1] == '--generate-header':
+ generate_C_header()
+elif sys.argv[1] == '--generate-definitions':
+ generate_C_definitions()
+else:
+ s = open(sys.argv[1], "rb")
+ while 1:
+ start = s.tell()
+ try:
+ parse_toplevel(s)
+ except EOF:
+ if s.tell() != start:
+ raise Exception("Unexpected end of file")
+ break