shithub: opus

--- a/libentcode/Makefile.am

+++ b/libentcode/Makefile.am

@@ -2,7 +2,7 @@

 METASOURCES = AUTO

 lib_LTLIBRARIES = libentcode.la

 libentcode_la_SOURCES = bitrdec.c bitree.c bitrenc.c ecintrin.h entcode.c \

-	entdec.c entenc.c laplace.c mfrngdec.c mfrngenc.c probdec.c probenc.c probmod.c

+	entdec.c entenc.c laplace.c rangedec.c rangeenc.c probdec.c probenc.c probmod.c

 bin_PROGRAMS = ectest

 ectest_SOURCES = ectest.c

 ectest_LDADD = $(top_builddir)/libentcode/libentcode.la

--- /dev/null

+++ b/libentcode/rangedec.c

@@ -1,0 +1,243 @@

+#include <stddef.h>

+#include "entdec.h"

+#include "mfrngcod.h"

+/*A multiply-free range decoder.

+  This is an entropy decoder based upon \cite{Mar79}, which is itself a

+   rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.

+  It is very similar to arithmetic encoding, except that encoding is done with

+   digits in any base, instead of with bits, and so it is faster when using

+   larger bases (i.e.: a byte).

+  The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$

+   is the base, longer than the theoretical optimum, but to my knowledge there

+   is no published justification for this claim.

+  This only seems true when using near-infinite precision arithmetic so that

+   the process is carried out with no rounding errors.

+  IBM (the author's employer) never sought to patent the idea, and to my

+   knowledge the algorithm is unencumbered by any patents, though its

+   performance is very competitive with proprietary arithmetic coding.

+  The two are based on very similar ideas, however.

+  An excellent description of implementation details is available at

+   http://www.arturocampos.com/ac_range.html

+  A recent work \cite{MNW98} which proposes several changes to arithmetic

+   encoding for efficiency actually re-discovers many of the principles

+   behind range encoding, and presents a good theoretical analysis of them.

+  The coder is made multiply-free by replacing the standard multiply/divide

+   used to partition the current interval according to the total frequency

+   count.

+  The new partition function scales the count so that it differs from the size

+   of the interval by no more than a factor of two and then assigns each symbol

+   one or two code words in the interval.

+  For details see \cite{SM98}.

+  This coder also handles the end of the stream in a slightly more graceful

+   fashion than most arithmetic or range coders.

+  Once the final symbol has been encoded, the coder selects the code word with

+   the fewest bits that still falls within the final interval.

+  This method is not novel.

+  Here, by the length of the code word, we refer to the number of bits until

+   its final 1.

+  Any trailing zeros may be discarded, since the decoder, once it runs out of

+   input, will pad its buffer with zeros.

+  But this means that no encoded stream would ever have any zero bytes at the

+   end.

+  Since there are some coded representations we cannot produce, it implies that

+   there is still some redundancy in the stream.

+  In this case, we can pick a special byte value, RSV1, and should the stream

+   end in a sequence of zeros, followed by the RSV1 byte, we can code the

+   zeros, and discard the RSV1 byte.

+  The decoder, knowing that the encoder would never produce a sequence of zeros

+   at the end, would then know to add in the RSV1 byte if it observed it.

+  Now, the encoder would never produce a stream that ended in a sequence of

+   zeros followed by a RSV1 byte.

+  So, if the stream ends in a non-empty sequence of zeros, followed by any

+   positive number of RSV1 bytes, the last RSV1 byte is discarded.

+  The decoder, if it encounters a stream that ends in a non-empty sequence of

+   zeros followed by any non-negative number of RSV1 bytes, adds an additional

+   RSV1 byte to the stream.

+  With this strategy, every possible sequence of input bytes is transformed to

+   one that could actually be produced by the encoder.

+  The only question is what non-zero value to use for RSV1.

+  We select 0x80, since it has the nice property of producing the shortest

+   possible byte streams when using our strategy for selecting a number within

+   the final interval to encode.

+  Clearly if the shortest possible code word that falls within the interval has

+   its last one bit as the most significant bit of the final byte, and the

+   previous bytes were a non-empty sequence of zeros followed by a non-negative

+   number of 0x80 bytes, then the last byte would be discarded.

+  If the shortest code word is not so formed, then no other code word in the

+   interval would result in any more bytes being discarded.

+  Any longer code word would have an additional one bit somewhere, and so would

+   require at a minimum that that byte would be coded.

+  If the shortest code word has a 1 before the final one that is preventing the

+   stream from ending in a non-empty sequence of zeros followed by a

+   non-negative number of 0x80's, then there is no code word of the same length

+   which contains that bit as a zero.

+  If there were, then we could simply leave that bit a 1, and drop all the bits

+   after it without leaving the interval, thus producing a shorter code word.

+  In this case, RSV1 can only drop 1 bit off the final stream.

+  Other choices could lead to savings of up to 8 bits for particular streams,

+   but this would produce the odd situation that a stream with more non-zero

+   bits is actually encoded in fewer bytes.

+  @PHDTHESIS{Pas76,

+    author="Richard Clark Pasco",

+    title="Source coding algorithms for fast data compression",

+    school="Dept. of Electrical Engineering, Stanford University",

+    address="Stanford, CA",

+    month=May,

+    year=1976

+  }

+  @INPROCEEDINGS{Mar79,

+   author="Martin, G.N.N.",

+   title="Range encoding: an algorithm for removing redundancy from a digitised

+    message",

+   booktitle="Video \& Data Recording Conference",

+   year=1979,

+   address="Southampton",

+   month=Jul

+  }

+  @ARTICLE{MNW98,

+   author="Alistair Moffat and Radford Neal and Ian H. Witten",

+   title="Arithmetic Coding Revisited",

+   journal="{ACM} Transactions on Information Systems",

+   year=1998,

+   volume=16,

+   number=3,

+   pages="256--294",

+   month=Jul,

+   URL="http://dev.acm.org/pubs/citations/journals/tois/1998-16-3/p256-moffat/"

+  }

+  @INPROCEEDINGS{SM98,

+   author="Lang Stuiver and Alistair Moffat",

+   title="Piecewise Integer Mapping for Arithmetic Coding",

+   booktitle="Proceedings of the {IEEE} Data Compression Conference",

+   pages="1--10",

+   address="Snowbird, UT",

+   month="Mar./Apr.",

+   year=1998

+  }*/

+/*Gets the next byte of input.

+  While the byte buffer still has data, the byte read from it is returned

+   unchanged.

+  Once ec_byte_read1() reports a negative value, the end-of-stream rule is

+   applied: if the buffered data ends in a zero followed by zero or more

+   EC_FOF_RSV1 bytes, one extra EC_FOF_RSV1 byte is returned.

+  After all the bytes have been consumed, and the extra end code returned if

+   needed, this function returns zero each time it is called.

+  Return: The next byte of input.*/

+static int ec_dec_in(ec_dec *_this){

+  unsigned char *start;

+  unsigned char *last;

+  long           len;

+  int            byte;

+  byte=ec_byte_read1(_this->buf);

+  if(byte>=0)return byte;

+  len=ec_byte_bytes(_this->buf);

+  start=ec_byte_get_buffer(_this->buf);

+  /*Breaking abstraction: don't do this at home, kids.

+    The buffer's storage field is compared directly against the byte count,

+     and the buffer is advanced one step when they match (presumably so that

+     the match, and with it the extra byte, can happen only once; confirm

+     against ec_byte_adv1()).*/

+  if(_this->buf->storage!=len)return 0;

+  ec_byte_adv1(_this->buf);

+  if(len<=0)return 0;

+  /*Step back over any run of trailing EC_FOF_RSV1 bytes, stopping at the

+     first byte of the buffer at the latest.*/

+  last=start+len-1;

+  while(last>start&&last[0]==EC_FOF_RSV1)last--;

+  /*A zero in front of that run calls for one extra EC_FOF_RSV1 byte.*/

+  if(!last[0])return EC_FOF_RSV1;

+  return 0;

+}

+/*Normalizes the contents of dif and rng so that rng is contained in the

+   high-order symbol.

+  While rng is no larger than EC_CODE_BOT, each pass shifts rng left by

+   EC_SYM_BITS, pulls one more byte from ec_dec_in() into rem, and folds

+   EC_SYM_BITS new bits into dif.

+  The bits folded in are split across two input bytes: the low bits left over

+   from the previous value of rem and the high bits of the byte just read.*/

+static void ec_dec_normalize(ec_dec *_this){

+  /*If the range is too small, rescale it and input some bits.*/

+  while(_this->rng<=EC_CODE_BOT){

+    int sym;

+    _this->rng<<=EC_SYM_BITS;

+    /*Use up the remaining bits from our last symbol.

+      The shift is applied before the mask: (rem<<EC_CODE_EXTRA)&EC_SYM_MAX.*/

+    sym=_this->rem<<EC_CODE_EXTRA&EC_SYM_MAX;

+    /*Read the next value from the input.*/

+    _this->rem=ec_dec_in(_this);

+    /*Take the rest of the bits we need from this new symbol.

+      The shift count is the whole difference: rem>>(EC_SYM_BITS-EC_CODE_EXTRA).*/

+    sym|=_this->rem>>EC_SYM_BITS-EC_CODE_EXTRA;

+    _this->dif=(_this->dif<<EC_SYM_BITS)-sym&EC_CODE_MASK;

+    /*The statement above subtracts first and masks second:

+       ((dif<<EC_SYM_BITS)-sym)&EC_CODE_MASK.

+      dif can never be larger than EC_CODE_TOP.

+      This is equivalent to the slightly more readable:

+      if(_this->dif>EC_CODE_TOP)_this->dif-=EC_CODE_TOP;

+      dif&(dif-1) clears the lowest set bit of dif, so the EC_CODE_TOP bit

+       survives the AND only when some lower bit is set as well, and the XOR

+       then clears it (this assumes EC_CODE_TOP is a single-bit constant;

+       confirm in mfrngcod.h).*/

+    _this->dif^=(_this->dif&_this->dif-1)&EC_CODE_TOP;

+  }

+}

+/*Prepares a decoder to read symbols from a byte buffer.

+  _buf: The byte buffer holding the encoded data; the decoder keeps the

+         pointer and reads from it on every later call.*/

+void ec_dec_init(ec_dec *_this,ec_byte_buffer *_buf){

+  /*The buffer has to be attached before any input can be pulled from it.*/

+  _this->buf=_buf;

+  /*Begin with a range of 1<<EC_CODE_EXTRA, which covers only the bits that the

+     first byte contributes to dif below.*/

+  _this->rng=1U<<EC_CODE_EXTRA;

+  /*Fetch the first byte and seed dif from its top bits; the remaining bits

+     stay in rem for the first normalization pass.*/

+  _this->rem=ec_dec_in(_this);

+  _this->dif=_this->rng-(_this->rem>>EC_SYM_BITS-EC_CODE_EXTRA);

+  /*Read further input until rng exceeds EC_CODE_BOT.*/

+  ec_dec_normalize(_this);

+}

+/*Computes the cumulative frequency value that identifies the next symbol.

+  The scale factor rng/_ft is stored in nrm, where ec_dec_update() reads it

+   back.

+  NOTE(review): the header of this file describes a multiply-free coder, but

+   this function divides by _ft and by nrm; the header looks stale.

+  _ft: The total frequency of the symbols in the alphabet.

+       It is used as a divisor, so it must be non-zero, and it must not exceed

+        rng, or nrm becomes zero and the second division faults.

+  Return: _ft minus EC_MINI((dif-1)/nrm+1,_ft) (EC_MINI is presumably a

+           minimum macro; confirm in the headers).*/

+unsigned ec_decode(ec_dec *_this,unsigned _ft){

+  unsigned steps;

+  _this->nrm=_this->rng/_ft;

+  /*Number of whole nrm-sized steps that dif-1 spans, plus one.*/

+  steps=(unsigned)((_this->dif-1)/_this->nrm)+1;

+  return _ft-EC_MINI(steps,_ft);

+}

+/*Advances the decoder past a symbol whose cumulative frequencies are

+   [_fl,_fh) out of a total of _ft.

+  The scale factor is read from nrm, which is expected to have been set by a

+   preceding ec_decode() call.

+  _fl: The cumulative frequency of all symbols before this one.

+  _fh: The cumulative frequency up to and including this one.

+  _ft: The total frequency of all symbols.*/

+void ec_dec_update(ec_dec *_this,unsigned _fl,unsigned _fh,unsigned _ft){

+  ec_uint32 above;

+  /*Size of the part of the range lying above this symbol.*/

+  above=_this->nrm*(_ft-_fh);

+  _this->dif-=above;

+  if(_fl>0){

+    _this->rng=_this->nrm*(_fh-_fl);

+  }

+  else{

+    /*The first symbol also absorbs whatever rng/_ft left over when it rounded

+       down, so its range is everything that is not above it.*/

+    _this->rng-=above;

+  }

+  ec_dec_normalize(_this);

+}

+#if 0 /*Compiled out: an unfinished check that the input ended the way the

+  encoder would have ended it.

+  NOTE(review): this could not be enabled as written; the assignment to low

+   has no terminating semicolon, its own TODO says the value is wrong, and

+   ret is declared but never used.*/

+int ec_dec_done(ec_dec *_this){

+  unsigned low;

+  int      ret;

+  /*Check to make sure we've used all the input bytes.

+    This ensures that no more ones would ever be inserted into the decoder.

+    Returns 0 while the read position is still within the buffered bytes.*/

+  if(_this->buf->ptr-ec_byte_get_buffer(_this->buf)<=

+   ec_byte_bytes(_this->buf)){

+    return 0;

+  }

+  /*We compute the smallest finitely odd fraction that fits inside the current

+     range, which is the value the encoder would have written to the stream.

+    This is guaranteed to yield the smallest possible encoding.*/

+  /*TODO: Fix this line, as it is wrong.

+    It doesn't seem worth being able to make this check to do an extra

+     subtraction for every symbol decoded.*/

+  low=/*What we want: _this->top-_this->rng; What we have:*/_this->dif

+  if(low){

+    unsigned end;

+    end=EC_CODE_TOP;

+    /*Ensure that the next free end is in the range.

+      Each pass halves msk and rounds low up to the next value whose bits

+       under msk are all zero, until that value lies within rng of low.*/

+    if(end-low>=_this->rng){

+      unsigned msk;

+      msk=EC_CODE_TOP-1;

+      do{

+        msk>>=1;

+        end=(low+msk)&~msk|msk+1;

+      }

+      while(end-low>=_this->rng);

+    }

+    /*The remaining input should have been the next free end.*/

+    return end-low!=_this->dif;

+  }

+  return 1;

+}

+#endif

--- /dev/null

+++ b/libentcode/rangeenc.c

@@ -1,0 +1,145 @@

+#include <stddef.h>

+#include "entenc.h"

+#include "mfrngcod.h"

+/*A multiply-free range encoder.

+  See rangedec.c and the references for implementation details

+   \cite{Mar79,MNW98,SM98}.

+  @INPROCEEDINGS{Mar79,

+   author="Martin, G.N.N.",

+   title="Range encoding: an algorithm for removing redundancy from a digitised

+    message",

+   booktitle="Video \& Data Recording Conference",

+   year=1979,

+   address="Southampton",

+   month=Jul

+  }

+  @ARTICLE{MNW98,

+   author="Alistair Moffat and Radford Neal and Ian H. Witten",

+   title="Arithmetic Coding Revisited",

+   journal="{ACM} Transactions on Information Systems",

+   year=1998,

+   volume=16,

+   number=3,

+   pages="256--294",

+   month=Jul,

+   URL="http://dev.acm.org/pubs/citations/journals/tois/1998-16-3/p256-moffat/"

+  }

+  @INPROCEEDINGS{SM98,

+   author="Lang Stuiver and Alistair Moffat",

+   title="Piecewise Integer Mapping for Arithmetic Coding",

+   booktitle="Proceedings of the {IEEE} Data Compression Conference",

+   pages="1--10",

+   address="Snowbird, UT",

+   month="Mar./Apr.",

+   year=1998

+  }*/

+/*Outputs a symbol, with a carry bit.

+  If there is a potential to propagate a carry over several symbols, they are

+   buffered until it can be determined whether or not an actual carry will

+   occur.

+  A symbol equal to EC_SYM_MAX is only counted in ext; any other symbol

+   flushes the pending byte rem (plus the carry), then ext copies of

+   (EC_SYM_MAX+carry)&EC_SYM_MAX, and becomes the new pending byte.

+  If the counter for the buffered symbols overflows, then the range is

+   truncated to force a carry to occur, towards whichever side maximizes the

+   remaining range.

+  NOTE(review): no overflow handling for ext is visible in this function; the

+   sentence above may describe an earlier or intended behavior.

+  _c: The symbol to output; its bits above EC_SYM_BITS form the carry.*/

+static void ec_enc_carry_out(ec_enc *_this,int _c){

+  if(_c!=EC_SYM_MAX){

+    /*No further carry propagation possible, flush buffer.*/

+    int carry;

+    carry=_c>>EC_SYM_BITS;

+    /*Don't output a byte on the first write.

+      This compare should be taken care of by branch-prediction thereafter.

+      rem is negative until the first symbol has been buffered.*/

+    if(_this->rem>=0)ec_byte_write1(_this->buf,_this->rem+carry);

+    if(_this->ext>0){

+      unsigned sym;

+      sym=EC_SYM_MAX+carry&EC_SYM_MAX;

+      do ec_byte_write1(_this->buf,sym);

+      while(--(_this->ext)>0);

+    }

+    _this->rem=_c&EC_SYM_MAX;

+  }

+  else _this->ext++;

+}

+static void ec_enc_normalize(ec_enc *_this){

+  /*Renormalizes the encoder state after rng has shrunk.

+    If the range is too small, output some bits and rescale it.

+    Each pass hands low>>EC_CODE_SHIFT to ec_enc_carry_out() and then shifts

+     both low and rng left by EC_SYM_BITS, until rng exceeds EC_CODE_BOT.*/

+  while(_this->rng<=EC_CODE_BOT){

+    ec_enc_carry_out(_this,(int)(_this->low>>EC_CODE_SHIFT));

+    /*Move the next-to-high-order symbol into the high-order position.

+      The shift is applied before the mask: (low<<EC_SYM_BITS)&(EC_CODE_TOP-1),

+       which drops the bits that were just passed to ec_enc_carry_out().*/

+    _this->low=_this->low<<EC_SYM_BITS&EC_CODE_TOP-1;

+    _this->rng<<=EC_SYM_BITS;

+  }

+}

+/*Prepares an encoder to write symbols into a byte buffer.

+  _buf: The byte buffer that receives the encoded output; the encoder keeps

+         the pointer and writes to it on later calls.*/

+void ec_enc_init(ec_enc *_this,ec_byte_buffer *_buf){

+  /*The interval starts at zero and spans the whole code space.*/

+  _this->low=0;

+  _this->rng=EC_CODE_TOP;

+  /*Nothing is pending yet: a negative rem tells ec_enc_carry_out() that there

+     is no buffered byte to write, and ext counts buffered EC_SYM_MAX symbols.*/

+  _this->rem=-1;

+  _this->ext=0;

+  _this->buf=_buf;

+}

+/*Encodes a symbol whose cumulative frequencies are [_fl,_fh) out of a total

+   of _ft.

+  _fl: The cumulative frequency of all symbols before this one.

+  _fh: The cumulative frequency up to and including this one.

+  _ft: The total frequency of all symbols.

+       It is used as a divisor, so it must be non-zero.*/

+void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft){

+  unsigned scale;

+  scale=_this->rng/_ft;

+  if(_fl==0){

+    /*The first symbol keeps low where it is and gives up only the part of the

+       range that lies above it.*/

+    _this->rng-=scale*(_ft-_fh);

+  }

+  else{

+    unsigned from_top;

+    /*Distance from the start of this symbol to the top of the range.*/

+    from_top=scale*(_ft-_fl);

+    _this->low+=_this->rng-from_top;

+    _this->rng=scale*(_fh-_fl);

+  }

+  ec_enc_normalize(_this);

+}

+/*Finishes the stream: writes the final code word, flushes the pending byte,

+   and drops the trailing bytes that the decoder is able to reconstruct.

+  The trailing bytes are trimmed by length index rather than by walking a

+   pointer backwards, so no pointer before the start of the buffer is ever

+   formed, even when every output byte is zero.*/

+void ec_enc_done(ec_enc *_this){

+  /*We compute the integer in the current interval that has the largest number

+     of trailing zeros, and write that to the stream.

+    This is guaranteed to yield the smallest possible encoding.*/

+  if(_this->low){

+    unsigned end;

+    end=EC_CODE_TOP;

+    /*Ensure that the end value is in the range.

+      Each pass halves msk and rounds low up to the next value whose bits under

+       msk are all zero, until that value lies within rng of low.*/

+    if(end-_this->low>=_this->rng){

+      unsigned msk;

+      msk=EC_CODE_TOP-1;

+      do{

+        msk>>=1;

+        end=(_this->low+msk)&~msk|msk+1;

+      }

+      while(end-_this->low>=_this->rng);

+    }

+    /*The remaining output is the next free end.*/

+    while(end){

+      ec_enc_carry_out(_this,end>>EC_CODE_SHIFT);

+      end=end<<EC_SYM_BITS&EC_CODE_TOP-1;

+    }

+  }

+  /*If we have a buffered byte...*/

+  if(_this->rem>=0){

+    unsigned char *buf;

+    long           len;

+    /*Flush it into the output buffer.*/

+    ec_enc_carry_out(_this,0);

+    /*We may be able to drop some redundant bytes from the end.*/

+    buf=ec_byte_get_buffer(_this->buf);

+    len=ec_byte_bytes(_this->buf);

+    /*Strip trailing zeros.

+      len stops at zero when every byte is zero.*/

+    while(len>0&&!buf[len-1])len--;

+    /*Strip one trailing EC_FOF_RSV1 byte if the buffer ends in a string of

+       consecutive EC_FOF_RSV1 bytes preceded by one (or more) zeros.*/

+    if(len>1&&buf[len-1]==EC_FOF_RSV1){

+      long q;

+      q=len-1;

+      do q--;

+      while(q>0&&buf[q]==EC_FOF_RSV1);

+      if(!buf[q])len--;

+    }

+    ec_byte_writetrunc(_this->buf,len);

+  }

+}