diff options
Diffstat (limited to 'libbutl/lz4.cxx')
-rw-r--r-- | libbutl/lz4.cxx | 572 |
1 files changed, 356 insertions, 216 deletions
diff --git a/libbutl/lz4.cxx b/libbutl/lz4.cxx index 6a91a12..2db7af2 100644 --- a/libbutl/lz4.cxx +++ b/libbutl/lz4.cxx @@ -26,7 +26,11 @@ #include <cassert> #include <stdexcept> // invalid_argument, logic_error -#include <libbutl/utility.mxx> // eos() +#include <libbutl/utility.hxx> // eos() + +#if 0 +#include <libbutl/lz4-stream.hxx> +#endif using namespace std; @@ -34,44 +38,6 @@ namespace butl { namespace lz4 { - struct cctx - { - LZ4F_cctx* ctx; - - operator LZ4F_cctx* () const {return ctx;}; - - cctx () - { - if (LZ4F_isError (LZ4F_createCompressionContext (&ctx, LZ4F_VERSION))) - throw bad_alloc (); - } - - ~cctx () - { - LZ4F_errorCode_t e (LZ4F_freeCompressionContext (ctx)); - assert (!LZ4F_isError (e)); - } - }; - - struct dctx - { - LZ4F_dctx* ctx; - - operator LZ4F_dctx* () const {return ctx;}; - - dctx () - { - if (LZ4F_isError (LZ4F_createDecompressionContext (&ctx, LZ4F_VERSION))) - throw bad_alloc (); - } - - ~dctx () - { - LZ4F_errorCode_t e (LZ4F_freeDecompressionContext (ctx)); - assert (!LZ4F_isError (e)); - } - }; - static inline size_t block_size (LZ4F_blockSizeID_t id) { @@ -88,22 +54,22 @@ namespace butl switch (c) { - case LZ4F_ERROR_GENERIC: throw i ("generic error"); - case LZ4F_ERROR_maxBlockSize_invalid: throw i ("invalid block size"); - case LZ4F_ERROR_blockMode_invalid: throw i ("invalid block mode"); - case LZ4F_ERROR_contentChecksumFlag_invalid: throw i ("invalid content checksum flag"); - case LZ4F_ERROR_compressionLevel_invalid: throw i ("invalid compression level"); - case LZ4F_ERROR_headerVersion_wrong: throw i ("wrong header version"); - case LZ4F_ERROR_blockChecksum_invalid: throw i ("invalid block checksum"); - case LZ4F_ERROR_reservedFlag_set: throw i ("reserved flag set"); - case LZ4F_ERROR_srcSize_tooLarge: throw i ("input too large"); - case LZ4F_ERROR_dstMaxSize_tooSmall: throw i ("output too small"); - case LZ4F_ERROR_frameHeader_incomplete: throw i ("incomplete frame header"); - case LZ4F_ERROR_frameType_unknown: 
throw i ("unknown frame type"); - case LZ4F_ERROR_frameSize_wrong: throw i ("wrong frame size"); - case LZ4F_ERROR_decompressionFailed: throw i ("invalid compressed content"); - case LZ4F_ERROR_headerChecksum_invalid: throw i ("invalid header checksum"); - case LZ4F_ERROR_contentChecksum_invalid: throw i ("invalid content checksum"); + case LZ4F_ERROR_GENERIC: throw i ("generic LZ4 error"); + case LZ4F_ERROR_maxBlockSize_invalid: throw i ("invalid LZ4 block size"); + case LZ4F_ERROR_blockMode_invalid: throw i ("invalid LZ4 block mode"); + case LZ4F_ERROR_contentChecksumFlag_invalid: throw i ("invalid LZ4 content checksum flag"); + case LZ4F_ERROR_compressionLevel_invalid: throw i ("invalid LZ4 compression level"); + case LZ4F_ERROR_headerVersion_wrong: throw i ("wrong LZ4 header version"); + case LZ4F_ERROR_blockChecksum_invalid: throw i ("invalid LZ4 block checksum"); + case LZ4F_ERROR_reservedFlag_set: throw i ("reserved LZ4 flag set"); + case LZ4F_ERROR_srcSize_tooLarge: throw i ("LZ4 input too large"); + case LZ4F_ERROR_dstMaxSize_tooSmall: throw i ("LZ4 output too small"); + case LZ4F_ERROR_frameHeader_incomplete: throw i ("incomplete LZ4 frame header"); + case LZ4F_ERROR_frameType_unknown: throw i ("unknown LZ4 frame type"); + case LZ4F_ERROR_frameSize_wrong: throw i ("wrong LZ4 frame size"); + case LZ4F_ERROR_decompressionFailed: throw i ("invalid LZ4 compressed content"); + case LZ4F_ERROR_headerChecksum_invalid: throw i ("invalid LZ4 header checksum"); + case LZ4F_ERROR_contentChecksum_invalid: throw i ("invalid LZ4 content checksum"); case LZ4F_ERROR_allocation_failed: throw bad_alloc (); @@ -132,162 +98,354 @@ namespace butl throw_exception (LZ4F_getErrorCode (r)); } - // Return the compressed size. 
+ // compression // + + compressor:: + ~compressor () + { + if (LZ4F_cctx* ctx = static_cast<LZ4F_cctx*> (ctx_)) + { + LZ4F_errorCode_t e (LZ4F_freeCompressionContext (ctx)); + assert (!LZ4F_isError (e)); + } + } + + inline void compressor:: + init_preferences (void* vp) const + { + LZ4F_preferences_t* p (static_cast<LZ4F_preferences_t*> (vp)); + + p->autoFlush = 1; + p->favorDecSpeed = 0; + p->compressionLevel = level_; + p->frameInfo.blockMode = LZ4F_blockLinked; + p->frameInfo.blockSizeID = static_cast<LZ4F_blockSizeID_t> (block_id_); + p->frameInfo.blockChecksumFlag = LZ4F_noBlockChecksum; + p->frameInfo.contentChecksumFlag = LZ4F_contentChecksumEnabled; + p->frameInfo.contentSize = content_size_ + ? static_cast<unsigned long long> (*content_size_) + : 0; + } + + void compressor:: + begin (int level, + int block_id, + optional<uint64_t> content_size) + { + assert (block_id >= 4 && block_id <= 7); + + level_ = level; + block_id_ = block_id; + content_size_ = content_size; + + LZ4F_preferences_t prefs = LZ4F_INIT_PREFERENCES; + init_preferences (&prefs); + + // Input/output buffer capacities. + // + // To be binary compatible with the lz4 utility we have to compress + // files that fit into the block with a single *_compressFrame() call + // instead of *_compressBegin()/*_compressUpdate(). And to determine the + // output buffer capacity we must use *_compressFrameBound() instead of + // *_compressBound(). The problem is, at this stage (before filling the + // input buffer), we don't know which case it will be. + // + // However, in our case (autoFlush=1), *Bound() < *FrameBound() and so + // we can always use the latter at the cost of slight overhead. Also, + // using *FrameBound() allows us to call *Begin() and *Update() without + // flushing the buffer in between (this insight is based on studying the + // implementation of the *Bound() functions). + // + // Actually, we can use content_size (we can get away with much smaller + // buffers for small inputs). 
We just need to verify the caller is not + // lying to us (failing that, we may end up with a strange error like + // insufficient output buffer space). + // + ic = block_size (prefs.frameInfo.blockSizeID); + + if (content_size_ && *content_size_ < ic) + { + // This is nuanced: we need to add an extra byte in order to detect + // EOF. + // + ic = static_cast<size_t> (*content_size_) + 1; + } + + oc = LZ4F_compressFrameBound (ic, &prefs); + + begin_ = true; + } + + void compressor:: + next (bool end) + { + LZ4F_cctx* ctx; + + // Unlike the decompression case below, compression cannot fail due to + // invalid content. So any LZ4F_*() function failure is either due to a + // programming bug or argument inconsistencies (e.g., content size does + // not match actual). + + if (begin_) + { + begin_ = false; + + LZ4F_preferences_t prefs = LZ4F_INIT_PREFERENCES; + init_preferences (&prefs); + + // If we've allocated smaller buffers based on content_size_, then + // verify the input size matches what's promised. + // + // Note also that LZ4F_compressFrame() does not fail if it doesn't + // match instead replacing it with the actual value. + // + size_t bs (block_size (prefs.frameInfo.blockSizeID)); + if (content_size_ && *content_size_ < bs) + { + if (!end || in != *content_size_) + throw_exception (LZ4F_ERROR_frameSize_wrong); + } + + // Must be < for lz4 compatibility (see EOF nuance above for the + // likely reason). + // + if (end && in < bs) + { + on = LZ4F_compressFrame (ob, oc, ib, in, &prefs); + if (LZ4F_isError (on)) + throw_exception (on); + + in = 0; // All consumed. + return; + } + else + { + if (LZ4F_isError (LZ4F_createCompressionContext (&ctx, LZ4F_VERSION))) + throw bad_alloc (); + + ctx_ = ctx; + + // Write the header. + // + on = LZ4F_compressBegin (ctx, ob, oc, &prefs); + if (LZ4F_isError (on)) + throw_exception (on); + + // Fall through.
+ } + } + else + { + ctx = static_cast<LZ4F_cctx*> (ctx_); + on = 0; + } + + size_t n; + + if (in != 0) + { + n = LZ4F_compressUpdate (ctx, ob + on, oc - on, ib, in, nullptr); + if (LZ4F_isError (n)) + throw_exception (n); + + in = 0; // All consumed. + on += n; + } + + // Write the end marker. + // + if (end) + { + // Note that this call also verifies specified and actual content + // sizes match. + // + n = LZ4F_compressEnd (ctx, ob + on, oc - on, nullptr); + if (LZ4F_isError (n)) + throw_exception (n); + + on += n; + } + } + uint64_t compress (ofdstream& os, ifdstream& is, int level, int block_id, optional<uint64_t> content_size) { - assert (block_id >= 4 && block_id <= 7); +#if 0 + char buf[1024 * 3 + 7]; + ostream cos (os, level, block_id, content_size); - LZ4F_preferences_t prefs = LZ4F_INIT_PREFERENCES; - prefs.autoFlush = 1; - prefs.favorDecSpeed = 0; - prefs.compressionLevel = level; - prefs.frameInfo.blockMode = LZ4F_blockLinked; - prefs.frameInfo.blockSizeID = static_cast<LZ4F_blockSizeID_t> (block_id); - prefs.frameInfo.blockChecksumFlag = LZ4F_noBlockChecksum; - prefs.frameInfo.contentChecksumFlag = LZ4F_contentChecksumEnabled; - prefs.frameInfo.contentSize = - content_size ? static_cast<unsigned long long> (*content_size) : 0; + for (bool e (false); !e; ) + { + e = eof (is.read (buf, sizeof (buf))); + cos.write (buf, is.gcount ()); + //for (streamsize i (0), n (is.gcount ()); i != n; ++i) + // cos.put (buf[i]); + } - // Input/output buffer capacities. + cos.close (); + return content_size ? *content_size : 0; +#else + compressor c; + + // Input/output buffer guards. // - size_t ic (block_size (prefs.frameInfo.blockSizeID)); - size_t oc; + unique_ptr<char[]> ibg; + unique_ptr<char[]> obg; - // Input/output buffers. + // First determine required buffer capacities. 
// - unique_ptr<char[]> ibg (new char[ic]); char* ib (ibg.get ()); - unique_ptr<char[]> obg; char* ob; + c.begin (level, block_id, content_size); + + ibg.reset ((c.ib = new char[c.ic])); + obg.reset ((c.ob = new char[c.oc])); - // Read into the input buffer returning the number of bytes read and - // updating the total read and the eof flag. + // Read into the input buffer updating the eof flag. // // Note that we could try to do direct fd read/write but that would // complicate things quite a bit (error handling, stream state, etc). // - uint64_t it (0); bool eof (false); - auto read = [&is, ib, ic, &it, &eof] () -> size_t + auto read = [&is, &c, &eof] () { - eof = butl::eof (is.read (ib, ic)); - size_t n (static_cast<size_t> (is.gcount ())); - it += n; - return n; + eof = butl::eof (is.read (c.ib, c.ic)); + c.in = static_cast<size_t> (is.gcount ()); }; - // Write the specified number of bytes from the output buffer updating - // the total written. + // Write from the output buffer updating the total written. // uint64_t ot (0); - auto write = [&os, &ob, &ot] (size_t n) + auto write = [&os, &c, &ot] () { - os.write (ob, static_cast<streamsize> (n)); - ot += n; + os.write (c.ob, static_cast<streamsize> (c.on)); + ot += c.on; }; - // Unlike the decompression case below, compression cannot fail due to - // invalid content. So any LZ4F_*() function failure is either due to a - // programming bug or argument inconsistencies (e.g., content size does - // not match actual). - - // To be binary compatible with the lz4 utility we have to compress - // files that fit into the block with a single LZ4F_compressFrame() - // call. + // Keep reading, compressing, and writing chunks of content. // - size_t in (read ()); - size_t on; - - if (eof && in < ic) // Should be really <= but that's not lz4-compatible. 
+ while (!eof) { - oc = LZ4F_compressFrameBound (in, &prefs); - obg.reset ((ob = new char[oc])); - - on = LZ4F_compressFrame (ob, oc, ib, in, &prefs); - if (LZ4F_isError (on)) - throw_exception (on); + read (); - write (on); + c.next (eof); - // Verify specified and actual content sizes match. - // - // LZ4F_compressFrame() does not fail if it doesn't match instead - // replacing it with the actual value. - // - if (content_size && *content_size != it) - throw_exception (LZ4F_ERROR_frameSize_wrong); + if (c.on != 0) // next() may just buffer the data. + write (); } - else - { - cctx ctx; - oc = LZ4F_compressBound (ic, &prefs); - obg.reset ((ob = new char[oc])); + return ot; +#endif + } - // Write the header. - // - on = LZ4F_compressBegin (ctx, ob, oc, &prefs); - if (LZ4F_isError (on)) - throw_exception (on); + // decompression + // - write (on); + static_assert (sizeof (decompressor::hb) == LZ4F_HEADER_SIZE_MAX, + "LZ4 header size mismatch"); - // Keep compressing, writing, and reading chunks of content. - // - for (;;) - { - on = LZ4F_compressUpdate (ctx, ob, oc, ib, in, nullptr); - if (LZ4F_isError (on)) - throw_exception (on); + decompressor:: + ~decompressor () + { + if (LZ4F_dctx* ctx = static_cast<LZ4F_dctx*> (ctx_)) + { + LZ4F_errorCode_t e (LZ4F_freeDecompressionContext (ctx)); + assert (!LZ4F_isError (e)); + } + } - if (on != 0) // LZ4F_compressUpdate() may just buffer the data. - write (on); + size_t decompressor:: + begin (optional<uint64_t>* content_size) + { + LZ4F_dctx* ctx; - if (eof) - break; + if (LZ4F_isError (LZ4F_createDecompressionContext (&ctx, LZ4F_VERSION))) + throw bad_alloc (); - in = read (); - } + ctx_ = ctx; - // Write the end marker. - // - // Note that this call also verifies specified and actual content - // sizes match. 
- // - on = LZ4F_compressEnd (ctx, ob, oc, nullptr); - if (LZ4F_isError (on)) - throw_exception (on); + LZ4F_frameInfo_t info = LZ4F_INIT_FRAMEINFO; - write (on); + // Input hint and end as signalled by the LZ4F_*() functions. + // + size_t h, e; + + h = LZ4F_getFrameInfo (ctx, &info, hb, &(e = hn)); + if (LZ4F_isError (h)) + throw_exception (h); + + if (content_size != nullptr) + { + if (info.contentSize != 0) + *content_size = static_cast<uint64_t> (info.contentSize); + else + *content_size = nullopt; } - return ot; + // Use the block size for the output buffer capacity and compressed + // bound plus the header size for the input. The expectation is that + // LZ4F_decompress() should never hint for more than that. + // + oc = block_size (info.blockSizeID); + ic = LZ4F_compressBound (oc, nullptr) + LZ4F_BLOCK_HEADER_SIZE; + + assert (h <= ic); + + // Move over whatever is left in the header buffer to the beginning. + // + hn -= e; + memmove (hb, hb + e, hn); + + return h; } - uint64_t - decompress (ofdstream& os, ifdstream& is) + size_t decompressor:: + next () { - // The LZ4F_*() decompression functions return a hint of how much data - // they want on the next call. So the plan is to allocate the input - // buffer large enough to hold anything that can be asked for and then - // fill it in in the asked chunks. This way we avoid having to shift the - // unread data, etc. + LZ4F_dctx* ctx (static_cast<LZ4F_dctx*> (ctx_)); + + size_t h, e; + + // Note that LZ4F_decompress() verifies specified and actual content + // sizes match (similar to compression). // - dctx ctx; + h = LZ4F_decompress (ctx, ob, &(on = oc), ib, &(e = in), nullptr); + if (LZ4F_isError (h)) + throw_exception (h); - // Input/output buffer capacities and sizes. + // We expect LZ4F_decompress() to consume what it asked for. // - size_t ic, oc; - size_t in, on; + assert (e == in && h <= ic); + in = 0; // All consumed. - // Input/output buffers.
+ return h; + } + + uint64_t + decompress (ofdstream& os, ifdstream& is) + { + // Write the specified number of bytes from the output buffer updating + // the total written. // - unique_ptr<char[]> ibg; char* ib; - unique_ptr<char[]> obg; char* ob; + uint64_t ot (0); + auto write = [&os, &ot] (char* b, size_t n) + { + os.write (b, static_cast<streamsize> (n)); + ot += n; + }; +#if 0 + char buf[1024 * 3 + 7]; + istream dis (is, true, istream::badbit); + + for (bool e (false); !e; ) + { + e = eof (dis.read (buf, sizeof (buf))); + write (buf, static_cast<size_t> (dis.gcount ())); + } +#else // Read into the specified buffer returning the number of bytes read and // updating the eof flag. // @@ -305,78 +463,60 @@ namespace butl return n; }; - // Write the specified number of bytes from the output buffer updating - // the total written. - // - uint64_t ot (0); - auto write = [&os, &ob, &ot] (size_t n) - { - os.write (ob, static_cast<streamsize> (n)); - ot += n; - }; + decompressor d; - // Input hint and end as signalled by the LZ4F_*() functions. + // Input/output buffer guards. // - size_t ih, ie; + unique_ptr<char[]> ibg; + unique_ptr<char[]> obg; - // Read the header. - // - LZ4F_frameInfo_t info = LZ4F_INIT_FRAMEINFO; - { - char hb[LZ4F_HEADER_SIZE_MAX]; - in = read (hb, sizeof (hb)); - - ih = LZ4F_getFrameInfo (ctx, &info, hb, &(ie = in)); - if (LZ4F_isError (ih)) - throw_exception (ih); + size_t h; // Input hint. - // Use the block size for the output buffer capacity and compressed - // bound plus the header size for the input. The expectation is that - // LZ4F_decompress() should never hint for more than that. - // - oc = block_size (info.blockSizeID); - ic = LZ4F_compressBound (oc, nullptr) + LZ4F_BLOCK_HEADER_SIZE; + // First read in the header and allocate the buffers. + // + // What if we hit EOF here? And could begin() return 0? 
Turns out the + // answer to both questions is yes: 0-byte content compresses to 15 + // bytes (with or without content size; 1-byte -- to 20/28 bytes). We + // can ignore EOF here since an attempt to read more will result in + // another EOF. And code below is prepared to handle 0 initial hint. + // + // @@ We could end up leaving some of the input content from the + // header in the input buffer which the caller will have no way + // of using/detecting. + // + d.hn = read (d.hb, sizeof (d.hb)); + h = d.begin (); - assert (ih <= ic); + ibg.reset ((d.ib = new char[d.ic])); + obg.reset ((d.ob = new char[d.oc])); - ibg.reset ((ib = new char[ic])); - obg.reset ((ob = new char[oc])); + // Copy over whatever is left in the header buffer and read up to + // the hinted size. + // + memcpy (d.ib, d.hb, (d.in = d.hn)); - // Copy over whatever is left in the header buffer and read up to - // the hinted size. - // - in -= ie; - memcpy (ib, hb + ie, in); - in += read (ib + in, ih - in); - } + if (h > d.in) + d.in += read (d.ib + d.in, h - d.in); // Keep decompressing, writing, and reading chunks of compressed // content. // - // Note that LZ4F_decompress() verifies specified and actual content - // sizes match (similar to compression). - // - for (;;) + while (h != 0) { - ih = LZ4F_decompress (ctx, ob, &(on = oc), ib, &(ie = in), nullptr); - if (LZ4F_isError (ih)) - throw_exception (ih); + h = d.next (); - // We expect LZ4F_decompress() to consume what it asked for. - // - assert (ie == in); + if (d.on != 0) // next() may just buffer the data. + write (d.ob, d.on); - write (on); - - if (ih == 0) - break; - - if (eof) - throw invalid_argument ("incomplete compressed content"); + if (h != 0) + { + if (eof) + throw invalid_argument ("incomplete LZ4 compressed content"); - assert (ih <= ic); - in = read (ib, ih); + d.in = read (d.ib, h); + } } +#endif return ot; } |