First version of a support utility to provide generalized compression in

author Reid Spencer <rspencer@reidspencer.com>

Mon, 4 Oct 2004 10:49:41 +0000 (10:49 +0000)

committer Reid Spencer <rspencer@reidspencer.com>

Mon, 4 Oct 2004 10:49:41 +0000 (10:49 +0000)
author Reid Spencer <rspencer@reidspencer.com>
Mon, 4 Oct 2004 10:49:41 +0000 (10:49 +0000)
committer Reid Spencer <rspencer@reidspencer.com>
Mon, 4 Oct 2004 10:49:41 +0000 (10:49 +0000)
diff --git a/include/llvm/Support/Compressor.h b/include/llvm/Support/Compressor.h

new file mode 100644 (file)

index 0000000..eea98eb
--- /dev/null
+++ b/include/llvm/Support/Compressor.h
@@ -0,0 +1,99 @@
+//===- llvm/Support/Compressor.h --------------------------------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Reid Spencer and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file declares the llvm::Compressor class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_COMPRESSOR_H
+#define LLVM_SUPPORT_COMPRESSOR_H
+
+#include <llvm/Support/DataTypes.h>
+
+namespace llvm {
+
+  /// This class provides an abstraction for compressing a block of memory using
+  /// a standard compression utility such as bzip2 or libz. This interface
+  /// allows us to abstract the notion of compression and deal with alternate
+  /// compression scheme availability depending on the configured platform. This
+  /// facility will always favor a bzip2 implementation if it is available.
+  /// Otherwise, libz will be used if it is available. If neither zlib nor bzip2
+  /// is available, a very simple algorithm provided by the Compressor class
+  /// will be used. The type of compression used can be determined by inspecting
+  /// the first byte of the compressed output. ASCII values '0', '1', and '2'
+  /// denote the compression type as given in the Algorithm enumeration below.
+  /// The Compressor is intended for use with memory mapped files where the
+  /// entire data block to be compressed or decompressed is available in
+  /// memory. Output, however, can be gathered in repeated calls to a callback.
+  /// @since 1.4
+  /// @brief An abstraction for memory to memory data compression
+  class Compressor {
+    /// @name Types
+    /// @{
+    public:
+      enum Algorithm {
+        COMP_TYPE_SIMPLE = '0',  ///< Use simple but ubiquitous algorithm
+        COMP_TYPE_ZLIB = '1',    ///< Use zlib algorithm, if available
+        COMP_TYPE_BZIP2 = '2',   ///< Use bzip2 algorithm (preferred)
+      };
+
+      /// A callback function type used by the Compressor to get the next chunk
+      /// of memory to which (de)compressed output will be written. This
+      /// function must be written by the caller to provide the buffering of
+      /// the output data. On success it must set \p buffer to the start of the
+      /// chunk (non-null) and \p size to the chunk's length (non-zero).
+      /// @returns 0 for success, non-zero (conventionally 1) for failure
+      /// @throws nothing
+      /// @brief Output callback function type
+      typedef unsigned (OutputDataCallback)(char*& buffer, unsigned& size);
+
+    /// @}
+    /// @name Methods
+    /// @{
+    public:
+      /// This function does the compression work. The block of memory starting
+      /// at \p in and extending for \p size bytes is compressed. The compressed
+      /// output is written to memory blocks returned by the \p cb callback. The
+      /// caller must provide an implementation of the OutputDataCallback
+      /// function type and provide its address as \p cb. Note that the callback
+      /// function will be called as many times as necessary to complete the
+      /// compression of the \p in block but that the total size will generally
+      /// be less than \p size. It is a good idea to provide as large a value to
+      /// the callback's \p size parameter as possible so that fewer calls to
+      /// the callback are made. The \p hint parameter tells the function which
+      /// kind of compression to start with. However, if it is not available on
+      /// the platform, the algorithm "falls back" from bzip2 -> zlib -> simple.
+      /// @throws std::string if an error occurs
+      /// @returns the total size of the compressed data, including the leading
+      /// byte that identifies the algorithm
+      /// @brief Compress a block of memory.
+      static uint64_t compress(char* in, unsigned size, OutputDataCallback* cb,
+                               Algorithm hint = COMP_TYPE_BZIP2);
+
+      /// This function does the decompression work. The block of memory
+      /// starting at \p in and extending for \p size bytes is decompressed. The
+      /// first byte of the block selects the algorithm (see Algorithm). The
+      /// decompressed output is written to memory blocks returned by the \p cb
+      /// callback. The caller must provide an implementation of the
+      /// OutputDataCallback function type and provide its address as \p cb.
+      /// Note that the callback function will be called as many times as
+      /// necessary to complete the decompression of the \p in block but that
+      /// the total size will generally be greater than \p size. It is a good
+      /// idea to provide as large a value to the callback's \p size parameter
+      /// as possible so that fewer calls to the callback are made.
+      /// @throws std::string if an error occurs
+      /// @returns the total size of the decompressed data
+      /// @brief Decompress a block of memory.
+      static uint64_t decompress(char *in, unsigned size, 
+                                 OutputDataCallback* cb);
+
+    /// @}
+  };
+}
+
+// vim: sw=2 ai
+
+#endif
diff --git a/lib/Support/Compressor.cpp b/lib/Support/Compressor.cpp

new file mode 100644 (file)

index 0000000..972a85d
--- /dev/null
+++ b/lib/Support/Compressor.cpp
@@ -0,0 +1,526 @@
+//===- lib/Support/Compressor.cpp -------------------------------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Reid Spencer and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file implements the llvm::Compressor class, an abstraction for memory
+// block compression.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/Support/Compressor.h"
+#include "llvm/ADT/StringExtras.h"
+#include <cassert>
+#include <cstring>
+#include <string>
+
+#ifdef HAVE_BZIP2
+#include <bzlib.h>
+#endif
+
+#ifdef HAVE_ZLIB
+#include <zlib.h>
+#endif
+
+#ifndef HAVE_BZIP2
+# ifndef HAVE_ZLIB
+#  warning No compression library is available!!
+# endif
+#endif
+
+namespace {
+
+/// Ask the caller-supplied callback \p cb for the next chunk of output space.
+/// \p buffer and \p size are cleared first; on success the callback must have
+/// set them to the start and length of the chunk.
+/// @returns the callback's result: 0 on success, non-zero on failure
+inline int getdata(char*& buffer, unsigned& size, 
+                   llvm::Compressor::OutputDataCallback* cb) {
+  buffer = 0;
+  size = 0;
+  int result = (*cb)(buffer, size);
+  // Only a successful callback is obliged to hand back usable space. A failed
+  // one must reach the caller's error handling (which throws) instead of
+  // tripping an assertion here.
+  assert((result != 0 || buffer != 0) &&
+         "Invalid result from Compressor callback");
+  assert((result != 0 || size != 0) &&
+         "Invalid result from Compressor callback");
+  return result;
+}
+
+//===----------------------------------------------------------------------===//
+//=== RLCOMP - very simple run length compression scheme
+//===   The code below transforms the input into blocks that are either 
+//===   compressed or not. Each block starts with a header byte that provides
+//===   the length of the block. Values < 128 are uncompressed, values >128
+//===   are compressed. The value 128 is illegal. Currently, the algorithm is
+//===   not completed and is #if'd out.
+//===----------------------------------------------------------------------===//
+
+// NOTE(review): presumably an output-size limit intended for the unfinished
+// RLCOMP coder; MAX_RLCOMP_OUT is not referenced anywhere else in this file --
+// confirm the intent or remove it.
+enum {
+  MAX_RLCOMP_OUT = 32768
+};
+
+// State for the RLCOMP_* routines. It uses the same next_in/avail_in/
+// next_out/avail_out field names as the bz_stream and z_stream objects used
+// later in this file. RLCOMP_init() resets only output_count; the callers in
+// this file fill in the four "user provided" fields themselves.
+struct RLCOMP_stream {
+  // User provided fields
+  char* next_in;         // Next input byte to consume
+  unsigned avail_in;     // Number of input bytes remaining at next_in
+  char* next_out;        // Where the next output byte is written
+  unsigned avail_out;    // Free space remaining at next_out
+
+  // Information fields
+  uint64_t output_count; // Total count of output bytes
+
+  // The fields below belong to the unfinished run-length coder and are
+  // compiled out together with the code that uses them.
+#if 0
+  // Algorithm fields
+  unsigned block_len;    // Length of current block
+  unsigned compression;  // State of compression 0=no, 1=yes, 2=indeterminate
+  char buffer[128];      // compression buffer (not used for decompress)
+  unsigned buflen;       // bytes in compression buffer
+  bool pending;          // is data pending to be written
+  char pending_data;     // the pending data
+  unsigned clen;         // length of the compressed block
+#endif
+};
+
+// Reset the bookkeeping of stream s before use. With the algorithm fields
+// compiled out this only zeroes output_count; the input/output pointers and
+// lengths are left untouched and must be set by the caller.
+void RLCOMP_init(RLCOMP_stream* s) {
+  s->output_count = 0;
+  // Initialisation for the disabled run-length coder state.
+#if 0
+  s->block_len = 0;
+  s->compression = 2;
+  s->buflen = 0;
+  s->pending = false;
+  s->pending_data = 0;
+  s->clen = 0;
+#endif
+}
+
+// Intended to read the next input byte of s into data. The implementation is
+// compiled out, so as built this always returns false and leaves data and the
+// stream untouched. Its only callers are inside other #if 0 regions.
+inline bool RLCOMP_getchar(RLCOMP_stream* s, unsigned& data) {
+#if 0
+  if (s->avail_in) {
+    data = *s->next_in++;
+    s->avail_in--;
+    return true;
+  }
+#endif
+  return false;
+}
+
+// Intended to append one byte to the output of s, parking it as "pending"
+// when the output chunk is full. The implementation is compiled out, so as
+// built this always returns false and writes nothing. Its only callers are
+// inside other #if 0 regions.
+inline bool RLCOMP_putchar(RLCOMP_stream* s, unsigned data) {
+#if 0
+  if (s->avail_out) {
+    *s->next_out++ = data;
+    s->avail_out--;
+    s->output_count++;
+    return true;
+  } else {
+    s->pending = true;
+    s->pending_data = data;
+    return false;
+  }
+#else
+  return false;
+#endif
+}
+
+// Run the "simple" compressor over stream s. The run-length coder is
+// unfinished and compiled out, so this currently copies input to output
+// unchanged, moving as many bytes as fit in the current output chunk.
+// Returns true once all input has been consumed, false when the output chunk
+// filled up first (the caller then supplies a new chunk and calls again).
+bool RLCOMP_compress(RLCOMP_stream* s) {
+  assert(s && "Invalid RLCOMP_stream");
+  assert(s->next_in != 0);
+  assert(s->next_out != 0);
+  assert(s->avail_in >= 1);
+  assert(s->avail_out >= 1);
+
+#if 0
+
+  // Handle pending data from the last time in
+  if (s->pending) {
+    RLCOMP_putchar(s,s->pending_data);
+    s->pending = false;
+  }
+
+  unsigned c = 0;
+  unsigned lastc = 0;
+  // Handle the degenerate len=1 case
+  if (!RLCOMP_getchar(s,lastc)) {
+    RLCOMP_putchar(s,1);
+    return RLCOMP_putchar(s,lastc);
+  }
+
+  while (RLCOMP_getchar(s,c)) {
+    switch(s->compression) {
+      case 0:
+        if (lastc == c) {
+          s->compression = 1;
+          s->clen = 2 ;
+        } else {
+          if (!RLCOMP_putchar(s, c))
+            return false;
+        }
+        break;
+
+      case 1:
+        if (lastc != c) {
+          s->compression = 2;
+          if (!RLCOMP_putchar(s, s->clen))
+            return false;
+        } else {
+          s->clen++;
+        }
+        break;
+
+      case 2:
+        break;
+    }
+    lastc = c;
+  }
+#endif
+  // Copy whichever is smaller: the remaining input or the remaining space.
+  const unsigned count =
+    s->avail_in <= s->avail_out ? s->avail_in : s->avail_out;
+  std::memcpy(s->next_out, s->next_in, count);
+  s->output_count += count;
+  s->next_in += count;
+  s->avail_in -= count;
+  // Advance the output cursor too, so next_out/avail_out always describe the
+  // unused tail of the chunk (previously next_out was left stale).
+  s->next_out += count;
+  s->avail_out -= count;
+  return s->avail_in == 0;
+}
+
+// Run the "simple" decompressor over stream s. The run-length decoder is
+// unfinished and compiled out, so this currently copies input to output
+// unchanged, moving as many bytes as fit in the current output chunk.
+// Returns true once all input has been consumed, false when the output chunk
+// filled up first (the caller then supplies a new chunk and calls again).
+bool RLCOMP_decompress(RLCOMP_stream* s) {
+  assert(s && "Invalid RLCOMP_stream");
+  assert(s->next_in != 0);
+  assert(s->next_out != 0);
+  assert(s->avail_in >= 1);
+  assert(s->avail_out >= 1);
+
+  // Disabled, unfinished decoder. NOTE(review): it tests s->repeat, a field
+  // RLCOMP_stream does not declare, so it cannot be enabled as written.
+#if 0
+  unsigned c = 0;
+  while (RLCOMP_getchar(s,c)) {
+    switch(s->compression) {
+      case 0: // This is not a compressed block
+        s->block_len--;
+        if (!RLCOMP_putchar(s,c))
+          return false;
+        break;
+
+      case 1: // This is a comperssed block
+        while (s->block_len-- > 0)
+          if (!RLCOMP_putchar(s,c))
+            return false;
+        break;
+
+      case 2: // This is the length field
+        if (c < 128) {
+          s->compression = 0;
+          s->block_len = c;
+        } else {
+          s->compression = 1;
+          s->block_len = c - 128;
+        }
+        continue;
+
+      default: // oops!
+        throw std::string("Invalid compression state");
+    }
+    if (s->block_len <= 0)
+      s->compression = 2;
+  }
+
+  if (s->repeat > 0)
+    throw std::string("Invalid compression state");
+#endif
+  // Copy whichever is smaller: the remaining input or the remaining space.
+  const unsigned count =
+    s->avail_in <= s->avail_out ? s->avail_in : s->avail_out;
+  std::memcpy(s->next_out, s->next_in, count);
+  s->output_count += count;
+  s->next_in += count;
+  s->avail_in -= count;
+  // Advance the output cursor too, so next_out/avail_out always describe the
+  // unused tail of the chunk (previously next_out was left stale).
+  s->next_out += count;
+  s->avail_out -= count;
+  return s->avail_in == 0;
+}
+
+// Counterpart of RLCOMP_init(), called by the Compressor once a stream is
+// finished. It is currently a no-op: the body is empty and strm is not used.
+void RLCOMP_end(RLCOMP_stream* strm) {
+}
+
+}
+
+namespace llvm {
+
+/// Compress \p size bytes starting at \p in, delivering the result through
+/// output chunks obtained from \p cb. The first output byte records the
+/// algorithm actually used ('0', '1' or '2'). \p hint selects the preferred
+/// algorithm; when that library was not configured, control falls through the
+/// switch in the order bzip2 -> zlib -> simple.
+/// @returns the number of bytes written, including the leading algorithm byte
+/// @throws std::string if the callback fails or the library reports an error
+uint64_t Compressor::compress(char* in, unsigned size, 
+                              OutputDataCallback* cb, Algorithm hint) {
+  assert(in && "Can't compress null buffer");
+  assert(size && "Can't compress empty buffer");
+  assert(cb && "Can't compress without a callback function");
+
+  uint64_t result = 0;
+
+  switch (hint) {
+    case COMP_TYPE_BZIP2: {
+#if defined(HAVE_BZIP2)
+      // Set up the bz_stream
+      bz_stream bzdata;
+      bzdata.bzalloc = 0;
+      bzdata.bzfree = 0;
+      bzdata.opaque = 0;
+      bzdata.next_in = in;
+      bzdata.avail_in = size;
+      bzdata.next_out = 0;
+      bzdata.avail_out = 0;
+      switch ( BZ2_bzCompressInit(&bzdata, 9, 0, 0) ) {
+        case BZ_CONFIG_ERROR: throw std::string("bzip2 library mis-compiled");
+        case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
+        case BZ_MEM_ERROR:    throw std::string("Out of memory");
+        case BZ_OK:
+        default:
+          break;
+      }
+
+      // Get a block of memory
+      if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb)) {
+        BZ2_bzCompressEnd(&bzdata);
+        throw std::string("Can't allocate output buffer");
+      }
+
+      // Put compression code in first byte
+      (*bzdata.next_out++) = COMP_TYPE_BZIP2;
+      bzdata.avail_out--;
+
+      // Compress it; BZ_FINISH_OK means more output space is needed
+      int bzerr = BZ_FINISH_OK;
+      while (BZ_FINISH_OK == (bzerr = BZ2_bzCompress(&bzdata, BZ_FINISH))) {
+        if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb)) {
+          BZ2_bzCompressEnd(&bzdata);
+          throw std::string("Can't allocate output buffer");
+        }
+      }
+
+      // Combine the two 32-bit halves first and only then add the type byte;
+      // adding 1 to the low half alone would lose the carry on wrap-around.
+      result = ((static_cast<uint64_t>(bzdata.total_out_hi32) << 32) |
+                bzdata.total_out_lo32) + 1;
+
+      // Release the stream before examining the status so that the error
+      // paths below cannot leak it.
+      BZ2_bzCompressEnd(&bzdata);
+
+      switch (bzerr) {
+        case BZ_SEQUENCE_ERROR:
+        case BZ_PARAM_ERROR: throw std::string("Param/Sequence error");
+        case BZ_FINISH_OK:
+        case BZ_STREAM_END: break;
+        default: throw std::string("Oops: ") + utostr(unsigned(bzerr));
+      }
+      break;
+#else
+      // FALL THROUGH
+#endif
+    }
+
+    case COMP_TYPE_ZLIB: {
+#if defined(HAVE_ZLIB)
+      z_stream zdata;
+      zdata.zalloc = Z_NULL;
+      zdata.zfree = Z_NULL;
+      zdata.opaque = Z_NULL;
+      // Make sure the error path below never reads an indeterminate pointer
+      zdata.msg = Z_NULL;
+      zdata.next_in = reinterpret_cast<Bytef*>(in);
+      zdata.avail_in = size;
+      if (Z_OK != deflateInit(&zdata,Z_BEST_COMPRESSION))
+        throw std::string(zdata.msg ? zdata.msg : "zlib error");
+
+      if (0 != getdata(reinterpret_cast<char*&>(zdata.next_out),
+                       zdata.avail_out,cb)) {
+        deflateEnd(&zdata);
+        throw std::string("Can't allocate output buffer");
+      }
+
+      // Put compression code in first byte
+      (*zdata.next_out++) = COMP_TYPE_ZLIB;
+      zdata.avail_out--;
+
+      // Feed the input through deflate, fetching a fresh output chunk each
+      // time the current one fills up.
+      while ( Z_OK == deflate(&zdata,Z_NO_FLUSH) && zdata.avail_out == 0) {
+        if (0 != getdata(reinterpret_cast<char*&>(zdata.next_out),
+                         zdata.avail_out, cb)) {
+          deflateEnd(&zdata);
+          throw std::string("Can't allocate output buffer");
+        }
+      }
+
+      // Flush the remainder. Z_OK and Z_BUF_ERROR both mean "more output
+      // space needed"; any other status is a real error and must end the
+      // loop instead of spinning forever.
+      int zerr = Z_OK;
+      while ( Z_STREAM_END != (zerr = deflate(&zdata, Z_FINISH))) {
+        if (zerr != Z_OK && zerr != Z_BUF_ERROR) {
+          std::string msg(zdata.msg ? zdata.msg : "zlib error");
+          deflateEnd(&zdata);
+          throw msg;
+        }
+        if (0 != getdata(reinterpret_cast<char*&>(zdata.next_out),
+                         zdata.avail_out, cb)) {
+          deflateEnd(&zdata);
+          throw std::string("Can't allocate output buffer");
+        }
+      }
+
+      result = static_cast<uint64_t>(zdata.total_out) + 1;
+      deflateEnd(&zdata);
+      break;
+
+#else
+    // FALL THROUGH
+#endif
+    }
+
+    case COMP_TYPE_SIMPLE: {
+      RLCOMP_stream sdata;
+      sdata.next_in = in;
+      sdata.avail_in = size;
+      RLCOMP_init(&sdata);
+
+      if (0 != getdata(sdata.next_out, sdata.avail_out,cb)) {
+        throw std::string("Can't allocate output buffer");
+      }
+
+      // Put compression code in first byte
+      *(sdata.next_out++) = COMP_TYPE_SIMPLE;
+      sdata.avail_out--;
+
+      // A one-byte chunk is used up by the type byte, and RLCOMP_compress
+      // requires at least one free output byte, so fetch another chunk first.
+      if (sdata.avail_out == 0 &&
+          0 != getdata(sdata.next_out, sdata.avail_out,cb)) {
+        throw std::string("Can't allocate output buffer");
+      }
+
+      while (!RLCOMP_compress(&sdata)) {
+        if (0 != getdata(sdata.next_out, sdata.avail_out,cb)) {
+          throw std::string("Can't allocate output buffer");
+        }
+      }
+
+      result = sdata.output_count + 1;
+      RLCOMP_end(&sdata);
+      break;
+    }
+    default:
+      throw std::string("Invalid compression type hint");
+  }
+  return result;
+}
+
+/// Decompress the \p size bytes starting at \p in, delivering the result
+/// through output chunks obtained from \p cb. The first input byte names the
+/// algorithm that produced the data ('0', '1' or '2'); the remaining
+/// \p size - 1 bytes are the payload handed to that algorithm.
+/// @returns the number of decompressed bytes written
+/// @throws std::string if the algorithm is unknown or not configured, the
+/// callback fails, or the data is corrupt or truncated
+uint64_t Compressor::decompress(char *in, unsigned size, 
+                                OutputDataCallback* cb) {
+  assert(in && "Can't decompress null buffer");
+  assert(size > 1 && "Can't decompress empty buffer");
+  assert(cb && "Can't decompress without a callback function");
+
+  uint64_t result = 0;
+
+  switch (*in++) {
+    case COMP_TYPE_BZIP2: {
+#if !defined(HAVE_BZIP2)
+      throw std::string("Can't decompress BZIP2 data");
+#else
+      // Set up the bz_stream
+      bz_stream bzdata;
+      bzdata.bzalloc = 0;
+      bzdata.bzfree = 0;
+      bzdata.opaque = 0;
+      bzdata.next_in = in;
+      bzdata.avail_in = size - 1;
+      bzdata.next_out = 0;
+      bzdata.avail_out = 0;
+      switch ( BZ2_bzDecompressInit(&bzdata, 0, 0) ) {
+        case BZ_CONFIG_ERROR: throw std::string("bzip2 library mis-compiled");
+        case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
+        case BZ_MEM_ERROR:    throw std::string("Out of memory");
+        case BZ_OK:
+        default:
+          break;
+      }
+
+      // Get a block of memory
+      if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb)) {
+        BZ2_bzDecompressEnd(&bzdata);
+        throw std::string("Can't allocate output buffer");
+      }
+
+      // Decompress it
+      int bzerr = BZ_OK;
+      while (BZ_OK == (bzerr = BZ2_bzDecompress(&bzdata))) {
+        if (bzdata.avail_out == 0) {
+          // The current chunk is full; ask for another one
+          if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb)) {
+            BZ2_bzDecompressEnd(&bzdata);
+            throw std::string("Can't allocate output buffer");
+          }
+        } else if (bzdata.avail_in == 0) {
+          // Output space is left but the input ran out before the end of
+          // the stream: the data is truncated. Without this check the loop
+          // would never terminate.
+          BZ2_bzDecompressEnd(&bzdata);
+          throw std::string("Unexpected end of BZIP2 data");
+        }
+      }
+
+      // Capture the size, then release the stream before examining the
+      // status so that the error paths below cannot leak it.
+      result = (static_cast<uint64_t>(bzdata.total_out_hi32) << 32) |
+        bzdata.total_out_lo32;
+      BZ2_bzDecompressEnd(&bzdata);
+
+      switch (bzerr) {
+        case BZ_STREAM_END:
+          break;
+        case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
+        case BZ_MEM_ERROR:    throw std::string("Out of memory");
+        case BZ_DATA_ERROR:   throw std::string("Data integrity error");
+        case BZ_DATA_ERROR_MAGIC:throw std::string("Data is not BZIP2");
+        // Throw std::string like every other error so callers catching the
+        // documented exception type see this one too.
+        default: throw std::string("Ooops");
+      }
+      break;
+#endif
+    }
+
+    case COMP_TYPE_ZLIB: {
+#if !defined(HAVE_ZLIB)
+      throw std::string("Can't decompress ZLIB data");
+#else
+      z_stream zdata;
+      zdata.zalloc = Z_NULL;
+      zdata.zfree = Z_NULL;
+      zdata.opaque = Z_NULL;
+      // Make sure the error path below never reads an indeterminate pointer
+      zdata.msg = Z_NULL;
+      zdata.next_in = reinterpret_cast<Bytef*>(in);
+      zdata.avail_in = size - 1;
+      if ( Z_OK != inflateInit(&zdata))
+        throw std::string(zdata.msg ? zdata.msg : "zlib error");
+
+      if (0 != getdata(reinterpret_cast<char*&>(zdata.next_out),
+                       zdata.avail_out,cb)) {
+        inflateEnd(&zdata);
+        throw std::string("Can't allocate output buffer");
+      }
+
+      // Only replace the output chunk once it is actually full. If inflate
+      // stalls for lack of input, the next call makes no progress, returns
+      // an error status and ends the loop.
+      int zerr = Z_OK;
+      while (Z_OK == (zerr = inflate(&zdata,Z_NO_FLUSH))) {
+        if (zdata.avail_out == 0 &&
+            0 != getdata(reinterpret_cast<char*&>(zdata.next_out),
+                         zdata.avail_out,cb)) {
+          inflateEnd(&zdata);
+          throw std::string("Can't allocate output buffer");
+        }
+      }
+
+      if (zerr != Z_STREAM_END) {
+        // Copy the message before releasing the stream, then throw
+        std::string msg(zdata.msg?zdata.msg:"zlib error");
+        inflateEnd(&zdata);
+        throw msg;
+      }
+
+      result = static_cast<uint64_t>(zdata.total_out);
+      inflateEnd(&zdata);
+      break;
+#endif
+    }
+
+    case COMP_TYPE_SIMPLE: {
+      RLCOMP_stream sdata;
+      sdata.next_in = in;
+      sdata.avail_in = size - 1;
+      RLCOMP_init(&sdata);
+
+      if (0 != getdata(sdata.next_out, sdata.avail_out,cb)) {
+        throw std::string("Can't allocate output buffer");
+      }
+
+      while (!RLCOMP_decompress(&sdata)) {
+        if (0 != getdata(sdata.next_out, sdata.avail_out,cb)) {
+          throw std::string("Can't allocate output buffer");
+        }
+      }
+
+      result = sdata.output_count;
+      RLCOMP_end(&sdata);
+      break;
+    }
+
+    default:
+      throw std::string("Unknown type of compressed data");
+  }
+
+  return result;
+}
+
+}
+
+// vim: sw=2 ai
author	Reid Spencer <rspencer@reidspencer.com>
	Mon, 4 Oct 2004 10:49:41 +0000 (10:49 +0000)
committer	Reid Spencer <rspencer@reidspencer.com>
	Mon, 4 Oct 2004 10:49:41 +0000 (10:49 +0000)
include/llvm/Support/Compressor.h	[new file with mode: 0644]	patch \| blob
lib/Support/Compressor.cpp	[new file with mode: 0644]	patch \| blob