Explorations into data compression techniques.

Data compression is any method of finding representations for digital information that require fewer bits than the original representation. Compression algorithms typically combine techniques for minimizing redundancy with techniques for efficiently encoding data.

A simple interface for compression algorithms

/* Compress the given input writing the results to out */
uint64_t compress(uint8_t *in, uint64_t in_size, uint8_t *out, uint64_t max_out_size);

/* Decompress the given compress input writing the uncompressed output to out */
bool decompress(uint8_t *in, uint64_t in_size, uint8_t *out, uint64_t out_size);

/* Returns the maximum possible compressed size for the given uncompressed input */
uint64_t compress_get_max_output_size(uint8_t *in, uint64_t in_size);

/* Returns the size of the out given a compressed input */
uint64_t decompress_get_output_size(uint8_t *in, uint64_t in_size);

Run-length encoding (RLE) compression

RLE-based compression schemes iterate through input data looking for a repeated sequence of the same byte (called a run). Anything not part of a run is called a literal. Whenever a run is encountered, they output a sequence MLNR where M is the number of literals, L is a sequence of M literal bytes, N is the run length, and R is the byte value repeated. To make this more concrete, an RLE might compress "AAABBBBAACCCC" as "3A4B2A4C"

Decoding is simple and involves walking the compressed input of MLNR sequences and outputting the M literals in L followed by N repetitions of byte R.

In the example below the compressor works by scanning forward from the current input byte up to RLE_MAX_RUN_LENGTH bytes. If repetition of the current input byte is not found then the current input byte is buffered as a literal and it proceeds to the next input byte. However, if a repetition is found then any literals are first flushed to the output buffer followed by the run identified. Additionally, data is flushed if the end of the input is reached or the literal buffer is full.

#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define RLE_LITERAL_BUFFER_MAX_SIZE 255 /* Maximum size of a set of literals */
#define RLE_MAX_RUN_LENGTH 255 /* Maximum size of a repeated byte run */
/*
 * Minimum run length (don't encode a run if it is fewer than this many
 * bytes). This can improve compression by only encoding a run when its length
 * justifies the 4-byte cost of writing out the literals and the run.
 */
#define RLE_MIN_RUN_LENGTH 5

/*
 * Performs simple run-length encoding (RLE) compression from the given input
 * stream into the given output stream. Returns the size of the compressed
 * output or 0 if there was an error during compression.
 */
uint64_t compress(uint8_t *in, uint64_t in_size, uint8_t *out, uint64_t max_out_size)
{
  assert(in_size > 0);
  assert(max_out_size > 0);

  static uint8_t literals[RLE_LITERAL_BUFFER_MAX_SIZE];

  uint8_t *next_in_byte = in;
  // NOTE: The final compressed output size is written into the first 64-bits,
  // so skip past this and start writing the compressed output immediately
  // after it.
  uint8_t *next_out_byte = out + sizeof(uint64_t);
  uint8_t *in_end = in + in_size;
  uint8_t *out_end = out + max_out_size;
  uint8_t literal_count = 0;

  for (;;)
  {
    if (next_out_byte >= out_end)
    {
      fprintf(stderr, "error: end of output with input left (input at byte: %ld)\n",
	      (next_in_byte - in));
      return(0);
    }

    uint8_t run_value = (next_in_byte == in_end) ? 0 : next_in_byte[0];
    uint8_t run_length = 0;

    while (((next_in_byte + run_length) < in_end) &&
	   (run_length < RLE_MAX_RUN_LENGTH) &&
	   (next_in_byte[run_length] == run_value))
    { ++run_length; }

    // If we have a run, we've exhausted our literal buffer, or we're at the
    // end of the input then write the next RLE entry.
    if (run_length >= RLE_MIN_RUN_LENGTH ||
	literal_count == RLE_LITERAL_BUFFER_MAX_SIZE ||
	next_in_byte == in_end)
    {
      *next_out_byte++ = literal_count;
      for (uint32_t literal_index = 0; literal_index < literal_count; ++literal_index)
      {
	*next_out_byte++ = literals[literal_index];
      }
      literal_count = 0;

      *next_out_byte++ = run_length;
      *next_out_byte++ = run_value;

      if (next_in_byte == in_end)
	break;

      next_in_byte += run_length;
    }
    else
    {
      literals[literal_count++] = *next_in_byte++;
    }
  }

  assert(literal_count == 0);
  assert(next_in_byte == in_end);

  // Write the original input size into the output buffer
  *(uint64_t *)out = in_size;

  uint64_t out_size = next_out_byte - out;
  return(out_size);
}

/*
 * Performs simple run-length encoding (RLE) decompression from the given RLE
 * compressed input stream to the given output stream. Returns true if
 * decompression succeeded or false otherwise.
 */
bool decompress(uint8_t *in, uint64_t in_size, uint8_t *out, uint64_t out_size)
{
  bool result = false;

  in += sizeof(uint64_t);
  in_size -= sizeof(uint64_t);

  uint8_t *next_in_byte = in;
  uint8_t *next_out_byte = out;
  uint8_t *in_end = in + in_size;
  uint8_t *out_end = out + out_size;

  for (;;)
  {
    if (next_in_byte >= in_end)
    {
      result = true;
      break;
    }
    if (next_out_byte >= out_end)
    {
      fprintf(stderr, "error: end of output with input left (input at byte: %ld)\n",
	      (next_in_byte - in));
      break;
    }

    uint8_t literal_count = *next_in_byte++;
    while (literal_count--)
    {
      *next_out_byte++ = *next_in_byte++;
    }

    uint8_t run_length = *next_in_byte++;
    uint8_t run_value = *next_in_byte++;
    while (run_length--)
    {
      *next_out_byte++ = run_value;
    }
  }

  assert(next_in_byte == in_end);
  return(result);
}

/*
 * Returns the maximum possible output size when RLE compressing the given
 * input.
 */
uint64_t compress_get_max_output_size(uint8_t *in, uint64_t in_size)
{
  /*
   * The maximum size contains no runs and all literal sequences of the max
   * literal sequence length (255 bytes). So this means we have each byte of
   * the original input reproduced, plus 3 extra bytes per 255 literal bytes
   * for the literal count, run count, and run value. We add 255 to in_size to
   * ensure we round up in cases where the input size is less than 255 bytes or
   * not an even multiple of 255 bytes.
   */
  uint64_t result = in_size + 3*((in_size + 255) / 255) + sizeof(uint64_t);
  return(result);
}

/*
 * Returns the decompressed output size for the given RLE compressed input.
 */
uint64_t decompress_get_output_size(uint8_t *in, uint64_t in_size)
{
  assert(in_size > 8);
  uint64_t result = *(uint64_t *)in;
  return(result);
}

Sliding window compression (LZ77)

The next step up from simple RLE are the Lempel-Ziv dictionary-based compression schemes LZ77 and LZ78. LZ77-family sliding window compression algorithms iterate through input data within a fixed sized window (the implicit "dictionary") around the current input byte looking not only for repeats of the same byte but repeats of sequences of bytes within that window. Whenever a repeated sequence is encountered they output a pair CD where C is the number of bytes to copy and D is the distance back from the current position in the output stream to the first byte in the sequence to be copied. In our scheme a run of literals is signaled by a D value of zero.

Decompression works by iterating through the CD pairs in the input and either copying C literals that follow (a D value of zero signifies a set of literals) or copying a run of C bytes starting D bytes back in the output stream.

#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define LZ77_SLIDING_WINDOW_MAX 255 /* Maximum size of the sliding window */
#define LZ77_LITERAL_BUFFER_MAX_SIZE 255 /* Maximum size of the literal buffer */

uint64_t compress(uint8_t *in, uint64_t in_size, uint8_t *out, uint64_t max_out_size)
{
  assert(in_size > 0);
  assert(max_out_size > 0);

  static uint8_t literals[LZ77_LITERAL_BUFFER_MAX_SIZE];

  uint8_t *next_in_byte = in;
  // NOTE: The final compressed output size is written into the first 64-bits,
  // so skip past this and start writing the compressed output immediately
  // after it.
  uint8_t *next_out_byte = out + sizeof(uint64_t);
  uint8_t *in_end = in + in_size;
  uint8_t *out_end = out + max_out_size;
  uint8_t literal_count = 0;

  for (;;)
  {
    if (next_in_byte > in_end)
    {
      fprintf(stderr, "error: compression went passed the end of the input\n");
      break;
    }
    if (next_out_byte >= out_end)
    {
      fprintf(stderr, "error: end of output with input left (input at byte: %ld)\n",
	      (next_in_byte - in));
      return(0);
    }

    size_t max_lookback = next_in_byte - in;
    if (max_lookback > LZ77_SLIDING_WINDOW_MAX)
    {
      max_lookback = LZ77_SLIDING_WINDOW_MAX;
    }

    uint8_t best_run = 0;
    uint8_t best_distance = 0;
    for (uint8_t *window_start = next_in_byte - max_lookback;
	 window_start < next_in_byte;
	 ++window_start)
    {
      size_t window_size = in_end - window_start;
      if (window_size > LZ77_SLIDING_WINDOW_MAX)
      {
	window_size = LZ77_SLIDING_WINDOW_MAX;
      }

      uint8_t *window_end = window_start + window_size;
      uint8_t *test_in = next_in_byte;
      uint8_t *window_in = window_start;
      size_t test_run = 0;
      while ((window_in < window_end) &&
	     (test_in < in_end) &&
	     (*test_in++ == *window_in++) &&
	     (test_run < LZ77_SLIDING_WINDOW_MAX))
      {
	++test_run;
      }

      if (best_run < test_run)
      {
	best_run = test_run;
	best_distance = next_in_byte - window_start;
      }
    }

    bool output_run = (
      (literal_count && (best_run > 4)) || 
      (!literal_count && best_run > 2));

    // If we have a run, we've exhausted our literal buffer, or we're at the
    // end of the input then write the next entry.
    if (output_run ||
	literal_count == LZ77_LITERAL_BUFFER_MAX_SIZE ||
	next_in_byte == in_end)
    {
      if (literal_count > 0)
      {
	*next_out_byte++ = literal_count;
	*next_out_byte++ = 0;
	for (uint32_t literal_index = 0; literal_index < literal_count; ++literal_index)
	{
	  *next_out_byte++ = literals[literal_index];
	}
	literal_count = 0;
      }

      if (output_run)
      {
	*next_out_byte++ = best_run;
	*next_out_byte++ = best_distance;
	next_in_byte += best_run;
      }

      if (next_in_byte == in_end)
	break;
    }
    else
    {
      literals[literal_count++] = *next_in_byte++;
    }
  }

  assert(literal_count == 0);
  assert(next_in_byte == in_end);

  *(uint64_t *)out = in_size;

  uint64_t out_size = next_out_byte - out;
  return(out_size);
}

bool decompress(uint8_t *in, uint64_t in_size, uint8_t *out, uint64_t out_size)
{
  bool result = false;

  // skip uncompressed input size that starts input
  in += sizeof(uint64_t);
  in_size -= sizeof(uint64_t);

  uint8_t *next_in_byte = in;
  uint8_t *next_out_byte = out;
  uint8_t *in_end = in + in_size;
  uint8_t *out_end = out + out_size;

  for (;;)
  {
    if (next_in_byte >= in_end)
    {
      result = true;
      break;
    }
    if (next_out_byte >= out_end)
    {
      fprintf(stderr, "error: end of output with input left (input byte at: %ld)\n",
	      (next_in_byte - in));
      break;
    }

    uint8_t count = *next_in_byte++;
    uint8_t copy_offset = *next_in_byte++;
    uint8_t *source = next_out_byte - copy_offset;
    if (copy_offset == 0) // Zero offset means we are copying literals
    {
      source = next_in_byte;
      next_in_byte += count;
    }

    while (count--)
    {
      *next_out_byte++ = *source++;
    }
  }

  assert(next_in_byte == in_end);
  return(result);
}

uint64_t compress_get_max_output_size(uint8_t *in, uint64_t in_size)
{
  /*
   * In the worst case we reproduce every byte of the input stream in some
   * 255-byte literal run plus we add an extra 2 bytes every 255 input bytes to
   * signal the literal run and its length.
   */
  uint64_t result = (in_size + 2 * (in_size / 255)) + sizeof(uint64_t);
  return(result);
}

uint64_t decompress_get_output_size(uint8_t *in, uint64_t in_size)
{
  uint64_t result = *(uint64_t *)in;
  return(result);
}

Dictionary compression (LZ78)

Dictionary compression techniques add a new data-structure - a simple trie called the dictionary. For each item inserted into the dictionary a pair (previous dictionary index, value) is stored. This structure allows for the recording of long sequences of data as chains of these pairs. Whenever a previously recognized sequence is seen in the input stream we can output a code indicating its place in the dictionary instead. A very simple version of the dictionary data structure is as follows:

typedef struct LZ78_Dictionary {
  uint32_t next_available_entry;
  uint16_t previous_index[LZ78_DICTIONARY_SIZE];
  uint8_t value[LZ78_DICTIONARY_SIZE];
} LZ78_Dictionary;

Compression starts with a value last_matching_index = 0 and walks the input stream a byte at a time. For each byte it first attempts to find the pair (last_matching_index, byte) in the dictionary as match_index. If the pair is found then nothing is output and last_matching_index = match_index. If the pair is not found, then the pair (last_matching_index, byte) is added to the dictionary and two values are written to the output stream - firstly last_matching_index, then the byte value. After this is done last_matching_index = 0 and walking the input stream resumes.

Decoding reads the compressed input in pairs (last_matching_index, byte). If last_matching_index is zero, then byte is written to the output stream and the pair (0, byte) is added to the dictionary. If last_matching_index is not zero, then the chain of entries start at last_matching_index is read from the dictionary and written to the output stream in reverse followed by byte. Storing the dictionary is unnecessary as it is reconstructed during the decompression process.

#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define LZ78_DICTIONARY_SIZE 4096

typedef struct LZ78_Dictionary {
  uint32_t next_available_entry;
  uint16_t previous_index[LZ78_DICTIONARY_SIZE];
  uint8_t value[LZ78_DICTIONARY_SIZE];
} LZ78_Dictionary;

static void
dictionary_init(LZ78_Dictionary *dictionary)
{
  dictionary->next_available_entry = 0;
}

static bool
dictionary_is_full(LZ78_Dictionary *dictionary)
{
  return(dictionary->next_available_entry >= LZ78_DICTIONARY_SIZE);
}

static bool
dictionary_find_entry(LZ78_Dictionary *dictionary, uint16_t last_match_index,
                      uint8_t value, uint16_t *out_index)
{
  bool result = false;
  for (uint32_t i = 0; i < dictionary->next_available_entry; ++i)
  {
    if (dictionary->previous_index[i] == last_match_index &&
        dictionary->value[i] == value)
    {
      result = true;
      *out_index = i+1;
      break;
    }
  }
  return(result);
}

static uint16_t
dictionary_add_entry(LZ78_Dictionary *dictionary, uint16_t previous_index, uint8_t value)
{
  /* Throw away the entire dictionary if it's full */
  if(dictionary_is_full(dictionary))
  {
    dictionary_init(dictionary);
    previous_index = 0;
  }
  uint16_t entry_index = dictionary->next_available_entry;
  assert(entry_index+1 != previous_index);
  dictionary->previous_index[entry_index] = previous_index;
  dictionary->value[entry_index] = value;
  ++dictionary->next_available_entry;
  return(dictionary->next_available_entry);
}

static uint16_t
dictionary_read_entry(LZ78_Dictionary *dictionary, uint16_t entry_index,
                      uint8_t *out, uint8_t *out_end)
{
  assert(entry_index <= LZ78_DICTIONARY_SIZE);

  static uint8_t value_buffer[LZ78_DICTIONARY_SIZE];

  uint16_t result = 0;
  int32_t value_count = 0;
  uint16_t next_entry = entry_index - 1;
  for (;;)
  {
    value_buffer[value_count++] = dictionary->value[next_entry];
    next_entry = dictionary->previous_index[next_entry];
    if (next_entry == 0)
      break;
    next_entry -= 1;
  }

  assert(out < out_end);
  assert(value_count <= LZ78_DICTIONARY_SIZE);
  result = value_count;

  for (int32_t i = value_count-1; i >= 0; --i)
  {
    *out++ = value_buffer[i];
  }

  return(result);
}

uint64_t compress(uint8_t *in, uint64_t in_size, uint8_t *out, uint64_t max_out_size)
{
  max_out_size -= sizeof(uint64_t);

  uint8_t *next_in_byte = in;
  uint8_t *next_out_byte = out + sizeof(uint64_t);
  uint8_t *in_end = in + in_size;
  uint8_t *out_end = next_out_byte + max_out_size;

  uint16_t last_matching_index = 0;
  uint16_t match_length = 0;
  LZ78_Dictionary dictionary = { 0 };
  dictionary_init(&dictionary);

  for (;;)
  {
    if (next_in_byte >= in_end)
      break;
    if (next_out_byte >= out_end)
    {
      fprintf(stderr, "error: end of output with input remaining\n");
      break;
    }

    uint16_t out_index;
    uint8_t value = *next_in_byte++;

    if (dictionary_find_entry(&dictionary, last_matching_index, value, &out_index) &&
	next_in_byte < in_end)
    {
      ++match_length;
      last_matching_index = out_index;
    }
    else
    {
      dictionary_add_entry(&dictionary, last_matching_index, value);
      *((uint16_t*)next_out_byte) = last_matching_index;
      next_out_byte += 2;
      *next_out_byte++ = value;
      last_matching_index = 0;
      match_length = 0;
    }
  }

  assert(next_in_byte == in_end);

  *(uint64_t *)out = in_size;
  uint64_t out_size = (next_out_byte - out);
  return(out_size);
}

bool decompress(uint8_t *in, uint64_t in_size, uint8_t *out, uint64_t out_size)
{
  bool result = true;
  in += sizeof(uint64_t);
  in_size -= sizeof(uint64_t);

  assert(in_size >= 3);

  uint8_t *next_in_byte = in;
  uint8_t *next_out_byte = out;
  uint8_t *in_end = in + in_size;
  uint8_t *out_end = out + out_size;

  LZ78_Dictionary dictionary = { 0 };
  dictionary_init(&dictionary);

  for(;;)
  {
    if (next_in_byte >= in_end)
      break;
    if (next_out_byte >= out_end)
    {
      result = false;
      fprintf(stderr, "error: end of output with input left\n");
      break;
    }

    uint16_t code_word = *(uint16_t*)next_in_byte;
    next_in_byte += 2;
    uint8_t value = *next_in_byte++;

    if (code_word)
    {
      uint16_t bytes_read = dictionary_read_entry(
	  &dictionary, code_word, next_out_byte, out_end);
      next_out_byte += bytes_read;
      *next_out_byte++ = value;
      dictionary_add_entry(&dictionary, code_word, value);
    }
    else
    {
      *next_out_byte++ = value;
      dictionary_add_entry(&dictionary, 0, value);
    }
  }

  assert(next_in_byte == in_end);
  return(result);
}

uint64_t compress_get_max_output_size(uint8_t *in, uint64_t in_size)
{
  /*
   * In the worst case we reproduce every byte of the input stream plus 2 extra
   * bytes for the code word (which is in every case 0x0000).
   */
  uint64_t result = 3*in_size + sizeof(uint64_t);
  return(result);
}

uint64_t decompress_get_output_size(uint8_t *in, uint64_t in_size)
{
  uint64_t result = *(uint64_t *)in;
  return(result);
}

Last update on 7E5B17, edited 1 times. 6/6thh