/* inffast.c -- fast decoding
 * Copyright (C) 1995-2004 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Modified March 2005 by John Reiser for little-endian, ~0, tail merge,
 * out-of-line error cases, nuanced comments about copying from window.
 * Copyright 2005 BitWagon Software LLC.  Original was from zlib-1.2.2.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "inffast.h"

#ifndef ASMINF

/* Little-endian memory has a speed advantage of about 1%. */
static int const INFFAST_LITTLE = 1;  /* Rely on constant propagation. */
/* If your compiler doesn't propagate the "static int const",
 * then replace with "#define INFFAST_LITTLE 1".
 * Little-endian also may benefit from "#define POSTINC 1".
 */

/* Allow machine dependent optimization for post-increment or pre-increment.
   Based on testing to date,
   Pre-increment preferred for:
   - PowerPC G3 (Adler)
   - MIPS R5000 (Randers-Pehrson)
   Post-increment preferred for:
   - none
   No measurable difference:
   - Pentium III (Anderson)
   - M68060 (Nikl)
 */
#ifdef POSTINC
#  define OFF 0
#  define PUP(a) *(a)++
#else
#  define OFF 1
#  define PUP(a) *++(a)
#endif

/*
   Decode literal, length, and distance codes and write out the resulting
   literal and match bytes until either not enough input or output is
   available, an end-of-block is encountered, or a data error is encountered.
   When large enough input and output buffers are supplied to inflate(), for
   example, a 16K input buffer and a 64K output buffer, more than 95% of the
   inflate execution time is spent in this routine.

   Entry assumptions:

        state->mode == LEN
        strm->avail_in >= 6
        strm->avail_out >= 258
        start >= strm->avail_out
        state->bits < 8

   On return, state->mode is one of:

        LEN -- ran out of enough output space or enough available input
        TYPE -- reached end of block code, inflate() to interpret next block
        BAD -- error in block data

   Notes:

    - The maximum input bits used by a length/distance pair is 15 bits for the
      length code, 5 bits for the length extra, 15 bits for the distance code,
      and 13 bits for the distance extra.  This totals 48 bits, or six bytes.
      Therefore if strm->avail_in >= 6, then there is enough input to avoid
      checking for available input while decoding.

    - The maximum bytes that a single length/distance pair can output is 258
      bytes, which is the maximum length that can be coded.  inflate_fast()
      requires strm->avail_out >= 258 for each loop to avoid checking for
      output space.
 */

#define COPY(     dst,src,len)       copy(dst, src, len)
#define COPY_TAIL(dst,back,len) copy_tail(dst, back, len)
#define COPY_INIT(dst)          copy_init(dst)
#define COPY_FINI(dst)          copy_fini(dst)
#define COPY_PUTC(dst,c)        copy_putc(dst, c)

static
unsigned char FAR *
copy(unsigned char FAR *dst, unsigned char FAR *src, unsigned len)
{
    --len;
    goto middle;
    do {
        PUP(dst) = PUP(src); len-=3;
        PUP(dst) = PUP(src);
middle:
        PUP(dst) = PUP(src);
    } while (2<len);
    if (0!=len) {
        PUP(dst) = PUP(src);
        if (1<len) {
            PUP(dst) = PUP(src);
        }
    }
    return dst;
}

static
unsigned char FAR *
copy_tail(unsigned char FAR *dst, unsigned back, unsigned len)
{
	return copy(dst, dst - back, len);
}

static
unsigned char FAR *
copy_init(unsigned char FAR *dst)
{
	return dst;
}

static
unsigned char FAR *
copy_fini(unsigned char FAR *dst)
{
	return dst;
}

static
unsigned char FAR *
copy_putc(unsigned char FAR *dst, unsigned char c)
{
	PUP(dst) = c;
	return dst;
}


/* Retain bit_count low-order bits, set remaining high-order bits to 0:
 *    value &~ (~0U << bit_count)
 * On a machine that has "AND with complement" instruction, also called
 * "bit clear" or "bic", [such as PowerPC, ARM, or XSCALE] this is often
 * one instruction smaller and faster than using
 *    value & ((1U << bit_count) -1)
 * On ARM or XSCALE, it can be two instructions smaller and faster because
 * the right-hand operand can be shifted in the same instruction.
 * The time savings is about 1%.
 * There should be no difference in speed if the machine does not have 'bic'.
 */

void inflate_fast(strm, start)
z_streamp strm;
unsigned start;         /* inflate()'s starting value for strm->avail_out */
{
    /* Copy state to local variables.  Use 'const' as much as possible
     * for clarity and to help the compiler.
     */
    struct inflate_state FAR *const state
                         = (struct inflate_state FAR *)strm->state;
    unsigned char FAR *in       /* local strm->next_in */
                         = strm->next_in - OFF;
    unsigned char FAR *const last  /* while in < last, enough input available */
                         = in + (strm->avail_in - 5);
    unsigned char FAR *out      /* local strm->next_out */
                         = strm->next_out - OFF;
    unsigned char FAR *const beg   /* inflate()'s initial strm->next_out */
                         = out - (start - strm->avail_out);
    unsigned char FAR *const end   /* while out < end, enough space available */
                         = out + (strm->avail_out - 257);
    unsigned const wsize        /* window size or zero if not using window */
                         = state->wsize;
    unsigned const whave        /* valid bytes in the window */
                         = state->whave;
    unsigned const write        /* window write index */
	                 = state->write;
    unsigned char FAR *const window  /* allocated sliding window, if wsize != 0 */
                         = state->window;
    unsigned long hold          /* local strm->hold */
                         = state->hold;
    unsigned bits               /* local strm->bits */
                         = state->bits;
    code const FAR *const lcode /* local strm->lencode */
                         = state->lencode;
    code const FAR *const dcode /* local strm->distcode */
                         = state->distcode;
    unsigned const lmask        /* mask for first level of length codes */
                         = ~(~0U << state->lenbits);
    unsigned const dmask        /* mask for first level of distance codes */
                         = ~(~0U << state->distbits);
    code this;                  /* retrieved table entry */
    unsigned op;                /* code bits, operation, extra bits, or */
                                /*  window position, window bytes to copy */
    char const *errmsg;

    out = COPY_INIT(out);

    /* From now on, use "strm->state", to free up a register.
       We also want strm not in a register during the loop,
       but gcc 3.4.1 uses one anyway.
    */

    if (INFFAST_LITTLE && (1& (int)(OFF + in))) {
        hold += (unsigned long)*(OFF + in) << bits;
        in += 1;
        bits += 8;
    }
    /* decode literals and length/distances until end-of-block or not enough
       input data or output space */
    do {
        if (bits < 15) {
            if (INFFAST_LITTLE) {
                hold += (unsigned long)*(unsigned short *)(OFF + in) << bits;
                in += 2; bits += 16;
            }
            else {
                hold += (unsigned long)(PUP(in)) << bits;
                bits += 8;
                hold += (unsigned long)(PUP(in)) << bits;
                bits += 8;
            }
        }
        this = lcode[hold & lmask];
      dolen:
        op = (unsigned)(this.bits);
        hold >>= op;
        bits -= op;
        op = (unsigned)(this.op);
        if (op == 0) {                          /* literal */
            Tracevv((stderr, this.val >= 0x20 && this.val < 0x7f ?
                    "inflate:         literal '%c'\n" :
                    "inflate:         literal 0x%02x\n", this.val));
            out= COPY_PUTC(out, (unsigned char)this.val);
        }
        else if (op & 16) {                     /* length base */
            unsigned len = (unsigned)(this.val);
            op &= 15;                           /* number of extra bits */
            if (op) {
                if (bits < op) {
                    if (INFFAST_LITTLE) {
                        hold += (unsigned long)*(unsigned short *)(OFF + in) << bits;
                        in += 2; bits += 16;
                    }
                    else {
                        hold += (unsigned long)(PUP(in)) << bits;
                        bits += 8;
                    }
                }
                len += (unsigned)hold &~ (~0U << op);
                hold >>= op;
                bits -= op;
            }
            Tracevv((stderr, "inflate:         length %u\n", len));
            if (bits < 15) {
                if (INFFAST_LITTLE) {
                    hold += (unsigned long)*(unsigned short *)(OFF + in) << bits;
                    in += 2; bits += 16;
                }
                else {
                    hold += (unsigned long)(PUP(in)) << bits;
                    bits += 8;
                    hold += (unsigned long)(PUP(in)) << bits;
                    bits += 8;
                }
            }
            this = dcode[hold & dmask];
          dodist:
            op = (unsigned)(this.bits);
            hold >>= op;
            bits -= op;
            op = (unsigned)(this.op);
            if (op & 16) {                      /* distance base */
                unsigned dist = (unsigned)(this.val);
                op &= 15;                       /* number of extra bits */
                if (bits < op) {
                    if (INFFAST_LITTLE) {
                        hold += (unsigned long)*(unsigned short *)(OFF + in) << bits;
                        in += 2; bits += 16;
                    }
                    else {
                        hold += (unsigned long)(PUP(in)) << bits;
                        bits += 8;
                        if (bits < op) {
                            hold += (unsigned long)(PUP(in)) << bits;
                            bits += 8;
                        }
                    }
                }
                dist += (unsigned)hold &~ (~0U << op);
                hold >>= op;
                bits -= op;
                Tracevv((stderr, "inflate:         distance %u\n", dist));
                op = (unsigned)(out - beg);     /* max distance in output */
                if (dist > op) {                /* see if copy from window */
                    unsigned char FAR *from;
                    op = dist - op;             /* distance back in window */
                    if (op > whave) {
                        errmsg= "invalid distance too far back";
                        goto error;
                    }
                    from = window - OFF;
                    if (write == 0) {           /* common: full or dictionary */
                        from += wsize - op;
                        goto merge;             /* not all from window */
                    }
                    else if (write < op) {      /* wrap around window */
                        from += wsize + write - op;
                        op -= write;
                        if (op < len) {         /* not all from end of window */
                            len -= op;
                            out= COPY(out, from, op);
                            op = write;  /* enlarge tail merge for compiler */
                            from = window - OFF;
                            goto merge;         /* some from start of window */
                        }
                    }
                    else {                      /* contiguous in window */
                        from += write - op;
merge:
                        if (op < len) {         /* not all from window */
                            len -= op;
                            out= COPY(out, from, op);
                            goto tail;          /* rest direct from output */
                        }
                    }
                    out= COPY(out, from, len);  /* all from window */
                }
                else {
                    /* copy direct from output */
tail:
                    out= COPY_TAIL(out, dist, len);
                }
            }
            else if ((op & 64) == 0) {          /* 2nd level distance code */
                this = dcode[this.val + (hold &~ (~0U << op))];
                goto dodist;
            }
            else {
                errmsg= "invalid distance code";
                goto error;
            }
        }
        else if ((op & 64) == 0) {              /* 2nd level length code */
            this = lcode[this.val + (hold &~ (~0U << op))];
            goto dolen;
        }
        else if (op & 32) {                     /* end-of-block */
            goto end_of_block;
        }
        else {
            errmsg= "invalid literal/length code";
            goto error;
        }
    } while (in < last && out < end);

no_err:
    /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
    in -= bits >> 3;
    ((struct inflate_state FAR *)(strm->state))->bits = bits &= ~(~0U << 3);
    ((struct inflate_state FAR *)(strm->state))->hold = hold &  ~(~0U << bits);

    /* update state and return */
    strm->next_in = in + OFF;
    strm->next_out = COPY_FINI(out + OFF);

/* There is no circular buffer.  Commutative, associative, distributive rules.
 *  strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
 *  strm->avail_out = (unsigned)(out < end ?
 *                               257 + (end - out) : 257 - (out - end));
 */
    strm->avail_in = 5 + (last - in);
    strm->avail_out = 257 + (end - out);
    return;

/* Put error and unusual cases completely outside the main loop. */
error:
    strm->msg = (char *)errmsg;
    ((struct inflate_state FAR *)(strm->state))->mode = BAD;
    goto no_err;

end_of_block:
    Tracevv((stderr, "inflate:         end of block\n"));
    ((struct inflate_state FAR *)(strm->state))->mode = TYPE;
    goto no_err;
}

/*
   inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
   - Using bit fields for code structure
   - Different op definition to avoid & for extra bits (do & for table bits)
   - Three separate decoding do-loops for direct, window, and write == 0
   - Special case for distance > 1 copies to do overlapped load and store copy
   - Explicit branch predictions (based on measured branch probabilities)
   - Deferring match copy and interspersed it with decoding subsequent codes
   - Swapping literal/length else
   - Swapping window/direct else
   - Larger unrolled copy loops (three is about right)
   - Moving len -= 3 statement into middle of loop
 */

#endif /* !ASMINF */
