/*
* Amanda, The Advanced Maryland Automatic Network Disk Archiver
* Copyright (c) 2007-2012 Zmanda, Inc. All Rights Reserved.
* Copyright (c) 2013-2016 Carbonite, Inc. All Rights Reserved.
* All Rights Reserved.
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of U.M. not be used in advertising or
* publicity pertaining to distribution of the software without specific,
* written prior permission. U.M. makes no representations about the
* suitability of this software for any purpose. It is provided "as is"
* without express or implied warranty.
*
* U.M. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL U.M.
* BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Authors: the Amanda Development Team. Its members are listed in a
* file named AUTHORS, in the root directory of this distribution.
*/
#include <amanda.h>
#include <amutil.h>
#include <amcrc32chw.h>
#ifdef __SSE4_2__
gboolean compiled_with_sse4_2 = TRUE;
#define POLY 0x82F63B78
/* Multiply a matrix times a vector over the Galois field of two elements,
* GF(2). Each element is a bit in an unsigned integer. mat must have at
* least as many entries as the power of two for most significant one bit in
* vec. */
static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec)
{
uint32_t sum;
sum = 0;
while (vec) {
if (vec & 1)
sum ^= *mat;
vec >>= 1;
mat++;
}
return sum;
}
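/* For example, with vec == 5 (binary 101) the product is mat[0] ^ mat[2]:
 * in GF(2) multiplication is AND and addition is XOR, so the product is the
 * XOR of the matrix rows selected by the set bits of vec. */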
/* Multiply a matrix by itself over GF(2). Both mat and square must have 32
* rows. */
static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat)
{
int n;
for (n = 0; n < 32; n++)
square[n] = gf2_matrix_times(mat, mat[n]);
}
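/* Since these operators are linear, row n of the square is mat applied to
 * row n of mat, so the square applies mat twice.  Squaring the operator for
 * k zero bits therefore yields the operator for 2*k zero bits. */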
/* Construct an operator to apply len zeros to a crc. len must be a power of
* two. If len is not a power of two, then the result is the same as for the
* largest power of two less than len. The result for len == 0 is the same as
* for len == 1. A version of this routine could be easily written for any
* len, but that is not needed for this application. */
static void crc32c_zeros_op(uint32_t *even, size_t len)
{
int n;
uint32_t row;
uint32_t odd[32]; /* odd-power-of-two zeros operator */
/* put operator for one zero bit in odd */
odd[0] = POLY; /* CRC-32C polynomial */
row = 1;
for (n = 1; n < 32; n++) {
odd[n] = row;
row <<= 1;
}
/* put operator for two zero bits in even */
gf2_matrix_square(even, odd);
/* put operator for four zero bits in odd */
gf2_matrix_square(odd, even);
/* first square will put the operator for one zero byte (eight zero bits),
* in even -- next square puts operator for two zero bytes in odd, and so
* on, until len has been rotated down to zero */
do {
gf2_matrix_square(even, odd);
len >>= 1;
if (len == 0)
return;
gf2_matrix_square(odd, even);
len >>= 1;
} while (len);
/* answer ended up in odd -- copy to even */
for (n = 0; n < 32; n++)
even[n] = odd[n];
}
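/* Worked trace for len == 4: entering the loop, odd holds the operator for
 * four zero bits.  The first square puts the one-zero-byte operator in even
 * (len becomes 2), the second puts the two-zero-byte operator in odd (len
 * becomes 1), and the third puts the four-zero-byte operator in even (len
 * becomes 0), at which point the routine returns with the answer in even. */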
/* Take a length and build four lookup tables for applying the zeros operator
* for that length, byte-by-byte on the operand. */
static void crc32c_zeros(uint32_t zeros[][256], size_t len)
{
uint32_t n;
uint32_t op[32];
crc32c_zeros_op(op, len);
for (n = 0; n < 256; n++) {
zeros[0][n] = gf2_matrix_times(op, n);
zeros[1][n] = gf2_matrix_times(op, n << 8);
zeros[2][n] = gf2_matrix_times(op, n << 16);
zeros[3][n] = gf2_matrix_times(op, n << 24);
}
}
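/* By linearity, the operator applied to a 32-bit crc equals the XOR of the
 * operator applied to each of its four bytes separately, so a shift costs
 * four table lookups and three XORs instead of a 32-step matrix-vector
 * product. */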
/* Apply the zeros operator table to crc. */
static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc)
{
    return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
           zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
}
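/* crc32c_shift(crc32c_long, c) returns the crc updated as if LONG zero bytes
 * had been appended to the data, and likewise for the SHORT and LOW tables. */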
/* Block sizes for four-way parallel crc computation.  LONG and SHORT must
 * both be powers of two, as crc32c_zeros_op() requires. */
#define LONG 8192
#define SHORT 256
#define LOW 128
/* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
static uint32_t crc32c_long[4][256];
static uint32_t crc32c_short[4][256];
static uint32_t crc32c_low[4][256];
/* Initialize tables for shifting crcs. */
void
crc32c_init_hw(void)
{
crc32c_zeros(crc32c_long, LONG);
crc32c_zeros(crc32c_short, SHORT);
crc32c_zeros(crc32c_low, LOW);
}
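/* A union of pointer widths, so the buffer can be walked one byte at a time
 * for alignment and trailing bytes, and eight (or four) bytes at a time in
 * the main loops, without scattering casts through the code. */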
typedef struct {
union {
const uint8_t *b8;
const uint32_t *b32;
const uint64_t *b64;
} b;
} multi_b;
/* Compute CRC-32C using the SSE4.2 crc32 hardware instruction. */
void crc32c_add_hw(uint8_t *buf, size_t len, crc_t *crc)
{
multi_b next;
multi_b end;
uint32_t crc32_0;
#ifdef __x86_64__
    const uint64_t *next64_1;
    const uint64_t *next64_2;
    const uint64_t *next64_3;
uint64_t crc64_0, crc64_1, crc64_2, crc64_3; /* need to be 64 bits for crc32q */
#else
    const uint32_t *next32_1;
    const uint32_t *next32_2;
    const uint32_t *next32_3;
uint32_t crc32_1, crc32_2, crc32_3;
#endif
next.b.b8 = buf;
crc->size += len;
    crc32_0 = crc->crc;
/* pre-process the crc */
/* compute the crc for up to seven leading bytes to bring the data pointer
* to an eight-byte boundary */
while (len && ((uintptr_t)next.b.b8 & 7) != 0) {
crc32_0 = __builtin_ia32_crc32qi(crc32_0, *next.b.b8);
next.b.b8++;
len--;
}
#ifdef __x86_64__
    /* compute the crc on sets of LONG*4 bytes, running four independent crc
     * instruction streams, each over LONG bytes -- this is tuned for the
     * Nehalem, Westmere, Sandy Bridge, and Ivy Bridge architectures, where
     * crc32 has a throughput of one instruction per cycle but a latency of
     * three cycles */
crc64_0 = (uint64_t)crc32_0;
while (len >= LONG*4) {
crc64_1 = 0;
crc64_2 = 0;
crc64_3 = 0;
        next64_1 = next.b.b64 + LONG/8;
        next64_2 = next.b.b64 + (LONG/8)*2;
        next64_3 = next.b.b64 + (LONG/8)*3;
end.b.b64 = next64_1;
do {
crc64_0 = __builtin_ia32_crc32di(crc64_0, *next.b.b64++);
crc64_1 = __builtin_ia32_crc32di(crc64_1, *next64_1++);
crc64_2 = __builtin_ia32_crc32di(crc64_2, *next64_2++);
crc64_3 = __builtin_ia32_crc32di(crc64_3, *next64_3++);
} while (next.b.b64 < end.b.b64);
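        /* Fold the four partial crcs together: shifting crc64_0 through LONG
         * zero bytes advances it past the block the next stream covered, so
         * each stream (computed from a zero initial crc) combines with XOR */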
crc64_0 = crc32c_shift(crc32c_long, (uint32_t)crc64_0) ^ crc64_1;
crc64_0 = crc32c_shift(crc32c_long, (uint32_t)crc64_0) ^ crc64_2;
crc64_0 = crc32c_shift(crc32c_long, (uint32_t)crc64_0) ^ crc64_3;
len -= LONG*4;
next.b.b64 = next64_3;
}
/* do the same thing, but now on SHORT*4 blocks for the remaining data less
* than a LONG*4 block */
while (len >= SHORT*4) {
crc64_1 = 0;
crc64_2 = 0;
crc64_3 = 0;
        next64_1 = next.b.b64 + SHORT/8;
        next64_2 = next.b.b64 + (SHORT/8)*2;
        next64_3 = next.b.b64 + (SHORT/8)*3;
end.b.b64 = next64_1;
do {
crc64_0 = __builtin_ia32_crc32di(crc64_0, *next.b.b64++);
crc64_1 = __builtin_ia32_crc32di(crc64_1, *next64_1++);
crc64_2 = __builtin_ia32_crc32di(crc64_2, *next64_2++);
crc64_3 = __builtin_ia32_crc32di(crc64_3, *next64_3++);
} while (next.b.b64 < end.b.b64);
        crc64_0 = crc32c_shift(crc32c_short, (uint32_t)crc64_0) ^ crc64_1;
        crc64_0 = crc32c_shift(crc32c_short, (uint32_t)crc64_0) ^ crc64_2;
        crc64_0 = crc32c_shift(crc32c_short, (uint32_t)crc64_0) ^ crc64_3;
len -= SHORT*4;
next.b.b64 = next64_3;
}
    /* compute the crc on the remaining eight-byte units of data, now less
     * than a SHORT*4 block */
end.b.b8 = next.b.b8 + (len - (len & 7));
while (next.b.b64 < end.b.b64) {
crc64_0 = __builtin_ia32_crc32di(crc64_0, *next.b.b64++);
}
len &= 7;
crc32_0 = (uint32_t)crc64_0;
#else
    /* compute the crc on sets of LONG*4 bytes, running four independent crc
     * instruction streams, each over LONG bytes -- this is tuned for the
     * Nehalem, Westmere, Sandy Bridge, and Ivy Bridge architectures, where
     * crc32 has a throughput of one instruction per cycle but a latency of
     * three cycles */
while (len >= LONG*4) {
crc32_1 = 0;
crc32_2 = 0;
crc32_3 = 0;
        next32_1 = next.b.b32 + LONG/4;
        next32_2 = next.b.b32 + (LONG/4)*2;
        next32_3 = next.b.b32 + (LONG/4)*3;
end.b.b32 = next32_1;
do {
crc32_0 = __builtin_ia32_crc32si(crc32_0, *next.b.b32++);
crc32_1 = __builtin_ia32_crc32si(crc32_1, *next32_1++);
crc32_2 = __builtin_ia32_crc32si(crc32_2, *next32_2++);
crc32_3 = __builtin_ia32_crc32si(crc32_3, *next32_3++);
        } while (next.b.b32 < end.b.b32);
crc32_0 = crc32c_shift(crc32c_long, crc32_0) ^ crc32_1;
crc32_0 = crc32c_shift(crc32c_long, crc32_0) ^ crc32_2;
crc32_0 = crc32c_shift(crc32c_long, crc32_0) ^ crc32_3;
len -= LONG*4;
next.b.b32 = next32_3;
}
/* do the same thing, but now on SHORT*4 blocks for the remaining data less
* than a LONG*4 block */
while (len >= SHORT*4) {
crc32_1 = 0;
crc32_2 = 0;
crc32_3 = 0;
        next32_1 = next.b.b32 + SHORT/4;
        next32_2 = next.b.b32 + (SHORT/4)*2;
        next32_3 = next.b.b32 + (SHORT/4)*3;
end.b.b32 = next32_1;
do {
crc32_0 = __builtin_ia32_crc32si(crc32_0, *next.b.b32++);
crc32_1 = __builtin_ia32_crc32si(crc32_1, *next32_1++);
crc32_2 = __builtin_ia32_crc32si(crc32_2, *next32_2++);
crc32_3 = __builtin_ia32_crc32si(crc32_3, *next32_3++);
} while (next.b.b32 < end.b.b32);
crc32_0 = crc32c_shift(crc32c_short, crc32_0) ^ crc32_1;
crc32_0 = crc32c_shift(crc32c_short, crc32_0) ^ crc32_2;
crc32_0 = crc32c_shift(crc32c_short, crc32_0) ^ crc32_3;
len -= SHORT*4;
next.b.b32 = next32_3;
}
    /* compute the crc on the remaining eight-byte units of data, now less
     * than a SHORT*4 block */
end.b.b8 = next.b.b8 + (len - (len & 7));
while (next.b.b32 < end.b.b32) {
crc32_0 = __builtin_ia32_crc32si(crc32_0, *next.b.b32++);
}
len &= 7;
#endif
/* compute the crc for up to seven trailing bytes */
crc->crc = crc32_0;
switch (len) {
case 7:
crc->crc = __builtin_ia32_crc32qi(crc->crc, *next.b.b8++);
// fall through
case 6:
crc->crc = __builtin_ia32_crc32hi(crc->crc, *(uint16_t*) next.b.b8);
next.b.b8 += 2;
// fall through
// case 5 is below: 4 + 1
case 4:
crc->crc = __builtin_ia32_crc32si(crc->crc, *(uint32_t*) next.b.b8);
break;
case 3:
crc->crc = __builtin_ia32_crc32qi(crc->crc, *next.b.b8++);
// fall through
case 2:
crc->crc = __builtin_ia32_crc32hi(crc->crc, *(uint16_t*) next.b.b8);
break;
case 5:
crc->crc = __builtin_ia32_crc32si(crc->crc, *(uint32_t*) next.b.b8);
next.b.b8 += 4;
// fall through
case 1:
crc->crc = __builtin_ia32_crc32qi(crc->crc, *next.b.b8);
break;
case 0:
break;
    default:
        /* unreachable: len & 7 is always in the range 0..7 */
        /* g_assert_not_reached(); */
        break;
}
}
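/* Usage sketch, for illustration only.  The hardware crc32 instruction
 * implements the bit-reflected CRC-32C polynomial, so with the standard
 * conditioning -- seed the running crc with 0xFFFFFFFF and invert the final
 * value, which callers are assumed to handle elsewhere in amutil -- the
 * well-known check input "123456789" yields 0xE3069283:
 *
 *     crc_t crc;
 *     crc.crc = 0xFFFFFFFF;     // standard CRC-32C initial value (assumed)
 *     crc.size = 0;
 *     crc32c_init_hw();         // build the shift tables first
 *     crc32c_add_hw((uint8_t *)"123456789", 9, &crc);
 *     g_assert((crc.crc ^ 0xFFFFFFFF) == 0xE3069283);
 */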
#else
gboolean compiled_with_sse4_2 = FALSE;
void
crc32c_init_hw(void)
{
g_error("crc32c_init_hw is not defined");
}
void crc32c_add_hw(
uint8_t *buf G_GNUC_UNUSED,
size_t len G_GNUC_UNUSED,
crc_t *crc G_GNUC_UNUSED)
{
g_error("crc32c_add_hw is not defined");
}
#endif