/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */

#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.h"
#include "inc_common.h"

/**
 * vendor specific (or generic) functions
 */

// Stores `v32` into a vconv32_t and returns its `v8.a` member.
// How `v8.a` maps onto the bits of `v32` is defined by vconv32_t (declared elsewhere).
DECLSPEC u8 v8a_from_v32_S (const u32 v32)
{
  vconv32_t conv;

  conv.v32 = v32;

  return conv.v8.a;
}

// Stores `v32` into a vconv32_t and returns its `v8.b` member.
// How `v8.b` maps onto the bits of `v32` is defined by vconv32_t (declared elsewhere).
DECLSPEC u8 v8b_from_v32_S (const u32 v32)
{
  vconv32_t conv;

  conv.v32 = v32;

  return conv.v8.b;
}

// Stores `v32` into a vconv32_t and returns its `v8.c` member.
// How `v8.c` maps onto the bits of `v32` is defined by vconv32_t (declared elsewhere).
DECLSPEC u8 v8c_from_v32_S (const u32 v32)
{
  vconv32_t conv;

  conv.v32 = v32;

  return conv.v8.c;
}

// Stores `v32` into a vconv32_t and returns its `v8.d` member.
// How `v8.d` maps onto the bits of `v32` is defined by vconv32_t (declared elsewhere).
DECLSPEC u8 v8d_from_v32_S (const u32 v32)
{
  vconv32_t conv;

  conv.v32 = v32;

  return conv.v8.d;
}

// Stores `v64` into a vconv64_t and returns its `v8.a` member.
// How `v8.a` maps onto the bits of `v64` is defined by vconv64_t (declared elsewhere).
DECLSPEC u8 v8a_from_v64_S (const u64 v64)
{
  vconv64_t conv;

  conv.v64 = v64;

  return conv.v8.a;
}

// Stores `v64` into a vconv64_t and returns its `v8.b` member.
// How `v8.b` maps onto the bits of `v64` is defined by vconv64_t (declared elsewhere).
DECLSPEC u8 v8b_from_v64_S (const u64 v64)
{
  vconv64_t conv;

  conv.v64 = v64;

  return conv.v8.b;
}

// Stores `v64` into a vconv64_t and returns its `v8.c` member.
// How `v8.c` maps onto the bits of `v64` is defined by vconv64_t (declared elsewhere).
DECLSPEC u8 v8c_from_v64_S (const u64 v64)
{
  vconv64_t conv;

  conv.v64 = v64;

  return conv.v8.c;
}

// Stores `v64` into a vconv64_t and returns its `v8.d` member.
// How `v8.d` maps onto the bits of `v64` is defined by vconv64_t (declared elsewhere).
DECLSPEC u8 v8d_from_v64_S (const u64 v64)
{
  vconv64_t conv;

  conv.v64 = v64;

  return conv.v8.d;
}

// Stores `v64` into a vconv64_t and returns its `v8.e` member.
// How `v8.e` maps onto the bits of `v64` is defined by vconv64_t (declared elsewhere).
DECLSPEC u8 v8e_from_v64_S (const u64 v64)
{
  vconv64_t conv;

  conv.v64 = v64;

  return conv.v8.e;
}

// Stores `v64` into a vconv64_t and returns its `v8.f` member.
// How `v8.f` maps onto the bits of `v64` is defined by vconv64_t (declared elsewhere).
DECLSPEC u8 v8f_from_v64_S (const u64 v64)
{
  vconv64_t conv;

  conv.v64 = v64;

  return conv.v8.f;
}

// Stores `v64` into a vconv64_t and returns its `v8.g` member.
// How `v8.g` maps onto the bits of `v64` is defined by vconv64_t (declared elsewhere).
DECLSPEC u8 v8g_from_v64_S (const u64 v64)
{
  vconv64_t conv;

  conv.v64 = v64;

  return conv.v8.g;
}

// Stores `v64` into a vconv64_t and returns its `v8.h` member.
// How `v8.h` maps onto the bits of `v64` is defined by vconv64_t (declared elsewhere).
DECLSPEC u8 v8h_from_v64_S (const u64 v64)
{
  vconv64_t conv;

  conv.v64 = v64;

  return conv.v8.h;
}

// Vector counterpart of v8a_from_v64_S: applies it to `a` itself when
// VECT_SIZE == 1, otherwise to each component s0..sf enabled by VECT_SIZE.
// Starts from 0 and returns the per-component results.
DECLSPEC u8x v8a_from_v64 (u64x a)
{
  u8x out = 0;

  #if VECT_SIZE == 1
  out = v8a_from_v64_S (a);
  #endif

  #if VECT_SIZE >= 2
  out.s0 = v8a_from_v64_S (a.s0);
  out.s1 = v8a_from_v64_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  out.s2 = v8a_from_v64_S (a.s2);
  out.s3 = v8a_from_v64_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  out.s4 = v8a_from_v64_S (a.s4);
  out.s5 = v8a_from_v64_S (a.s5);
  out.s6 = v8a_from_v64_S (a.s6);
  out.s7 = v8a_from_v64_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  out.s8 = v8a_from_v64_S (a.s8);
  out.s9 = v8a_from_v64_S (a.s9);
  out.sa = v8a_from_v64_S (a.sa);
  out.sb = v8a_from_v64_S (a.sb);
  out.sc = v8a_from_v64_S (a.sc);
  out.sd = v8a_from_v64_S (a.sd);
  out.se = v8a_from_v64_S (a.se);
  out.sf = v8a_from_v64_S (a.sf);
  #endif

  return out;
}

// Vector counterpart of v8b_from_v64_S: applies it to `a` itself when
// VECT_SIZE == 1, otherwise to each component s0..sf enabled by VECT_SIZE.
// Starts from 0 and returns the per-component results.
DECLSPEC u8x v8b_from_v64 (u64x a)
{
  u8x out = 0;

  #if VECT_SIZE == 1
  out = v8b_from_v64_S (a);
  #endif

  #if VECT_SIZE >= 2
  out.s0 = v8b_from_v64_S (a.s0);
  out.s1 = v8b_from_v64_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  out.s2 = v8b_from_v64_S (a.s2);
  out.s3 = v8b_from_v64_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  out.s4 = v8b_from_v64_S (a.s4);
  out.s5 = v8b_from_v64_S (a.s5);
  out.s6 = v8b_from_v64_S (a.s6);
  out.s7 = v8b_from_v64_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  out.s8 = v8b_from_v64_S (a.s8);
  out.s9 = v8b_from_v64_S (a.s9);
  out.sa = v8b_from_v64_S (a.sa);
  out.sb = v8b_from_v64_S (a.sb);
  out.sc = v8b_from_v64_S (a.sc);
  out.sd = v8b_from_v64_S (a.sd);
  out.se = v8b_from_v64_S (a.se);
  out.sf = v8b_from_v64_S (a.sf);
  #endif

  return out;
}

// Vector counterpart of v8c_from_v64_S: applies it to `a` itself when
// VECT_SIZE == 1, otherwise to each component s0..sf enabled by VECT_SIZE.
// Starts from 0 and returns the per-component results.
DECLSPEC u8x v8c_from_v64 (u64x a)
{
  u8x out = 0;

  #if VECT_SIZE == 1
  out = v8c_from_v64_S (a);
  #endif

  #if VECT_SIZE >= 2
  out.s0 = v8c_from_v64_S (a.s0);
  out.s1 = v8c_from_v64_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  out.s2 = v8c_from_v64_S (a.s2);
  out.s3 = v8c_from_v64_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  out.s4 = v8c_from_v64_S (a.s4);
  out.s5 = v8c_from_v64_S (a.s5);
  out.s6 = v8c_from_v64_S (a.s6);
  out.s7 = v8c_from_v64_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  out.s8 = v8c_from_v64_S (a.s8);
  out.s9 = v8c_from_v64_S (a.s9);
  out.sa = v8c_from_v64_S (a.sa);
  out.sb = v8c_from_v64_S (a.sb);
  out.sc = v8c_from_v64_S (a.sc);
  out.sd = v8c_from_v64_S (a.sd);
  out.se = v8c_from_v64_S (a.se);
  out.sf = v8c_from_v64_S (a.sf);
  #endif

  return out;
}

// Vector counterpart of v8d_from_v64_S: applies it to `a` itself when
// VECT_SIZE == 1, otherwise to each component s0..sf enabled by VECT_SIZE.
// Starts from 0 and returns the per-component results.
DECLSPEC u8x v8d_from_v64 (u64x a)
{
  u8x out = 0;

  #if VECT_SIZE == 1
  out = v8d_from_v64_S (a);
  #endif

  #if VECT_SIZE >= 2
  out.s0 = v8d_from_v64_S (a.s0);
  out.s1 = v8d_from_v64_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  out.s2 = v8d_from_v64_S (a.s2);
  out.s3 = v8d_from_v64_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  out.s4 = v8d_from_v64_S (a.s4);
  out.s5 = v8d_from_v64_S (a.s5);
  out.s6 = v8d_from_v64_S (a.s6);
  out.s7 = v8d_from_v64_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  out.s8 = v8d_from_v64_S (a.s8);
  out.s9 = v8d_from_v64_S (a.s9);
  out.sa = v8d_from_v64_S (a.sa);
  out.sb = v8d_from_v64_S (a.sb);
  out.sc = v8d_from_v64_S (a.sc);
  out.sd = v8d_from_v64_S (a.sd);
  out.se = v8d_from_v64_S (a.se);
  out.sf = v8d_from_v64_S (a.sf);
  #endif

  return out;
}

// Vector counterpart of v8e_from_v64_S: applies it to `a` itself when
// VECT_SIZE == 1, otherwise to each component s0..sf enabled by VECT_SIZE.
// Starts from 0 and returns the per-component results.
DECLSPEC u8x v8e_from_v64 (u64x a)
{
  u8x out = 0;

  #if VECT_SIZE == 1
  out = v8e_from_v64_S (a);
  #endif

  #if VECT_SIZE >= 2
  out.s0 = v8e_from_v64_S (a.s0);
  out.s1 = v8e_from_v64_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  out.s2 = v8e_from_v64_S (a.s2);
  out.s3 = v8e_from_v64_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  out.s4 = v8e_from_v64_S (a.s4);
  out.s5 = v8e_from_v64_S (a.s5);
  out.s6 = v8e_from_v64_S (a.s6);
  out.s7 = v8e_from_v64_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  out.s8 = v8e_from_v64_S (a.s8);
  out.s9 = v8e_from_v64_S (a.s9);
  out.sa = v8e_from_v64_S (a.sa);
  out.sb = v8e_from_v64_S (a.sb);
  out.sc = v8e_from_v64_S (a.sc);
  out.sd = v8e_from_v64_S (a.sd);
  out.se = v8e_from_v64_S (a.se);
  out.sf = v8e_from_v64_S (a.sf);
  #endif

  return out;
}

// Vector counterpart of v8f_from_v64_S: applies it to `a` itself when
// VECT_SIZE == 1, otherwise to each component s0..sf enabled by VECT_SIZE.
// Starts from 0 and returns the per-component results.
DECLSPEC u8x v8f_from_v64 (u64x a)
{
  u8x out = 0;

  #if VECT_SIZE == 1
  out = v8f_from_v64_S (a);
  #endif

  #if VECT_SIZE >= 2
  out.s0 = v8f_from_v64_S (a.s0);
  out.s1 = v8f_from_v64_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  out.s2 = v8f_from_v64_S (a.s2);
  out.s3 = v8f_from_v64_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  out.s4 = v8f_from_v64_S (a.s4);
  out.s5 = v8f_from_v64_S (a.s5);
  out.s6 = v8f_from_v64_S (a.s6);
  out.s7 = v8f_from_v64_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  out.s8 = v8f_from_v64_S (a.s8);
  out.s9 = v8f_from_v64_S (a.s9);
  out.sa = v8f_from_v64_S (a.sa);
  out.sb = v8f_from_v64_S (a.sb);
  out.sc = v8f_from_v64_S (a.sc);
  out.sd = v8f_from_v64_S (a.sd);
  out.se = v8f_from_v64_S (a.se);
  out.sf = v8f_from_v64_S (a.sf);
  #endif

  return out;
}

// Vector counterpart of v8g_from_v64_S: applies it to `a` itself when
// VECT_SIZE == 1, otherwise to each component s0..sf enabled by VECT_SIZE.
// Starts from 0 and returns the per-component results.
DECLSPEC u8x v8g_from_v64 (u64x a)
{
  u8x out = 0;

  #if VECT_SIZE == 1
  out = v8g_from_v64_S (a);
  #endif

  #if VECT_SIZE >= 2
  out.s0 = v8g_from_v64_S (a.s0);
  out.s1 = v8g_from_v64_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  out.s2 = v8g_from_v64_S (a.s2);
  out.s3 = v8g_from_v64_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  out.s4 = v8g_from_v64_S (a.s4);
  out.s5 = v8g_from_v64_S (a.s5);
  out.s6 = v8g_from_v64_S (a.s6);
  out.s7 = v8g_from_v64_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  out.s8 = v8g_from_v64_S (a.s8);
  out.s9 = v8g_from_v64_S (a.s9);
  out.sa = v8g_from_v64_S (a.sa);
  out.sb = v8g_from_v64_S (a.sb);
  out.sc = v8g_from_v64_S (a.sc);
  out.sd = v8g_from_v64_S (a.sd);
  out.se = v8g_from_v64_S (a.se);
  out.sf = v8g_from_v64_S (a.sf);
  #endif

  return out;
}

// Vector counterpart of v8h_from_v64_S: applies it to `a` itself when
// VECT_SIZE == 1, otherwise to each component s0..sf enabled by VECT_SIZE.
// Starts from 0 and returns the per-component results.
DECLSPEC u8x v8h_from_v64 (u64x a)
{
  u8x out = 0;

  #if VECT_SIZE == 1
  out = v8h_from_v64_S (a);
  #endif

  #if VECT_SIZE >= 2
  out.s0 = v8h_from_v64_S (a.s0);
  out.s1 = v8h_from_v64_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  out.s2 = v8h_from_v64_S (a.s2);
  out.s3 = v8h_from_v64_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  out.s4 = v8h_from_v64_S (a.s4);
  out.s5 = v8h_from_v64_S (a.s5);
  out.s6 = v8h_from_v64_S (a.s6);
  out.s7 = v8h_from_v64_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  out.s8 = v8h_from_v64_S (a.s8);
  out.s9 = v8h_from_v64_S (a.s9);
  out.sa = v8h_from_v64_S (a.sa);
  out.sb = v8h_from_v64_S (a.sb);
  out.sc = v8h_from_v64_S (a.sc);
  out.sd = v8h_from_v64_S (a.sd);
  out.se = v8h_from_v64_S (a.se);
  out.sf = v8h_from_v64_S (a.sf);
  #endif

  return out;
}

// Stores `v32` into a vconv32_t and returns its `v16.a` member.
// How `v16.a` maps onto the bits of `v32` is defined by vconv32_t (declared elsewhere).
DECLSPEC u16 v16a_from_v32_S (const u32 v32)
{
  vconv32_t conv;

  conv.v32 = v32;

  return conv.v16.a;
}

// Stores `v32` into a vconv32_t and returns its `v16.b` member.
// How `v16.b` maps onto the bits of `v32` is defined by vconv32_t (declared elsewhere).
DECLSPEC u16 v16b_from_v32_S (const u32 v32)
{
  vconv32_t conv;

  conv.v32 = v32;

  return conv.v16.b;
}

// Inverse of the v16{a,b}_from_v32_S pair: fills the `v16.a` and `v16.b`
// members of a vconv32_t and returns the resulting `v32` member.
DECLSPEC u32 v32_from_v16ab_S (const u16 v16a, const u16 v16b)
{
  vconv32_t conv;

  conv.v16.b = v16b;
  conv.v16.a = v16a;

  return conv.v32;
}

// Stores `v64` into a vconv64_t and returns its `v32.a` member.
// How `v32.a` maps onto the bits of `v64` is defined by vconv64_t (declared elsewhere).
DECLSPEC u32 v32a_from_v64_S (const u64 v64)
{
  vconv64_t conv;

  conv.v64 = v64;

  return conv.v32.a;
}

// Stores `v64` into a vconv64_t and returns its `v32.b` member.
// How `v32.b` maps onto the bits of `v64` is defined by vconv64_t (declared elsewhere).
DECLSPEC u32 v32b_from_v64_S (const u64 v64)
{
  vconv64_t conv;

  conv.v64 = v64;

  return conv.v32.b;
}

// Inverse of the v32{a,b}_from_v64_S pair: fills the `v32.a` and `v32.b`
// members of a vconv64_t and returns the resulting `v64` member.
DECLSPEC u64 v64_from_v32ab_S (const u32 v32a, const u32 v32b)
{
  vconv64_t conv;

  conv.v32.b = v32b;
  conv.v32.a = v32a;

  return conv.v64;
}

// unpack functions are similar, but always return u32

// Extracts bits 0..7 of every component of `v32` and returns them
// zero-extended in a u32x.
// NVIDIA builds with HAS_BFE == 1 issue one PTX `bfe.u32` (start 0, length 8)
// per component; every other build uses the shift-and-mask expression.
DECLSPEC u32x unpack_v8a_from_v32 (const u32x v32)
{
  u32x r = 0;

  #if   defined IS_NV  && HAS_BFE  == 1

  // inline asm takes scalar registers, so each enabled component is handled separately
  #if VECT_SIZE == 1
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r) : "r"(v32));
  #endif

  #if VECT_SIZE >= 2
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.s0) : "r"(v32.s0));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.s1) : "r"(v32.s1));
  #endif

  #if VECT_SIZE >= 4
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.s2) : "r"(v32.s2));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.s3) : "r"(v32.s3));
  #endif

  #if VECT_SIZE >= 8
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.s4) : "r"(v32.s4));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.s5) : "r"(v32.s5));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.s6) : "r"(v32.s6));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.s7) : "r"(v32.s7));
  #endif

  #if VECT_SIZE >= 16
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.s8) : "r"(v32.s8));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.s9) : "r"(v32.s9));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.sa) : "r"(v32.sa));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.sb) : "r"(v32.sb));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.sc) : "r"(v32.sc));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.sd) : "r"(v32.sd));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.se) : "r"(v32.se));
  asm volatile ("bfe.u32 %0, %1,  0, 8;" : "=r"(r.sf) : "r"(v32.sf));
  #endif

  // AMD/HIP V_BFE_U32 variant is currently disabled:
  //#elif (defined IS_AMD || defined IS_HIP) && HAS_VBFE == 1
  //__asm__ __volatile__ ("V_BFE_U32 %0, %1, 0, 8;" : "=v"(r) : "v"(v32));
  #else
  // generic path: operates on the whole vector at once
  r = (v32 >> 0) & 0xff;
  #endif

  return r;
}

// Extracts bits 8..15 of every component of `v32` and returns them
// zero-extended in a u32x.
// NVIDIA builds with HAS_BFE == 1 issue one PTX `bfe.u32` (start 8, length 8)
// per component; every other build uses the shift-and-mask expression.
DECLSPEC u32x unpack_v8b_from_v32 (const u32x v32)
{
  u32x r = 0;

  #if   defined IS_NV  && HAS_BFE  == 1

  // inline asm takes scalar registers, so each enabled component is handled separately
  #if VECT_SIZE == 1
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r) : "r"(v32));
  #endif

  #if VECT_SIZE >= 2
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.s0) : "r"(v32.s0));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.s1) : "r"(v32.s1));
  #endif

  #if VECT_SIZE >= 4
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.s2) : "r"(v32.s2));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.s3) : "r"(v32.s3));
  #endif

  #if VECT_SIZE >= 8
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.s4) : "r"(v32.s4));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.s5) : "r"(v32.s5));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.s6) : "r"(v32.s6));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.s7) : "r"(v32.s7));
  #endif

  #if VECT_SIZE >= 16
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.s8) : "r"(v32.s8));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.s9) : "r"(v32.s9));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.sa) : "r"(v32.sa));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.sb) : "r"(v32.sb));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.sc) : "r"(v32.sc));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.sd) : "r"(v32.sd));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.se) : "r"(v32.se));
  asm volatile ("bfe.u32 %0, %1,  8, 8;" : "=r"(r.sf) : "r"(v32.sf));
  #endif

  // AMD/HIP V_BFE_U32 variant is currently disabled:
  //#elif (defined IS_AMD || defined IS_HIP) && HAS_VBFE == 1
  //__asm__ __volatile__ ("V_BFE_U32 %0, %1, 8, 8;" : "=v"(r) : "v"(v32));
  #else
  // generic path: operates on the whole vector at once
  r = (v32 >> 8) & 0xff;
  #endif

  return r;
}

// Extracts bits 16..23 of every component of `v32` and returns them
// zero-extended in a u32x.
// NVIDIA builds with HAS_BFE == 1 issue one PTX `bfe.u32` (start 16, length 8)
// per component; every other build uses the shift-and-mask expression.
DECLSPEC u32x unpack_v8c_from_v32 (const u32x v32)
{
  u32x r = 0;

  #if   defined IS_NV  && HAS_BFE  == 1

  // inline asm takes scalar registers, so each enabled component is handled separately
  #if VECT_SIZE == 1
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r) : "r"(v32));
  #endif

  #if VECT_SIZE >= 2
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.s0) : "r"(v32.s0));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.s1) : "r"(v32.s1));
  #endif

  #if VECT_SIZE >= 4
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.s2) : "r"(v32.s2));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.s3) : "r"(v32.s3));
  #endif

  #if VECT_SIZE >= 8
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.s4) : "r"(v32.s4));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.s5) : "r"(v32.s5));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.s6) : "r"(v32.s6));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.s7) : "r"(v32.s7));
  #endif

  #if VECT_SIZE >= 16
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.s8) : "r"(v32.s8));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.s9) : "r"(v32.s9));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.sa) : "r"(v32.sa));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.sb) : "r"(v32.sb));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.sc) : "r"(v32.sc));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.sd) : "r"(v32.sd));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.se) : "r"(v32.se));
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r.sf) : "r"(v32.sf));
  #endif

  // AMD/HIP V_BFE_U32 variant is currently disabled:
  //#elif (defined IS_AMD || defined IS_HIP) && HAS_VBFE == 1
  //__asm__ __volatile__ ("V_BFE_U32 %0, %1, 16, 8;" : "=v"(r) : "v"(v32));
  #else
  // generic path: operates on the whole vector at once
  r = (v32 >> 16) & 0xff;
  #endif

  return r;
}

// Extracts bits 24..31 of every component of `v32` and returns them
// zero-extended in a u32x.
// NVIDIA builds with HAS_BFE == 1 issue one PTX `bfe.u32` (start 24, length 8)
// per component; every other build uses the shift-and-mask expression.
DECLSPEC u32x unpack_v8d_from_v32 (const u32x v32)
{
  u32x r = 0;

  #if   defined IS_NV  && HAS_BFE  == 1

  // inline asm takes scalar registers, so each enabled component is handled separately
  #if VECT_SIZE == 1
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r) : "r"(v32));
  #endif

  #if VECT_SIZE >= 2
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.s0) : "r"(v32.s0));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.s1) : "r"(v32.s1));
  #endif

  #if VECT_SIZE >= 4
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.s2) : "r"(v32.s2));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.s3) : "r"(v32.s3));
  #endif

  #if VECT_SIZE >= 8
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.s4) : "r"(v32.s4));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.s5) : "r"(v32.s5));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.s6) : "r"(v32.s6));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.s7) : "r"(v32.s7));
  #endif

  #if VECT_SIZE >= 16
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.s8) : "r"(v32.s8));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.s9) : "r"(v32.s9));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.sa) : "r"(v32.sa));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.sb) : "r"(v32.sb));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.sc) : "r"(v32.sc));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.sd) : "r"(v32.sd));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.se) : "r"(v32.se));
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r.sf) : "r"(v32.sf));
  #endif

  // AMD/HIP V_BFE_U32 variant is currently disabled:
  //#elif (defined IS_AMD || defined IS_HIP) && HAS_VBFE == 1
  //__asm__ __volatile__ ("V_BFE_U32 %0, %1, 24, 8;" : "=v"(r) : "v"(v32));
  #else
  // generic path: operates on the whole vector at once
  r = (v32 >> 24) & 0xff;
  #endif

  return r;
}

// Scalar variant: returns bits 0..7 of `v32`, zero-extended to u32.
// NVIDIA builds with HAS_BFE == 1 use PTX `bfe.u32` (start 0, length 8);
// all other builds use shift-and-mask.
DECLSPEC u32 unpack_v8a_from_v32_S (const u32 v32)
{
  u32 r = 0;

  #if   defined IS_NV  && HAS_BFE  == 1
  asm volatile ("bfe.u32 %0, %1, 0, 8;" : "=r"(r) : "r"(v32));
  // AMD/HIP V_BFE_U32 variant is currently disabled:
  //#elif (defined IS_AMD || defined IS_HIP) && HAS_VBFE == 1
  //__asm__ __volatile__ ("V_BFE_U32 %0, %1, 0, 8;" : "=v"(r) : "v"(v32));
  #else
  r = (v32 >> 0) & 0xff;
  #endif

  return r;
}

// Scalar variant: returns bits 8..15 of `v32`, zero-extended to u32.
// NVIDIA builds with HAS_BFE == 1 use PTX `bfe.u32` (start 8, length 8);
// all other builds use shift-and-mask.
DECLSPEC u32 unpack_v8b_from_v32_S (const u32 v32)
{
  u32 r = 0;

  #if   defined IS_NV  && HAS_BFE  == 1
  asm volatile ("bfe.u32 %0, %1, 8, 8;" : "=r"(r) : "r"(v32));
  // AMD/HIP V_BFE_U32 variant is currently disabled:
  //#elif (defined IS_AMD || defined IS_HIP) && HAS_VBFE == 1
  //__asm__ __volatile__ ("V_BFE_U32 %0, %1, 8, 8;" : "=v"(r) : "v"(v32));
  #else
  r = (v32 >> 8) & 0xff;
  #endif

  return r;
}

// Scalar variant: returns bits 16..23 of `v32`, zero-extended to u32.
// NVIDIA builds with HAS_BFE == 1 use PTX `bfe.u32` (start 16, length 8);
// all other builds use shift-and-mask.
DECLSPEC u32 unpack_v8c_from_v32_S (const u32 v32)
{
  u32 r = 0;

  #if   defined IS_NV  && HAS_BFE  == 1
  asm volatile ("bfe.u32 %0, %1, 16, 8;" : "=r"(r) : "r"(v32));
  // AMD/HIP V_BFE_U32 variant is currently disabled:
  //#elif (defined IS_AMD || defined IS_HIP) && HAS_VBFE == 1
  //__asm__ __volatile__ ("V_BFE_U32 %0, %1, 16, 8;" : "=v"(r) : "v"(v32));
  #else
  r = (v32 >> 16) & 0xff;
  #endif

  return r;
}

// Scalar variant: returns bits 24..31 of `v32`, zero-extended to u32.
// NVIDIA builds with HAS_BFE == 1 use PTX `bfe.u32` (start 24, length 8);
// all other builds use shift-and-mask.
DECLSPEC u32 unpack_v8d_from_v32_S (const u32 v32)
{
  u32 r = 0;

  #if   defined IS_NV  && HAS_BFE  == 1
  asm volatile ("bfe.u32 %0, %1, 24, 8;" : "=r"(r) : "r"(v32));
  // AMD/HIP V_BFE_U32 variant is currently disabled:
  //#elif (defined IS_AMD || defined IS_HIP) && HAS_VBFE == 1
  //__asm__ __volatile__ ("V_BFE_U32 %0, %1, 24, 8;" : "=v"(r) : "v"(v32));
  #else
  r = (v32 >> 24) & 0xff;
  #endif

  return r;
}

// "Low" 32-bit half of `a`: forwards to v32a_from_v64_S, i.e. the `v32.a`
// member of the vconv64_t view.
DECLSPEC u32 l32_from_64_S (u64 a)
{
  const u32 lo = v32a_from_v64_S (a);

  return lo;
}

// "High" 32-bit half of `a`: forwards to v32b_from_v64_S, i.e. the `v32.b`
// member of the vconv64_t view.
DECLSPEC u32 h32_from_64_S (u64 a)
{
  const u32 hi = v32b_from_v64_S (a);

  return hi;
}

// Combines two 32-bit words into one 64-bit value.
// Note the argument swap: `a` becomes the `v32.b` member and `b` the `v32.a`
// member, mirroring h32_from_64_S / l32_from_64_S.
DECLSPEC u64 hl32_to_64_S (const u32 a, const u32 b)
{
  const u64 joined = v64_from_v32ab_S (b, a);

  return joined;
}

// Vector counterpart of l32_from_64_S: applies it to `a` itself when
// VECT_SIZE == 1, otherwise to each component s0..sf enabled by VECT_SIZE.
DECLSPEC u32x l32_from_64 (u64x a)
{
  u32x out = 0;

  #if VECT_SIZE == 1
  out = l32_from_64_S (a);
  #endif

  #if VECT_SIZE >= 2
  out.s0 = l32_from_64_S (a.s0);
  out.s1 = l32_from_64_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  out.s2 = l32_from_64_S (a.s2);
  out.s3 = l32_from_64_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  out.s4 = l32_from_64_S (a.s4);
  out.s5 = l32_from_64_S (a.s5);
  out.s6 = l32_from_64_S (a.s6);
  out.s7 = l32_from_64_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  out.s8 = l32_from_64_S (a.s8);
  out.s9 = l32_from_64_S (a.s9);
  out.sa = l32_from_64_S (a.sa);
  out.sb = l32_from_64_S (a.sb);
  out.sc = l32_from_64_S (a.sc);
  out.sd = l32_from_64_S (a.sd);
  out.se = l32_from_64_S (a.se);
  out.sf = l32_from_64_S (a.sf);
  #endif

  return out;
}

// Vector counterpart of h32_from_64_S: applies it to `a` itself when
// VECT_SIZE == 1, otherwise to each component s0..sf enabled by VECT_SIZE.
DECLSPEC u32x h32_from_64 (u64x a)
{
  u32x out = 0;

  #if VECT_SIZE == 1
  out = h32_from_64_S (a);
  #endif

  #if VECT_SIZE >= 2
  out.s0 = h32_from_64_S (a.s0);
  out.s1 = h32_from_64_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  out.s2 = h32_from_64_S (a.s2);
  out.s3 = h32_from_64_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  out.s4 = h32_from_64_S (a.s4);
  out.s5 = h32_from_64_S (a.s5);
  out.s6 = h32_from_64_S (a.s6);
  out.s7 = h32_from_64_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  out.s8 = h32_from_64_S (a.s8);
  out.s9 = h32_from_64_S (a.s9);
  out.sa = h32_from_64_S (a.sa);
  out.sb = h32_from_64_S (a.sb);
  out.sc = h32_from_64_S (a.sc);
  out.sd = h32_from_64_S (a.sd);
  out.se = h32_from_64_S (a.se);
  out.sf = h32_from_64_S (a.sf);
  #endif

  return out;
}

// Vector counterpart of hl32_to_64_S: joins each component pair (a, b) into a
// 64-bit component. hl32_to_64_S (a, b) is v64_from_v32ab_S (b, a), so `a`
// supplies the `v32.b` member and `b` the `v32.a` member.
DECLSPEC u64x hl32_to_64 (const u32x a, const u32x b)
{
  u64x out;

  #if VECT_SIZE == 1
  out = hl32_to_64_S (a, b);
  #endif

  #if VECT_SIZE >= 2
  out.s0 = hl32_to_64_S (a.s0, b.s0);
  out.s1 = hl32_to_64_S (a.s1, b.s1);
  #endif

  #if VECT_SIZE >= 4
  out.s2 = hl32_to_64_S (a.s2, b.s2);
  out.s3 = hl32_to_64_S (a.s3, b.s3);
  #endif

  #if VECT_SIZE >= 8
  out.s4 = hl32_to_64_S (a.s4, b.s4);
  out.s5 = hl32_to_64_S (a.s5, b.s5);
  out.s6 = hl32_to_64_S (a.s6, b.s6);
  out.s7 = hl32_to_64_S (a.s7, b.s7);
  #endif

  #if VECT_SIZE >= 16
  out.s8 = hl32_to_64_S (a.s8, b.s8);
  out.s9 = hl32_to_64_S (a.s9, b.s9);
  out.sa = hl32_to_64_S (a.sa, b.sa);
  out.sb = hl32_to_64_S (a.sb, b.sb);
  out.sc = hl32_to_64_S (a.sc, b.sc);
  out.sd = hl32_to_64_S (a.sd, b.sd);
  out.se = hl32_to_64_S (a.se, b.se);
  out.sf = hl32_to_64_S (a.sf, b.sf);
  #endif

  return out;
}

// bit rotates
//
// For _CPU_OPENCL_EMU_H we don't need to care about vector functions
// The VECT_SIZE is guaranteed to be set to 1 from cpu_opencl_emu.h

// Rotates every 32-bit component of `a` left by `n` bits.
// CPU emulation, CUDA and HIP builds delegate to rotl32; other builds use the
// rotate() builtin when USE_ROTATE is defined, else a shift/or pair.
DECLSPEC u32x hc_rotl32 (const u32x a, const int n)
{
  #if defined _CPU_OPENCL_EMU_H || defined IS_CUDA || defined IS_HIP
  return rotl32 (a, n);
  #elif defined USE_ROTATE
  return rotate (a, make_u32x (n));
  #else
  const u32x upper = a << n;
  const u32x lower = a >> (32 - n);

  return (upper | lower);
  #endif
}

// Rotates every 32-bit component of `a` right by `n` bits.
// CPU emulation, CUDA and HIP builds delegate to rotr32; other builds use the
// rotate() builtin with a left count of (32 - n) when USE_ROTATE is defined,
// else a shift/or pair.
DECLSPEC u32x hc_rotr32 (const u32x a, const int n)
{
  #if defined _CPU_OPENCL_EMU_H || defined IS_CUDA || defined IS_HIP
  return rotr32 (a, n);
  #elif defined USE_ROTATE
  return rotate (a, make_u32x (32 - n));
  #else
  const u32x lower = a >> n;
  const u32x upper = a << (32 - n);

  return (lower | upper);
  #endif
}

// Scalar 32-bit rotate left by `n` bits.
// CPU emulation delegates to rotl32, CUDA/HIP to rotl32_S; other builds use
// the rotate() builtin when USE_ROTATE is defined, else a shift/or pair.
DECLSPEC u32 hc_rotl32_S (const u32 a, const int n)
{
  #if defined _CPU_OPENCL_EMU_H
  return rotl32 (a, n);
  #elif defined IS_CUDA || defined IS_HIP
  return rotl32_S (a, n);
  #elif defined USE_ROTATE
  return rotate (a, (u32) (n));
  #else
  const u32 upper = a << n;
  const u32 lower = a >> (32 - n);

  return (upper | lower);
  #endif
}

// Scalar 32-bit rotate right by `n` bits.
// CPU emulation delegates to rotr32, CUDA/HIP to rotr32_S; other builds use
// the rotate() builtin with a left count of (32 - n) when USE_ROTATE is
// defined, else a shift/or pair.
DECLSPEC u32 hc_rotr32_S (const u32 a, const int n)
{
  #if defined _CPU_OPENCL_EMU_H
  return rotr32 (a, n);
  #elif defined IS_CUDA || defined IS_HIP
  return rotr32_S (a, n);
  #elif defined USE_ROTATE
  return rotate (a, (u32) (32 - n));
  #else
  const u32 lower = a >> n;
  const u32 upper = a << (32 - n);

  return (lower | upper);
  #endif
}

// Rotates every 64-bit component of `a` left by `n` bits.
// CPU emulation, CUDA, AMD and HIP builds delegate to rotl64; other builds use
// the rotate() builtin when USE_ROTATE is defined, else a shift/or pair.
DECLSPEC u64x hc_rotl64 (const u64x a, const int n)
{
  #if defined _CPU_OPENCL_EMU_H || defined IS_CUDA || defined IS_AMD || defined IS_HIP
  return rotl64 (a, n);
  #elif defined USE_ROTATE
  return rotate (a, make_u64x (n));
  #else
  const u64x upper = a << n;
  const u64x lower = a >> (64 - n);

  return (upper | lower);
  #endif
}

// Rotates every 64-bit component of `a` right by `n` bits.
// CPU emulation, CUDA, AMD and HIP builds delegate to rotr64; other builds use
// the rotate() builtin with a left count of (64 - n) when USE_ROTATE is
// defined, else a shift/or pair.
DECLSPEC u64x hc_rotr64 (const u64x a, const int n)
{
  #if defined _CPU_OPENCL_EMU_H || defined IS_CUDA || defined IS_AMD || defined IS_HIP
  return rotr64 (a, n);
  #elif defined USE_ROTATE
  return rotate (a, make_u64x (64 - n));
  #else
  const u64x lower = a >> n;
  const u64x upper = a << (64 - n);

  return (lower | upper);
  #endif
}

// Scalar 64-bit rotate left by `n` bits.
// CPU emulation delegates to rotl64, CUDA/AMD/HIP to rotl64_S; other builds
// use the rotate() builtin when USE_ROTATE is defined, else a shift/or pair.
DECLSPEC u64 hc_rotl64_S (const u64 a, const int n)
{
  #if defined _CPU_OPENCL_EMU_H
  return rotl64 (a, n);
  #elif defined IS_CUDA || defined IS_AMD || defined IS_HIP
  return rotl64_S (a, n);
  #elif defined USE_ROTATE
  return rotate (a, (u64) (n));
  #else
  const u64 upper = a << n;
  const u64 lower = a >> (64 - n);

  return (upper | lower);
  #endif
}

// Scalar 64-bit rotate right by `n` bits.
// CPU emulation delegates to rotr64, CUDA/AMD/HIP to rotr64_S; other builds
// use the rotate() builtin with a left count of (64 - n) when USE_ROTATE is
// defined, else a shift/or pair.
DECLSPEC u64 hc_rotr64_S (const u64 a, const int n)
{
  #if defined _CPU_OPENCL_EMU_H
  return rotr64 (a, n);
  #elif defined IS_CUDA || defined IS_AMD || defined IS_HIP
  return rotr64_S (a, n);
  #elif defined USE_ROTATE
  return rotate (a, (u64) (64 - n));
  #else
  const u64 lower = a >> n;
  const u64 upper = a << (64 - n);

  return (lower | upper);
  #endif
}

// byte-order swap (reverses the bytes within each word)

// Reverses the byte order of every 32-bit component of `v`.
// Implementation is selected at compile time:
//   - CPU emulation:               byte_swap_32 ()
//   - AMD/HIP with HAS_VPERM == 1: one V_PERM_B32 per component, selector 0x00010203
//   - NVIDIA with HAS_PRMT == 1:   one prmt.b32 per component, selector 0x0123
//   - otherwise:                   bitselect/rotate when both USE_BITSELECT and
//                                  USE_ROTATE are defined, else mask-and-shift
DECLSPEC u32x hc_swap32 (const u32x v)
{
  u32x r;

  #ifdef _CPU_OPENCL_EMU_H
  r = byte_swap_32 (v);
  #else
  #if   (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1

  // byte selector passed as the last V_PERM_B32 operand
  const u32 m = 0x00010203;

  #if VECT_SIZE == 1
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r) : "v"(v), "v"(m));
  #endif

  #if VECT_SIZE >= 2
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s0) : "v"(v.s0), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s1) : "v"(v.s1), "v"(m));
  #endif

  #if VECT_SIZE >= 4
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s2) : "v"(v.s2), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s3) : "v"(v.s3), "v"(m));
  #endif

  #if VECT_SIZE >= 8
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s4) : "v"(v.s4), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s5) : "v"(v.s5), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s6) : "v"(v.s6), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s7) : "v"(v.s7), "v"(m));
  #endif

  #if VECT_SIZE >= 16
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s8) : "v"(v.s8), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s9) : "v"(v.s9), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sa) : "v"(v.sa), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sb) : "v"(v.sb), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sc) : "v"(v.sc), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sd) : "v"(v.sd), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.se) : "v"(v.se), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sf) : "v"(v.sf), "v"(m));
  #endif

  #elif defined IS_NV  && HAS_PRMT  == 1

  // PTX byte permute; the selector 0x0123 is an immediate in the asm string
  #if VECT_SIZE == 1
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));
  #endif

  #if VECT_SIZE >= 2
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.s0) : "r"(v.s0));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.s1) : "r"(v.s1));
  #endif

  #if VECT_SIZE >= 4
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.s2) : "r"(v.s2));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.s3) : "r"(v.s3));
  #endif

  #if VECT_SIZE >= 8
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.s4) : "r"(v.s4));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.s5) : "r"(v.s5));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.s6) : "r"(v.s6));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.s7) : "r"(v.s7));
  #endif

  #if VECT_SIZE >= 16
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.s8) : "r"(v.s8));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.s9) : "r"(v.s9));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.sa) : "r"(v.sa));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.sb) : "r"(v.sb));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.sc) : "r"(v.sc));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.sd) : "r"(v.sd));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.se) : "r"(v.se));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r.sf) : "r"(v.sf));
  #endif

  #else

  #if defined USE_BITSELECT && defined USE_ROTATE
  // merge two rotations of v (by 24 and by 8) under the mask 0x00ff00ff
  r = bitselect (rotate (v, make_u32x (24)),
                 rotate (v, make_u32x ( 8)),
                            make_u32x (0x00ff00ff));
  #else
  // portable form: move each of the four bytes to its mirrored position
  r = ((v & make_u32x (0xff000000)) >> 24)
    | ((v & make_u32x (0x00ff0000)) >>  8)
    | ((v & make_u32x (0x0000ff00)) <<  8)
    | ((v & make_u32x (0x000000ff)) << 24);
  #endif

  #endif

  #endif

  return r;
}

// Scalar variant: reverses the byte order of the 32-bit value `v`.
// Implementation is selected at compile time:
//   - CPU emulation:               byte_swap_32 ()
//   - AMD/HIP with HAS_VPERM == 1: V_PERM_B32 with selector 0x00010203
//   - NVIDIA with HAS_PRMT == 1:   prmt.b32 with selector 0x0123
//   - otherwise:                   uchar4 swizzle when USE_SWIZZLE is defined,
//                                  else mask-and-shift
DECLSPEC u32 hc_swap32_S (const u32 v)
{
  u32 r;

  #ifdef _CPU_OPENCL_EMU_H
  r = byte_swap_32 (v);
  #else
  #if   (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r) : "v"(v), "v"(0x00010203));
  #elif defined IS_NV  && HAS_PRMT  == 1
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(r) : "r"(v));
  #else
  #ifdef USE_SWIZZLE
  // reinterpret as four bytes, reverse their order, reinterpret back
  r = as_uint (as_uchar4 (v).s3210);
  #else
  // portable form: move each of the four bytes to its mirrored position
  r = ((v & 0xff000000) >> 24)
    | ((v & 0x00ff0000) >>  8)
    | ((v & 0x0000ff00) <<  8)
    | ((v & 0x000000ff) << 24);
  #endif
  #endif
  #endif

  return r;
}

DECLSPEC u64x hc_swap64 (const u64x v)
{
  u64x r;

  #ifdef _CPU_OPENCL_EMU_H
  r = byte_swap_64 (v);
  #else
  #if   (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1

  const u32 m = 0x00010203;

  const u32x a0 = h32_from_64 (v);
  const u32x a1 = l32_from_64 (v);

  u32x t0;
  u32x t1;

  #if VECT_SIZE == 1
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0) : "v"(0), "v"(a0), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1) : "v"(0), "v"(a1), "v"(m));
  #endif

  #if VECT_SIZE >= 2
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s0) : "v"(0), "v"(a0.s0), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s0) : "v"(0), "v"(a1.s0), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s1) : "v"(0), "v"(a0.s1), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s1) : "v"(0), "v"(a1.s1), "v"(m));
  #endif

  #if VECT_SIZE >= 4
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s2) : "v"(0), "v"(a0.s2), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s2) : "v"(0), "v"(a1.s2), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s3) : "v"(0), "v"(a0.s3), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s3) : "v"(0), "v"(a1.s3), "v"(m));
  #endif

  #if VECT_SIZE >= 8
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s4) : "v"(0), "v"(a0.s4), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s4) : "v"(0), "v"(a1.s4), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s5) : "v"(0), "v"(a0.s5), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s5) : "v"(0), "v"(a1.s5), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s6) : "v"(0), "v"(a0.s6), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s6) : "v"(0), "v"(a1.s6), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s7) : "v"(0), "v"(a0.s7), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s7) : "v"(0), "v"(a1.s7), "v"(m));
  #endif

  #if VECT_SIZE >= 16
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s8) : "v"(0), "v"(a0.s8), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s8) : "v"(0), "v"(a1.s8), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s9) : "v"(0), "v"(a0.s9), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s9) : "v"(0), "v"(a1.s9), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sa) : "v"(0), "v"(a0.sa), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sa) : "v"(0), "v"(a1.sa), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sb) : "v"(0), "v"(a0.sb), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sb) : "v"(0), "v"(a1.sb), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sc) : "v"(0), "v"(a0.sc), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sc) : "v"(0), "v"(a1.sc), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sd) : "v"(0), "v"(a0.sd), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sd) : "v"(0), "v"(a1.sd), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.se) : "v"(0), "v"(a0.se), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.se) : "v"(0), "v"(a1.se), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sf) : "v"(0), "v"(a0.sf), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sf) : "v"(0), "v"(a1.sf), "v"(m));
  #endif

  r = hl32_to_64 (t1, t0);

  #elif defined IS_NV && HAS_MOV64 == 1 && HAS_PRMT == 1

  u32x il;
  u32x ir;

  #if VECT_SIZE == 1
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));
  #endif

  #if VECT_SIZE >= 2
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.s0), "=r"(ir.s0) : "l"(v.s0));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.s1), "=r"(ir.s1) : "l"(v.s1));
  #endif

  #if VECT_SIZE >= 4
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.s2), "=r"(ir.s2) : "l"(v.s2));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.s3), "=r"(ir.s3) : "l"(v.s3));
  #endif

  #if VECT_SIZE >= 8
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.s4), "=r"(ir.s4) : "l"(v.s4));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.s5), "=r"(ir.s5) : "l"(v.s5));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.s6), "=r"(ir.s6) : "l"(v.s6));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.s7), "=r"(ir.s7) : "l"(v.s7));
  #endif

  #if VECT_SIZE >= 16
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.s8), "=r"(ir.s8) : "l"(v.s8));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.s9), "=r"(ir.s9) : "l"(v.s9));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.sa), "=r"(ir.sa) : "l"(v.sa));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.sb), "=r"(ir.sb) : "l"(v.sb));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.sc), "=r"(ir.sc) : "l"(v.sc));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.sd), "=r"(ir.sd) : "l"(v.sd));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.se), "=r"(ir.se) : "l"(v.se));
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il.sf), "=r"(ir.sf) : "l"(v.sf));
  #endif

  u32x tl;
  u32x tr;

  #if VECT_SIZE == 1
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));
  #endif

  #if VECT_SIZE >= 2
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.s0) : "r"(il.s0));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.s0) : "r"(ir.s0));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.s1) : "r"(il.s1));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.s1) : "r"(ir.s1));
  #endif

  #if VECT_SIZE >= 4
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.s2) : "r"(il.s2));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.s2) : "r"(ir.s2));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.s3) : "r"(il.s3));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.s3) : "r"(ir.s3));
  #endif

  #if VECT_SIZE >= 8
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.s4) : "r"(il.s4));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.s4) : "r"(ir.s4));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.s5) : "r"(il.s5));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.s5) : "r"(ir.s5));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.s6) : "r"(il.s6));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.s6) : "r"(ir.s6));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.s7) : "r"(il.s7));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.s7) : "r"(ir.s7));
  #endif

  #if VECT_SIZE >= 16
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.s8) : "r"(il.s8));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.s8) : "r"(ir.s8));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.s9) : "r"(il.s9));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.s9) : "r"(ir.s9));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.sa) : "r"(il.sa));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.sa) : "r"(ir.sa));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.sb) : "r"(il.sb));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.sb) : "r"(ir.sb));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.sc) : "r"(il.sc));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.sc) : "r"(ir.sc));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.sd) : "r"(il.sd));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.sd) : "r"(ir.sd));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.se) : "r"(il.se));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.se) : "r"(ir.se));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl.sf) : "r"(il.sf));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr.sf) : "r"(ir.sf));
  #endif

  #if VECT_SIZE == 1
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));
  #endif

  #if VECT_SIZE >= 2
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.s0) : "r"(tr.s0), "r"(tl.s0));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.s1) : "r"(tr.s1), "r"(tl.s1));
  #endif

  #if VECT_SIZE >= 4
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.s2) : "r"(tr.s2), "r"(tl.s2));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.s3) : "r"(tr.s3), "r"(tl.s3));
  #endif

  #if VECT_SIZE >= 8
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.s4) : "r"(tr.s4), "r"(tl.s4));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.s5) : "r"(tr.s5), "r"(tl.s5));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.s6) : "r"(tr.s6), "r"(tl.s6));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.s7) : "r"(tr.s7), "r"(tl.s7));
  #endif

  #if VECT_SIZE >= 16
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.s8) : "r"(tr.s8), "r"(tl.s8));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.s9) : "r"(tr.s9), "r"(tl.s9));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.sa) : "r"(tr.sa), "r"(tl.sa));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.sb) : "r"(tr.sb), "r"(tl.sb));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.sc) : "r"(tr.sc), "r"(tl.sc));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.sd) : "r"(tr.sd), "r"(tl.sd));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.se) : "r"(tr.se), "r"(tl.se));
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r.sf) : "r"(tr.sf), "r"(tl.sf));
  #endif

  #else

  #if defined USE_BITSELECT && defined USE_ROTATE

  r = bitselect (bitselect (rotate (v, make_u64x (24)),
                            rotate (v, make_u64x ( 8)),
                                       make_u64x (0x000000ff000000ffUL)),
                 bitselect (rotate (v, make_u64x (56)),
                            rotate (v, make_u64x (40)),
                                       make_u64x (0x00ff000000ff0000UL)),
                                       make_u64x (0xffff0000ffff0000UL));
  #else

  r = ((v & make_u64x (0xff00000000000000UL)) >> 56)
    | ((v & make_u64x (0x00ff000000000000UL)) >> 40)
    | ((v & make_u64x (0x0000ff0000000000UL)) >> 24)
    | ((v & make_u64x (0x000000ff00000000UL)) >>  8)
    | ((v & make_u64x (0x00000000ff000000UL)) <<  8)
    | ((v & make_u64x (0x0000000000ff0000UL)) << 24)
    | ((v & make_u64x (0x000000000000ff00UL)) << 40)
    | ((v & make_u64x (0x00000000000000ffUL)) << 56);

  #endif

  #endif
  #endif

  return r;
}

// Scalar 64-bit byte swap: returns v with its eight bytes in reversed order
// (byte 0 <-> byte 7, byte 1 <-> byte 6, ...), as spelled out by the generic
// mask-and-shift fallback at the bottom. The other branches are alternative
// implementations selected at compile time.
DECLSPEC u64 hc_swap64_S (const u64 v)
{
  u64 r;

  #ifdef _CPU_OPENCL_EMU_H
  // CPU emulation build: delegate to the byte_swap_64 helper
  r = byte_swap_64 (v);
  #else
  #if   (defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1
  // AMD/HIP with V_PERM_B32: each 32-bit half is permuted with selector m
  const u32 m = 0x00010203;

  // presumably the high and low words of v, going by the helper names
  const u32 v0 = h32_from_64_S (v);
  const u32 v1 = l32_from_64_S (v);

  u32 t0;
  u32 t1;

  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t0) : "v"(v0), "v"(m));
  __asm__ __volatile__ ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t1) : "v"(v1), "v"(m));

  // the two permuted halves are recombined in exchanged positions (t1 first)
  r = hl32_to_64_S (t1, t0);
  #elif defined IS_NV  && HAS_PRMT  == 1
  u32 il;
  u32 ir;

  // unpack v into two 32-bit registers
  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(v));

  u32 tl;
  u32 tr;

  // permute each half with prmt.b32 and selector 0x0123
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tl) : "r"(il));
  asm volatile ("prmt.b32 %0, %1, 0, 0x0123;" : "=r"(tr) : "r"(ir));

  // pack the halves back in exchanged order (tr first, then tl)
  asm volatile ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tr), "r"(tl));
  #else
  #ifdef USE_SWIZZLE
  // reinterpret v as eight bytes and reverse the component order
  r = as_ulong (as_uchar8 (v).s76543210);
  #else
  // portable fallback: move every byte to its mirrored position
  r = ((v & (u64) 0xff00000000000000UL) >> 56)
    | ((v & (u64) 0x00ff000000000000UL) >> 40)
    | ((v & (u64) 0x0000ff0000000000UL) >> 24)
    | ((v & (u64) 0x000000ff00000000UL) >>  8)
    | ((v & (u64) 0x00000000ff000000UL) <<  8)
    | ((v & (u64) 0x0000000000ff0000UL) << 24)
    | ((v & (u64) 0x000000000000ff00UL) << 40)
    | ((v & (u64) 0x00000000000000ffUL) << 56);
  #endif
  #endif
  #endif

  return r;
}

#if (defined IS_AMD || defined IS_HIP)

// Vector bit-field extract: every component of a is shifted right by the
// matching component of b and then masked with ((1 << c) - 1).
DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c)
{
  const u32x field_mask = (make_u32x (1u) << c) - 1;

  return (a >> b) & field_mask;
}

// Scalar bit-field extract: a is shifted right by b and then masked with
// ((1 << c) - 1).
DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
{
  const u32 field_mask = (1u << c) - 1;

  return (a >> b) & field_mask;
}

// Vector variant: returns the low 32 bits of the 64-bit concatenation a:b
// (a is the upper word) shifted right by (c & 3) bytes.
DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
{
  u32x r = 0;

  // only the two lowest bits of c select the byte offset
  switch (c & 3)
  {
    case 0: r = b;                     break;
    case 1: r = (a << 24) | (b >>  8); break;
    case 2: r = (a << 16) | (b >> 16); break;
    case 3: r = (a <<  8) | (b >> 24); break;
  }

  return r;
}

// Scalar variant: returns the low 32 bits of the 64-bit concatenation a:b
// (a is the upper word) shifted right by (c & 3) bytes.
DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
{
  u32 r = 0;

  // only the two lowest bits of c select the byte offset
  switch (c & 3)
  {
    case 0: r = b;                     break;
    case 1: r = (a << 24) | (b >>  8); break;
    case 2: r = (a << 16) | (b >> 16); break;
    case 3: r = (a <<  8) | (b >> 24); break;
  }

  return r;
}

// Vector variant: returns the upper 32 bits of the 64-bit concatenation b:a
// (b is the upper word) shifted left by (c & 3) bytes.
DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
{
  u32x r = 0;

  // only the two lowest bits of c select the byte offset
  switch (c & 3)
  {
    case 0: r = b;                     break;
    case 1: r = (a >> 24) | (b <<  8); break;
    case 2: r = (a >> 16) | (b << 16); break;
    case 3: r = (a >>  8) | (b << 24); break;
  }

  return r;
}

// Scalar variant: returns the upper 32 bits of the 64-bit concatenation b:a
// (b is the upper word) shifted left by (c & 3) bytes.
DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
{
  u32 r = 0;

  // only the two lowest bits of c select the byte offset
  switch (c & 3)
  {
    case 0: r = b;                     break;
    case 1: r = (a >> 24) | (b <<  8); break;
    case 2: r = (a >> 16) | (b << 16); break;
    case 3: r = (a >>  8) | (b << 24); break;
  }

  return r;
}

#if HAS_VPERM
// Vector byte permute built on V_PERM_B32: one instruction is issued per
// vector component, all sharing the same selector c. Note the asm operand
// order: b is passed before a.
DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const int c)
{
  // emits the permute for a single vector component
  #define HC_VPERM_LANE(lane) \
    __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.lane) : "v"(b.lane), "v"(a.lane), "v"(c))

  u32x r = 0;

  #if VECT_SIZE == 1
  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r) : "v"(b), "v"(a), "v"(c));
  #endif

  #if VECT_SIZE >= 2
  HC_VPERM_LANE (s0);
  HC_VPERM_LANE (s1);
  #endif

  #if VECT_SIZE >= 4
  HC_VPERM_LANE (s2);
  HC_VPERM_LANE (s3);
  #endif

  #if VECT_SIZE >= 8
  HC_VPERM_LANE (s4);
  HC_VPERM_LANE (s5);
  HC_VPERM_LANE (s6);
  HC_VPERM_LANE (s7);
  #endif

  #if VECT_SIZE >= 16
  HC_VPERM_LANE (s8);
  HC_VPERM_LANE (s9);
  HC_VPERM_LANE (sa);
  HC_VPERM_LANE (sb);
  HC_VPERM_LANE (sc);
  HC_VPERM_LANE (sd);
  HC_VPERM_LANE (se);
  HC_VPERM_LANE (sf);
  #endif

  #undef HC_VPERM_LANE

  return r;
}

// Scalar byte permute: a single V_PERM_B32 with selector c. Note the asm
// operand order: b is passed before a.
DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const int c)
{
  u32 permuted = 0;

  __asm__ __volatile__ ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(permuted) : "v"(b), "v"(a), "v"(c));

  return permuted;
}
#endif

#if HAS_VADD3
// Vector three-operand addition.
// NOTE: a per-component V_ADD3_U32 inline-asm variant used to sit here in
// commented-out form; only the plain expression below is compiled.
DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
{
  const u32x sum = a + b + c;

  return sum;
}

// Scalar three-operand addition.
// NOTE: a V_ADD3_U32 inline-asm variant used to sit here in commented-out
// form; only the plain expression below is compiled.
DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c)
{
  const u32 sum = a + b + c;

  return sum;
}
#else
// Vector three-operand addition (variant used when HAS_VADD3 is not set).
DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
{
  const u32x sum = a + b + c;

  return sum;
}

// Scalar three-operand addition (variant used when HAS_VADD3 is not set).
DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c)
{
  const u32 sum = a + b + c;

  return sum;
}
#endif

// Vector three-input XOR of a, b and c.
DECLSPEC u32x hc_lop_0x96 (const u32x a, const u32x b, const u32x c)
{
  const u32x mixed = a ^ b ^ c;

  return mixed;
}

// Scalar three-input XOR of a, b and c.
DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c)
{
  const u32 mixed = a ^ b ^ c;

  return mixed;
}

#endif

#ifdef IS_NV

// Vector byte permute built on prmt.b32: one instruction is issued per
// vector component, all sharing the same selector c.
DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const int c)
{
  // emits the permute for a single vector component
  #define HC_PRMT_LANE(lane) \
    asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.lane) : "r"(a.lane), "r"(b.lane), "r"(c))

  u32x r = 0;

  #if VECT_SIZE == 1
  asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
  #endif

  #if VECT_SIZE >= 2
  HC_PRMT_LANE (s0);
  HC_PRMT_LANE (s1);
  #endif

  #if VECT_SIZE >= 4
  HC_PRMT_LANE (s2);
  HC_PRMT_LANE (s3);
  #endif

  #if VECT_SIZE >= 8
  HC_PRMT_LANE (s4);
  HC_PRMT_LANE (s5);
  HC_PRMT_LANE (s6);
  HC_PRMT_LANE (s7);
  #endif

  #if VECT_SIZE >= 16
  HC_PRMT_LANE (s8);
  HC_PRMT_LANE (s9);
  HC_PRMT_LANE (sa);
  HC_PRMT_LANE (sb);
  HC_PRMT_LANE (sc);
  HC_PRMT_LANE (sd);
  HC_PRMT_LANE (se);
  HC_PRMT_LANE (sf);
  #endif

  #undef HC_PRMT_LANE

  return r;
}

// Scalar byte permute: a single prmt.b32 over a and b with selector c.
DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const int c)
{
  u32 permuted = 0;

  asm volatile ("prmt.b32 %0, %1, %2, %3;" : "=r"(permuted) : "r"(a), "r"(b), "r"(c));

  return permuted;
}

// Vector bit-field extract built on bfe.u32: one instruction is issued per
// vector component, using the matching components of a, b and c.
DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c)
{
  // emits the extract for a single vector component
  #define HC_BFE_LANE(lane) \
    asm volatile ("bfe.u32 %0, %1, %2, %3;" : "=r"(r.lane) : "r"(a.lane), "r"(b.lane), "r"(c.lane))

  u32x r = 0;

  #if VECT_SIZE == 1
  asm volatile ("bfe.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
  #endif

  #if VECT_SIZE >= 2
  HC_BFE_LANE (s0);
  HC_BFE_LANE (s1);
  #endif

  #if VECT_SIZE >= 4
  HC_BFE_LANE (s2);
  HC_BFE_LANE (s3);
  #endif

  #if VECT_SIZE >= 8
  HC_BFE_LANE (s4);
  HC_BFE_LANE (s5);
  HC_BFE_LANE (s6);
  HC_BFE_LANE (s7);
  #endif

  #if VECT_SIZE >= 16
  HC_BFE_LANE (s8);
  HC_BFE_LANE (s9);
  HC_BFE_LANE (sa);
  HC_BFE_LANE (sb);
  HC_BFE_LANE (sc);
  HC_BFE_LANE (sd);
  HC_BFE_LANE (se);
  HC_BFE_LANE (sf);
  #endif

  #undef HC_BFE_LANE

  return r;
}

// Scalar bit-field extract: a single bfe.u32 with operands a, b and c.
DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
{
  u32 field = 0;

  asm volatile ("bfe.u32 %0, %1, %2, %3;" : "=r"(field) : "r"(a), "r"(b), "r"(c));

  return field;
}

// Vector byte alignment expressed through hc_byte_perm: the selector is a
// 16-bit window cut out of the nibble pattern 0x76543210, positioned by the
// two lowest bits of c.
DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
{
  // 16, 12, 8 or 4 bits for (c & 3) == 0, 1, 2 or 3
  const int window_shift = (4 - (c & 3)) * 4;

  const int selector = (0x76543210 >> window_shift) & 0xffff;

  return hc_byte_perm (a, b, selector);
}

// Scalar byte alignment expressed through hc_byte_perm_S: the selector is a
// 16-bit window cut out of the nibble pattern 0x76543210, positioned by the
// two lowest bits of c.
DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
{
  // 16, 12, 8 or 4 bits for (c & 3) == 0, 1, 2 or 3
  const int window_shift = (4 - (c & 3)) * 4;

  const int selector = (0x76543210 >> window_shift) & 0xffff;

  return hc_byte_perm_S (a, b, selector);
}

// Vector three-operand addition.
DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
{
  const u32x sum = a + b + c;

  return sum;
}

// Scalar three-operand addition.
DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c)
{
  const u32 sum = a + b + c;

  return sum;
}

// Vector three-input logic op. With CUDA_ARCH >= 500 one lop3.b32 with
// immediate 0x96 is issued per vector component; otherwise the result is
// a ^ b ^ c (the asm path is presumably equivalent to that XOR).
DECLSPEC u32x hc_lop_0x96 (const u32x a, const u32x b, const u32x c)
{
  u32x r = 0;

  #if CUDA_ARCH >= 500

  // emits the logic op for a single vector component
  #define HC_LOP3_LANE(lane) \
    asm volatile ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(r.lane): "r"(a.lane), "r"(b.lane), "r"(c.lane))

  #if VECT_SIZE == 1
  asm volatile ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(r): "r"(a), "r"(b), "r"(c));
  #endif

  #if VECT_SIZE >= 2
  HC_LOP3_LANE (s0);
  HC_LOP3_LANE (s1);
  #endif

  #if VECT_SIZE >= 4
  HC_LOP3_LANE (s2);
  HC_LOP3_LANE (s3);
  #endif

  #if VECT_SIZE >= 8
  HC_LOP3_LANE (s4);
  HC_LOP3_LANE (s5);
  HC_LOP3_LANE (s6);
  HC_LOP3_LANE (s7);
  #endif

  #if VECT_SIZE >= 16
  HC_LOP3_LANE (s8);
  HC_LOP3_LANE (s9);
  HC_LOP3_LANE (sa);
  HC_LOP3_LANE (sb);
  HC_LOP3_LANE (sc);
  HC_LOP3_LANE (sd);
  HC_LOP3_LANE (se);
  HC_LOP3_LANE (sf);
  #endif

  #undef HC_LOP3_LANE

  #else

  r = a ^ b ^ c;

  #endif

  return r;
}

// Scalar three-input logic op. With CUDA_ARCH >= 500 a single lop3.b32 with
// immediate 0x96 is issued; otherwise the result is a ^ b ^ c (the asm path
// is presumably equivalent to that XOR).
DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c)
{
  u32 mixed = 0;

  #if CUDA_ARCH >= 500

  asm volatile ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(mixed): "r"(a), "r"(b), "r"(c));

  #else

  mixed = a ^ b ^ c;

  #endif

  return mixed;
}

#endif

#ifdef IS_GENERIC

// Vector bit-field extract: every component of a is shifted right by the
// matching component of b and then masked with ((1 << c) - 1).
DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c)
{
  const u32x field_mask = (make_u32x (1u) << c) - 1;

  return (a >> b) & field_mask;
}

// Scalar bit-field extract: a is shifted right by b and then masked with
// ((1 << c) - 1).
DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
{
  const u32 field_mask = (1u << c) - 1;

  return (a >> b) & field_mask;
}

// Vector variant: returns the low 32 bits of the 64-bit concatenation a:b
// (a is the upper word) shifted right by (c & 3) bytes.
DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
{
  u32x r = 0;

  // only the two lowest bits of c select the byte offset
  switch (c & 3)
  {
    case 0: r = b;                     break;
    case 1: r = (a << 24) | (b >>  8); break;
    case 2: r = (a << 16) | (b >> 16); break;
    case 3: r = (a <<  8) | (b >> 24); break;
  }

  return r;
}

// Scalar variant: returns the low 32 bits of the 64-bit concatenation a:b
// (a is the upper word) shifted right by (c & 3) bytes.
DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
{
  u32 r = 0;

  // only the two lowest bits of c select the byte offset
  switch (c & 3)
  {
    case 0: r = b;                     break;
    case 1: r = (a << 24) | (b >>  8); break;
    case 2: r = (a << 16) | (b >> 16); break;
    case 3: r = (a <<  8) | (b >> 24); break;
  }

  return r;
}

// Vector variant: returns the upper 32 bits of the 64-bit concatenation b:a
// (b is the upper word) shifted left by (c & 3) bytes.
DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
{
  u32x r = 0;

  // only the two lowest bits of c select the byte offset
  switch (c & 3)
  {
    case 0: r = b;                     break;
    case 1: r = (a >> 24) | (b <<  8); break;
    case 2: r = (a >> 16) | (b << 16); break;
    case 3: r = (a >>  8) | (b << 24); break;
  }

  return r;
}

// Scalar variant: returns the upper 32 bits of the 64-bit concatenation b:a
// (b is the upper word) shifted left by (c & 3) bytes.
DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
{
  u32 r = 0;

  // only the two lowest bits of c select the byte offset
  switch (c & 3)
  {
    case 0: r = b;                     break;
    case 1: r = (a >> 24) | (b <<  8); break;
    case 2: r = (a >> 16) | (b << 16); break;
    case 3: r = (a >>  8) | (b << 24); break;
  }

  return r;
}

// Vector three-operand addition.
DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
{
  const u32x sum = a + b + c;

  return sum;
}

// Scalar three-operand addition.
DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c)
{
  const u32 sum = a + b + c;

  return sum;
}

// Vector three-input XOR of a, b and c.
DECLSPEC u32x hc_lop_0x96 (const u32x a, const u32x b, const u32x c)
{
  const u32x mixed = a ^ b ^ c;

  return mixed;
}

// Scalar three-input XOR of a, b and c.
DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c)
{
  const u32 mixed = a ^ b ^ c;

  return mixed;
}

#endif

/**
 * pure scalar functions
 */

// Find first zero: returns the index (0..31, counted from the least
// significant bit) of the lowest clear bit in v, or -1 if all 32 bits are set.
DECLSPEC int ffz (const u32 v)
{
  #ifdef _unroll
  #pragma unroll
  #endif
  for (int bit = 0; bit < 32; bit++)
  {
    const u32 is_set = (v >> bit) & 1;

    if (is_set == 0) return bit;
  }

  return -1;
}

#ifdef KERNEL_STATIC
// Three-way comparison of a 4-word digest d1 against d2. Words are compared
// in the order d1[3]/d2[DGST_R3] down to d1[0]/d2[DGST_R0]; the first pair
// that differs decides. Returns 1 if d1 is greater, -1 if smaller, 0 if all
// four pairs are equal.
DECLSPEC int hash_comp (const u32 *d1, GLOBAL_AS const u32 *d2)
{
  const u32 l3 = d1[3];
  const u32 r3 = d2[DGST_R3];

  if (l3 != r3) return (l3 > r3) ? ( 1) : (-1);

  const u32 l2 = d1[2];
  const u32 r2 = d2[DGST_R2];

  if (l2 != r2) return (l2 > r2) ? ( 1) : (-1);

  const u32 l1 = d1[1];
  const u32 r1 = d2[DGST_R1];

  if (l1 != r1) return (l1 > r1) ? ( 1) : (-1);

  const u32 l0 = d1[0];
  const u32 r0 = d2[DGST_R0];

  if (l0 != r0) return (l0 > r0) ? ( 1) : (-1);

  return (0);
}

// Binary search for digest inside digests_buf (digests_cnt entries, ordered
// consistently with hash_comp). Returns the index of the matching entry or
// -1 if there is none.
DECLSPEC int find_hash (const u32 *digest, const u32 digests_cnt, GLOBAL_AS const digest_t *digests_buf)
{
  u32 base = 0;            // first index of the window still to be searched
  u32 span = digests_cnt;  // number of candidates left in that window

  while (span)
  {
    const u32 half = span >> 1;

    const u32 mid = base + half;

    const int cmp = hash_comp (digest, digests_buf[mid].digest_buf);

    if (cmp == 0) return (mid);

    if (cmp > 0)
    {
      // continue in the part above mid
      base = mid + 1;

      span--;
    }

    span >>= 1;
  }

  return (-1);
}
#endif

// Input has to be zero padded and buffer size has to be multiple of 4 and at least of length 24
// We simply ignore buffer length for the first 24 bytes for some extra speed boost :)
// Number of unrolls found by simply testing what gave best results

// Returns 1 if any byte of buf has its top bit (0x80) set, 0 otherwise.
// The first six words (24 bytes) are always inspected, independent of len;
// beyond that, words are inspected while their byte offset is below len.
DECLSPEC int hc_enc_scan (const u32 *buf, const int len)
{
  const u32 high_bits = 0x80808080;

  if ((buf[0] & high_bits) != 0) return 1;
  if ((buf[1] & high_bits) != 0) return 1;
  if ((buf[2] & high_bits) != 0) return 1;
  if ((buf[3] & high_bits) != 0) return 1;
  if ((buf[4] & high_bits) != 0) return 1;
  if ((buf[5] & high_bits) != 0) return 1;

  int word = 6;

  for (int off = 24; off < len; off += 4)
  {
    if ((buf[word] & high_bits) != 0) return 1;

    word++;
  }

  return 0;
}

// Same scan as hc_enc_scan, but for a buffer in the GLOBAL_AS address space:
// returns 1 if any byte of buf has its top bit (0x80) set, 0 otherwise.
// The first six words (24 bytes) are always inspected, independent of len;
// beyond that, words are inspected while their byte offset is below len.
DECLSPEC int hc_enc_scan_global (GLOBAL_AS const u32 *buf, const int len)
{
  const u32 high_bits = 0x80808080;

  if ((buf[0] & high_bits) != 0) return 1;
  if ((buf[1] & high_bits) != 0) return 1;
  if ((buf[2] & high_bits) != 0) return 1;
  if ((buf[3] & high_bits) != 0) return 1;
  if ((buf[4] & high_bits) != 0) return 1;
  if ((buf[5] & high_bits) != 0) return 1;

  int word = 6;

  for (int off = 24; off < len; off += 4)
  {
    if ((buf[word] & high_bits) != 0) return 1;

    word++;
  }

  return 0;
}

// Constants and some code snippets are taken from unicode.org's ConvertUTF.c.
// The compiler can translate some of these branches and switch cases directly into MOVC,
// which is faster than lookup tables.

// Surrogate-pair arithmetic used by the encoder below: a code point above
// UNI_MAX_BMP first has halfBase subtracted, then its upper part
// (>> halfShift) and lower part (& halfMask) are added to the start of the
// high and low surrogate ranges respectively.
#define halfShift 10

#define halfBase 0x0010000
#define halfMask 0x3FF

// UNI_MAX_BMP is the largest code point written as a single 16-bit unit;
// the remaining values are the bounds of the high and low surrogate ranges.
#define UNI_MAX_BMP          0xFFFF
#define UNI_SUR_HIGH_START   0xD800
#define UNI_SUR_HIGH_END     0xDBFF
#define UNI_SUR_LOW_START    0xDC00
#define UNI_SUR_LOW_END      0xDFFF

/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence.
 */

// The numeric suffix is the number of trailing bytes of the sequence.
#define offsetsFromUTF8_0 0x00000000UL
#define offsetsFromUTF8_1 0x00003080UL
#define offsetsFromUTF8_2 0x000E2080UL
#define offsetsFromUTF8_3 0x03C82080UL
#define offsetsFromUTF8_4 0xFA082080UL
#define offsetsFromUTF8_5 0x82082080UL

// Resets an encoder state: no carried-over output (cbuf/clen) and the
// position counter back at zero.
DECLSPEC void hc_enc_init (hc_enc_t *hc_enc)
{
  // nothing pending from a previous call
  hc_enc->clen = 0;
  hc_enc->cbuf = 0;

  // position restarts at the beginning
  hc_enc->pos = 0;
}

// Returns 1 while there is still work for the encoder: either the position
// is below sz, or carried-over output (clen != 0) is waiting. Returns 0
// otherwise.
DECLSPEC int hc_enc_has_next (hc_enc_t *hc_enc, const int sz)
{
  const int input_left   = (hc_enc->pos < sz);
  const int carry_queued = (hc_enc->clen != 0);

  return (input_left || carry_queued) ? 1 : 0;
}

// Input buffer and Output buffer size has to be multiple of 4 and at least of size 4.
// The output buffer is not zero padded, so entire buffer has to be set all zero before entering this function or truncated afterwards.

// Converts the next chunk of UTF-8 input into 16-bit units written low byte
// first (UTF-16LE layout) and returns the number of bytes placed in dst_buf.
//
//   hc_enc   conversion state: pos is the read offset into the source,
//            cbuf/clen hold a low surrogate that did not fit into the
//            previous output chunk
//   src_buf  source bytes; src_len is the number of bytes to convert,
//            src_sz the size of the buffer (used for the bounds check on
//            multi-byte sequences)
//   dst_buf  output buffer of dst_sz bytes
//
// Continuation bytes are not validated; the sequence length is derived from
// the lead byte only.
DECLSPEC int hc_enc_next (hc_enc_t *hc_enc, const u32 *src_buf, const int src_len, const int src_sz, u32 *dst_buf, const int dst_sz)
{
  const u8 *src_ptr = (const u8 *) src_buf;
        u8 *dst_ptr = (      u8 *) dst_buf;

  int src_pos = hc_enc->pos;

  // output starts after whatever was carried over from the previous call
  int dst_pos = hc_enc->clen;

  // emit the carried-over unit (zero if there is none)
  dst_buf[0] = hc_enc->cbuf;

  hc_enc->clen = 0;
  hc_enc->cbuf = 0;

  while ((src_pos < src_len) && (dst_pos < dst_sz))
  {
    const u8 c = src_ptr[src_pos];

    // number of bytes following the lead byte, derived from its value
    int extraBytesToRead = 0;

    if (c >= 0xfc)
    {
      extraBytesToRead = 5;
    }
    else if (c >= 0xf8)
    {
      extraBytesToRead = 4;
    }
    else if (c >= 0xf0)
    {
      extraBytesToRead = 3;
    }
    else if (c >= 0xe0)
    {
      extraBytesToRead = 2;
    }
    else if (c >= 0xc0)
    {
      extraBytesToRead = 1;
    }

    // the sequence would run past the end of the source buffer
    if ((src_pos + extraBytesToRead) >= src_sz)
    {
      // broken input

      // mark the whole input as consumed so the caller stops iterating
      hc_enc->pos = src_len;

      return dst_pos;
    }

    u32 ch = 0;

    // accumulate the raw bytes six bits apart, then subtract the matching
    // offsetsFromUTF8_N constant
    switch (extraBytesToRead)
    {
      case 5:
        ch += src_ptr[src_pos++]; ch <<= 6; /* remember, illegal UTF-8 */
        ch += src_ptr[src_pos++]; ch <<= 6; /* remember, illegal UTF-8 */
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_5;
        break;
      case 4:
        ch += src_ptr[src_pos++]; ch <<= 6; /* remember, illegal UTF-8 */
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_4;
        break;
      case 3:
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_3;
        break;
      case 2:
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_2;
        break;
      case 1:
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_1;
        break;
      case 0:
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_0;
        break;
    }

    /* Target is a character <= 0xFFFF */
    if (ch <= UNI_MAX_BMP)
    {
      dst_ptr[dst_pos++] = (ch >> 0) & 0xff;
      dst_ptr[dst_pos++] = (ch >> 8) & 0xff;
    }
    else
    {
      // split into a high (a) and low (b) surrogate
      ch -= halfBase;

      const u32 a = ((ch >> halfShift) + UNI_SUR_HIGH_START);
      const u32 b = ((ch  & halfMask)  + UNI_SUR_LOW_START);

      if ((dst_pos + 2) == dst_sz)
      {
        // only the high surrogate fits; the low one is stored in the state
        // and emitted at the start of the next call
        dst_ptr[dst_pos++] = (a >> 0) & 0xff;
        dst_ptr[dst_pos++] = (a >> 8) & 0xff;

        hc_enc->cbuf = b & 0xffff;
        hc_enc->clen = 2;
      }
      else
      {
        dst_ptr[dst_pos++] = (a >> 0) & 0xff;
        dst_ptr[dst_pos++] = (a >> 8) & 0xff;
        dst_ptr[dst_pos++] = (b >> 0) & 0xff;
        dst_ptr[dst_pos++] = (b >> 8) & 0xff;
      }
    }
  }

  // remember where to resume on the next call
  hc_enc->pos = src_pos;

  return dst_pos;
}

// Variant of the UTF-8 to 16-bit conversion step that reads its source from
// the GLOBAL_AS address space. Converts the next chunk of UTF-8 input into
// 16-bit units written low byte first (UTF-16LE layout) and returns the
// number of bytes placed in dst_buf.
//
//   hc_enc   conversion state: pos is the read offset into the source,
//            cbuf/clen hold a low surrogate that did not fit into the
//            previous output chunk
//   src_buf  source bytes; src_len is the number of bytes to convert,
//            src_sz the size of the buffer (used for the bounds check on
//            multi-byte sequences)
//   dst_buf  output buffer of dst_sz bytes
//
// Continuation bytes are not validated; the sequence length is derived from
// the lead byte only.
DECLSPEC int hc_enc_next_global (hc_enc_t *hc_enc, GLOBAL_AS const u32 *src_buf, const int src_len, const int src_sz, u32 *dst_buf, const int dst_sz)
{
  GLOBAL_AS const u8 *src_ptr = (GLOBAL_AS const u8 *) src_buf;
                  u8 *dst_ptr = (                u8 *) dst_buf;

  int src_pos = hc_enc->pos;

  // output starts after whatever was carried over from the previous call
  int dst_pos = hc_enc->clen;

  // emit the carried-over unit (zero if there is none)
  dst_buf[0] = hc_enc->cbuf;

  hc_enc->clen = 0;
  hc_enc->cbuf = 0;

  while ((src_pos < src_len) && (dst_pos < dst_sz))
  {
    const u8 c = src_ptr[src_pos];

    // number of bytes following the lead byte, derived from its value
    int extraBytesToRead = 0;

    if (c >= 0xfc)
    {
      extraBytesToRead = 5;
    }
    else if (c >= 0xf8)
    {
      extraBytesToRead = 4;
    }
    else if (c >= 0xf0)
    {
      extraBytesToRead = 3;
    }
    else if (c >= 0xe0)
    {
      extraBytesToRead = 2;
    }
    else if (c >= 0xc0)
    {
      extraBytesToRead = 1;
    }

    // the sequence would run past the end of the source buffer
    if ((src_pos + extraBytesToRead) >= src_sz)
    {
      // broken input

      // mark the whole input as consumed so the caller stops iterating
      hc_enc->pos = src_len;

      return dst_pos;
    }

    u32 ch = 0;

    // accumulate the raw bytes six bits apart, then subtract the matching
    // offsetsFromUTF8_N constant
    switch (extraBytesToRead)
    {
      case 5:
        ch += src_ptr[src_pos++]; ch <<= 6; /* remember, illegal UTF-8 */
        ch += src_ptr[src_pos++]; ch <<= 6; /* remember, illegal UTF-8 */
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_5;
        break;
      case 4:
        ch += src_ptr[src_pos++]; ch <<= 6; /* remember, illegal UTF-8 */
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_4;
        break;
      case 3:
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_3;
        break;
      case 2:
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_2;
        break;
      case 1:
        ch += src_ptr[src_pos++]; ch <<= 6;
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_1;
        break;
      case 0:
        ch += src_ptr[src_pos++];
        ch -= offsetsFromUTF8_0;
        break;
    }

    /* Target is a character <= 0xFFFF */
    if (ch <= UNI_MAX_BMP)
    {
      dst_ptr[dst_pos++] = (ch >> 0) & 0xff;
      dst_ptr[dst_pos++] = (ch >> 8) & 0xff;
    }
    else
    {
      // split into a high (a) and low (b) surrogate
      ch -= halfBase;

      const u32 a = ((ch >> halfShift) + UNI_SUR_HIGH_START);
      const u32 b = ((ch  & halfMask)  + UNI_SUR_LOW_START);

      if ((dst_pos + 2) == dst_sz)
      {
        // only the high surrogate fits; the low one is stored in the state
        // and emitted at the start of the next call
        dst_ptr[dst_pos++] = (a >> 0) & 0xff;
        dst_ptr[dst_pos++] = (a >> 8) & 0xff;

        hc_enc->cbuf = b & 0xffff;
        hc_enc->clen = 2;
      }
      else
      {
        dst_ptr[dst_pos++] = (a >> 0) & 0xff;
        dst_ptr[dst_pos++] = (a >> 8) & 0xff;
        dst_ptr[dst_pos++] = (b >> 0) & 0xff;
        dst_ptr[dst_pos++] = (b >> 8) & 0xff;
      }
    }
  }

  // remember where to resume on the next call
  hc_enc->pos = src_pos;

  return dst_pos;
}

#undef halfShift

#undef halfBase
#undef halfMask

#undef UNI_MAX_BMP
#undef UNI_SUR_HIGH_START
#undef UNI_SUR_HIGH_END
#undef UNI_SUR_LOW_START
#undef UNI_SUR_LOW_END

#undef offsetsFromUTF8_0
#undef offsetsFromUTF8_1
#undef offsetsFromUTF8_2
#undef offsetsFromUTF8_3
#undef offsetsFromUTF8_4
#undef offsetsFromUTF8_5

/**
 * Validate PKCS-style padding for an 8-byte block size and return the length
 * of the data with the padding removed.
 *
 * The pad value is taken from the top byte (bits 24..31) of the last u32 word,
 * and all pad bytes are compared against it using the two trailing words.
 *
 * data_buf: packed data words, at least data_len bytes
 * data_len: length in bytes, must be a positive multiple of 8
 *
 * returns: data_len minus the pad length, or -1 if the length or padding is invalid
 */
DECLSPEC int pkcs_padding_bs8 (const u32 *data_buf, const int data_len)
{
  // zero or negative lengths are rejected: a negative multiple of 8 would
  // otherwise pass the modulo check and lead to out of boundary reads

  if (data_len <= 0) return -1;

  if (data_len % 8) return -1; // has to be a multiple of block size

  const int last_pad_pos = data_len - 1;

  const int last_pad_elem = last_pad_pos / 4;

  const u32 pad = data_buf[last_pad_elem] >> 24; // guaranteed by pkcs structure

  if ((pad < 1) || (pad > 8)) return -1; // pkcs pads are not zero based

  // the pad byte replicated into all four byte lanes

  const u32 padm = (pad <<  0)
                 | (pad <<  8)
                 | (pad << 16)
                 | (pad << 24);

  // mask1 covers the last word, mask0 the word before it;
  // set bits select the byte lanes that must hold the pad value

  u32 mask0 = 0;
  u32 mask1 = 0;

  switch (pad)
  {
    case  1:  mask0 = 0x00000000; mask1 = 0xff000000; break;
    case  2:  mask0 = 0x00000000; mask1 = 0xffff0000; break;
    case  3:  mask0 = 0x00000000; mask1 = 0xffffff00; break;
    case  4:  mask0 = 0x00000000; mask1 = 0xffffffff; break;
    case  5:  mask0 = 0xff000000; mask1 = 0xffffffff; break;
    case  6:  mask0 = 0xffff0000; mask1 = 0xffffffff; break;
    case  7:  mask0 = 0xffffff00; mask1 = 0xffffffff; break;
    case  8:  mask0 = 0xffffffff; mask1 = 0xffffffff; break;
  }

  // data_len >= 8 here, so last_pad_elem >= 1 and both reads are in range

  const u32 data0 = data_buf[last_pad_elem - 1];
  const u32 data1 = data_buf[last_pad_elem - 0];

  if ((data0 & mask0) != (padm & mask0)) return -1;
  if ((data1 & mask1) != (padm & mask1)) return -1;

  const int real_len = data_len - pad;

  return real_len;
}

/**
 * Validate PKCS-style padding for a 16-byte block size and return the length
 * of the data with the padding removed.
 *
 * The pad value is taken from the top byte (bits 24..31) of the last u32 word,
 * and all pad bytes are compared against it using the four trailing words.
 *
 * data_buf: packed data words, at least data_len bytes
 * data_len: length in bytes, must be a positive multiple of 16
 *
 * returns: data_len minus the pad length, or -1 if the length or padding is invalid
 */
DECLSPEC int pkcs_padding_bs16 (const u32 *data_buf, const int data_len)
{
  // zero or negative lengths are rejected: a negative multiple of 16 would
  // otherwise pass the modulo check and lead to out of boundary reads

  if (data_len <= 0) return -1;

  if (data_len % 16) return -1; // has to be a multiple of block size

  const int last_pad_pos = data_len - 1;

  const int last_pad_elem = last_pad_pos / 4;

  const u32 pad = data_buf[last_pad_elem] >> 24; // guaranteed by pkcs structure

  if ((pad < 1) || (pad > 16)) return -1; // pkcs pads are not zero based

  // the pad byte replicated into all four byte lanes

  const u32 padm = (pad <<  0)
                 | (pad <<  8)
                 | (pad << 16)
                 | (pad << 24);

  // mask3 covers the last word, mask0 the fourth-last word;
  // set bits select the byte lanes that must hold the pad value

  u32 mask0 = 0;
  u32 mask1 = 0;
  u32 mask2 = 0;
  u32 mask3 = 0;

  switch (pad)
  {
    case  1:  mask0 = 0x00000000; mask1 = 0x00000000; mask2 = 0x00000000; mask3 = 0xff000000; break;
    case  2:  mask0 = 0x00000000; mask1 = 0x00000000; mask2 = 0x00000000; mask3 = 0xffff0000; break;
    case  3:  mask0 = 0x00000000; mask1 = 0x00000000; mask2 = 0x00000000; mask3 = 0xffffff00; break;
    case  4:  mask0 = 0x00000000; mask1 = 0x00000000; mask2 = 0x00000000; mask3 = 0xffffffff; break;
    case  5:  mask0 = 0x00000000; mask1 = 0x00000000; mask2 = 0xff000000; mask3 = 0xffffffff; break;
    case  6:  mask0 = 0x00000000; mask1 = 0x00000000; mask2 = 0xffff0000; mask3 = 0xffffffff; break;
    case  7:  mask0 = 0x00000000; mask1 = 0x00000000; mask2 = 0xffffff00; mask3 = 0xffffffff; break;
    case  8:  mask0 = 0x00000000; mask1 = 0x00000000; mask2 = 0xffffffff; mask3 = 0xffffffff; break;
    case  9:  mask0 = 0x00000000; mask1 = 0xff000000; mask2 = 0xffffffff; mask3 = 0xffffffff; break;
    case 10:  mask0 = 0x00000000; mask1 = 0xffff0000; mask2 = 0xffffffff; mask3 = 0xffffffff; break;
    case 11:  mask0 = 0x00000000; mask1 = 0xffffff00; mask2 = 0xffffffff; mask3 = 0xffffffff; break;
    case 12:  mask0 = 0x00000000; mask1 = 0xffffffff; mask2 = 0xffffffff; mask3 = 0xffffffff; break;
    case 13:  mask0 = 0xff000000; mask1 = 0xffffffff; mask2 = 0xffffffff; mask3 = 0xffffffff; break;
    case 14:  mask0 = 0xffff0000; mask1 = 0xffffffff; mask2 = 0xffffffff; mask3 = 0xffffffff; break;
    case 15:  mask0 = 0xffffff00; mask1 = 0xffffffff; mask2 = 0xffffffff; mask3 = 0xffffffff; break;
    case 16:  mask0 = 0xffffffff; mask1 = 0xffffffff; mask2 = 0xffffffff; mask3 = 0xffffffff; break;
  }

  // data_len >= 16 here, so last_pad_elem >= 3 and all four reads are in range

  const u32 data0 = data_buf[last_pad_elem - 3];
  const u32 data1 = data_buf[last_pad_elem - 2];
  const u32 data2 = data_buf[last_pad_elem - 1];
  const u32 data3 = data_buf[last_pad_elem - 0];

  if ((data0 & mask0) != (padm & mask0)) return -1;
  if ((data1 & mask1) != (padm & mask1)) return -1;
  if ((data2 & mask2) != (padm & mask2)) return -1;
  if ((data3 & mask3) != (padm & mask3)) return -1;

  const int real_len = data_len - pad;

  return real_len;
}

/**
 * Heuristic check whether a buffer of len bytes starts with a header whose
 * embedded length field accounts for exactly len bytes.
 *
 * Three header layouts are distinguished by len:
 *   len < 128   : byte0 == 0x30, byte1 = length (top bit clear), byte2 == 0x02
 *   len < 256   : byte0 == 0x30, byte1 == 0x81, byte2 = length, byte3 == 0x02
 *   len < 65536 : byte0 == 0x30, byte1 == 0x82, byte2..3 = length (high byte first), byte4 == 0x02
 * (0x30 / 0x02 are presumably the DER SEQUENCE and INTEGER tags.)
 *
 * returns: 1 if header and length agree, 0 otherwise
 *
 * NOTE(review): for len >= 65536 nothing is inspected and 1 is returned.
 */
DECLSPEC int asn1_detect (const u32 *buf, const int len)
{
  if (len < 128)
  {
    // one length byte, header is 2 bytes

    if ((buf[0] & 0x00ff80ff) != 0x00020030) return 0;

    const int body_len = ((buf[0] & 0x00007f00) >>  8);

    return ((body_len + 2) == len) ? 1 : 0;
  }

  if (len < 256)
  {
    // 0x81 marker plus one length byte, header is 3 bytes

    if ((buf[0] & 0xff00ffff) != 0x02008130) return 0;

    const int body_len = ((buf[0] & 0x00ff0000) >> 16);

    return ((body_len + 3) == len) ? 1 : 0;
  }

  if (len < 65536)
  {
    // 0x82 marker plus two length bytes, header is 4 bytes

    if ((buf[0] & 0x0000ffff) != 0x00008230) return 0;
    if ((buf[1] & 0x000000ff) != 0x00000002) return 0;

    const int body_len = ((buf[0] & 0xff000000) >> 24)
                       | ((buf[0] & 0x00ff0000) >>  8);

    return ((body_len + 4) == len) ? 1 : 0;
  }

  return 1;
}

/**
 * Test a single bit in a bitmap.
 *
 * The word index is (digest >> bitmap_shift) & bitmap_mask and the bit index
 * within that word is the low 5 bits of digest.
 *
 * returns: non-zero if the bit is set, 0 otherwise
 */
DECLSPEC u32 check_bitmap (GLOBAL_AS const u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest)
{
  // unsigned literal: shifting a signed 1 by 31 is not defined in ISO C

  return (bitmap[(digest >> bitmap_shift) & bitmap_mask] & (1u << (digest & 0x1f)));
}

/**
 * Run the four digest words through both bitmap sets.
 *
 * digest[0..3] are tested against the s1 bitmaps (a..d) using bitmap_shift1,
 * then against the s2 bitmaps (a..d) using bitmap_shift2. Evaluation stops at
 * the first miss.
 *
 * returns: 1 if all eight lookups hit, 0 otherwise
 */
DECLSPEC u32 check (const u32 *digest, GLOBAL_AS const u32 *bitmap_s1_a, GLOBAL_AS const u32 *bitmap_s1_b, GLOBAL_AS const u32 *bitmap_s1_c, GLOBAL_AS const u32 *bitmap_s1_d, GLOBAL_AS const u32 *bitmap_s2_a, GLOBAL_AS const u32 *bitmap_s2_b, GLOBAL_AS const u32 *bitmap_s2_c, GLOBAL_AS const u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2)
{
  if ((check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) != 0)
   && (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) != 0)
   && (check_bitmap (bitmap_s1_c, bitmap_mask, bitmap_shift1, digest[2]) != 0)
   && (check_bitmap (bitmap_s1_d, bitmap_mask, bitmap_shift1, digest[3]) != 0)
   && (check_bitmap (bitmap_s2_a, bitmap_mask, bitmap_shift2, digest[0]) != 0)
   && (check_bitmap (bitmap_s2_b, bitmap_mask, bitmap_shift2, digest[1]) != 0)
   && (check_bitmap (bitmap_s2_c, bitmap_mask, bitmap_shift2, digest[2]) != 0)
   && (check_bitmap (bitmap_s2_d, bitmap_mask, bitmap_shift2, digest[3]) != 0))
  {
    return (1);
  }

  return (0);
}

/**
 * Record a result entry in plains_buf.
 *
 * A slot index is obtained with hc_atomic_inc() on d_result. Unless
 * ATTACK_MODE is 9, an index at or beyond digests_cnt is rejected: the counter
 * is decremented again and nothing is stored.
 */
DECLSPEC void mark_hash (GLOBAL_AS plain_t *plains_buf, GLOBAL_AS u32 *d_result, const u32 salt_pos, const u32 digests_cnt, const u32 digest_pos, const u32 hash_pos, const u64 gid, const u32 il_pos, const u32 extra1, const u32 extra2)
{
  const u32 slot = hc_atomic_inc (d_result);

  #if ATTACK_MODE != 9
  // the increment is required to learn the current value across threads,
  // but it may step past the end of the buffer; undo it in that case

  if (slot >= digests_cnt)
  {
    hc_atomic_dec (d_result);

    return;
  }
  #endif

  GLOBAL_AS plain_t *entry = &plains_buf[slot];

  entry->salt_pos   = salt_pos;
  entry->digest_pos = digest_pos;  // relative
  entry->hash_pos   = hash_pos;    // absolute
  entry->gidvid     = gid;
  entry->il_pos     = il_pos;
  entry->extra1     = extra1;      // unused so far
  entry->extra2     = extra2;      // unused so far
}

/**
 * Count how many bytes in buf[0..elems-1] equal c.
 *
 * Every u32 word contributes four bytes. A value of c above 0xff never matches.
 */
DECLSPEC int hc_count_char (const u32 *buf, const int elems, const u32 c)
{
  int hits = 0;

  for (int word = 0; word < elems; word++)
  {
    u32 rest = buf[word];

    // walk the four byte lanes from least to most significant

    for (int lane = 0; lane < 4; lane++)
    {
      if ((rest & 0xff) == c) hits++;

      rest >>= 8;
    }
  }

  return hits;
}

/**
 * Compute the byte-wise entropy (base 2) of buf[0..elems-1].
 *
 * For each of the 256 byte values the relative frequency w over elems * 4
 * bytes is determined via hc_count_char(), and -w * log2 (w) is accumulated.
 * Byte values that do not occur are skipped.
 */
DECLSPEC float hc_get_entropy (const u32 *buf, const int elems)
{
  const int total_bytes = elems * 4;

  float sum = 0.0f;

  #ifdef _unroll
  #pragma unroll
  #endif
  for (u32 sym = 0; sym < 256; sym++)
  {
    const int cnt = hc_count_char (buf, elems, sym);

    if (cnt != 0)
    {
      const float w = (float) cnt / total_bytes;

      sum += -w * log2 (w);
    }
  }

  return sum;
}

/**
 * Check whether v is a lowercase hexadecimal digit ('0'..'9' or 'a'..'f').
 * Uppercase 'A'..'F' is not accepted.
 *
 * returns: 1 if valid, 0 otherwise
 */
DECLSPEC int is_valid_hex_8 (const u8 v)
{
  // range compares instead of a lookup table (original author's note:
  // direct lookup table is slower thanks to CMOV)

  const int is_digit = (v >= (u8) '0') && (v <= (u8) '9');
  const int is_alpha = (v >= (u8) 'a') && (v <= (u8) 'f');

  return (is_digit || is_alpha) ? 1 : 0;
}

/**
 * Check whether all four bytes of v pass is_valid_hex_8().
 * Bytes are tested from least to most significant, stopping at the first miss.
 *
 * returns: 1 if all four bytes are valid, 0 otherwise
 */
DECLSPEC int is_valid_hex_32 (const u32 v)
{
  const int all_ok = (is_valid_hex_8 ((u8) (v >>  0)) != 0)
                  && (is_valid_hex_8 ((u8) (v >>  8)) != 0)
                  && (is_valid_hex_8 ((u8) (v >> 16)) != 0)
                  && (is_valid_hex_8 ((u8) (v >> 24)) != 0);

  return all_ok ? 1 : 0;
}

DECLSPEC int is_valid_base58_8 (const u8 v)
{
  if (v > (u8) 'z') return 0;
  if (v < (u8) '1') return 0;
  if ((v > (u8) '9') && (v < (u8) 'A')) return 0;
  if ((v > (u8) 'Z') && (v < (u8) 'a')) return 0;

  return 1;
}

/**
 * Check whether all four bytes of v pass is_valid_base58_8().
 * Bytes are tested from least to most significant, stopping at the first miss.
 *
 * returns: 1 if all four bytes are valid, 0 otherwise
 */
DECLSPEC int is_valid_base58_32 (const u32 v)
{
  const int all_ok = (is_valid_base58_8 ((u8) (v >>  0)) != 0)
                  && (is_valid_base58_8 ((u8) (v >>  8)) != 0)
                  && (is_valid_base58_8 ((u8) (v >> 16)) != 0)
                  && (is_valid_base58_8 ((u8) (v >> 24)) != 0);

  return all_ok ? 1 : 0;
}

/**
 * Find the first mapping entry whose source sequence has exactly search_len
 * bytes and whose low search_len bytes equal those of search.
 *
 * returns: index of the matching entry, or -1 if there is none
 */
DECLSPEC int hc_find_keyboard_layout_map (const u32 search, const int search_len, LOCAL_AS keyboard_layout_mapping_t *s_keyboard_layout_mapping_buf, const int keyboard_layout_mapping_cnt)
{
  for (int pos = 0; pos < keyboard_layout_mapping_cnt; pos++)
  {
    const int entry_len = s_keyboard_layout_mapping_buf[pos].src_len;

    if (entry_len != search_len) continue;

    const u32 entry_char = s_keyboard_layout_mapping_buf[pos].src_char;

    // keep only the low search_len bytes for the comparison

    const u32 keep = 0xffffffff >> ((4 - search_len) * 8);

    if (((entry_char ^ search) & keep) == 0) return pos;
  }

  return -1;
}

/**
 * Apply the keyboard layout mapping to the password stored in w, in place.
 *
 * At each position up to four input bytes are gathered and the longest
 * matching source sequence (4, 3, 2, then 1 bytes) is looked up with
 * hc_find_keyboard_layout_map(). On a match the entry's destination bytes are
 * emitted and the matched input bytes are consumed; without a match one input
 * byte is copied unchanged.
 *
 * w:      password buffer, 32 words; overwritten with the mapped result
 * pw_len: input length in bytes
 *
 * returns: length of the mapped password in bytes
 *
 * NOTE(review): the local output buffer holds 128 bytes and writes to it are
 * not bounds-checked; confirm that callers/mappings cannot expand beyond that.
 */
DECLSPEC int hc_execute_keyboard_layout_mapping (u32 *w, const int pw_len, LOCAL_AS keyboard_layout_mapping_t *s_keyboard_layout_mapping_buf, const int keyboard_layout_mapping_cnt)
{
  u32 mapped_buf[32] = { 0 };

  u8 *mapped_ptr = (u8 *) mapped_buf;

  int mapped_len = 0;

  // TC/VC passwords are limited to 128

  u8 *pw_ptr = (u8 *) w;

  int cur = 0;

  while (cur < pw_len)
  {
    #define MIN(a,b) (((a) < (b)) ? (a) : (b))

    const int avail = MIN (pw_len - cur, 4);

    #undef MIN

    // gather up to four upcoming bytes, first byte in the lowest lane

    u32 window = 0;

    for (int k = 0; k < avail; k++)
    {
      window |= ((u32) pw_ptr[cur + k]) << (k * 8);
    }

    // try the longest candidate first

    int hit     = -1;
    int try_len = avail;

    while (try_len > 0)
    {
      hit = hc_find_keyboard_layout_map (window, try_len, s_keyboard_layout_mapping_buf, keyboard_layout_mapping_cnt);

      if (hit != -1) break;

      try_len--;
    }

    if (hit == -1)
    {
      // not matched, keep original

      mapped_ptr[mapped_len] = pw_ptr[cur];

      mapped_len++;

      cur++;

      continue;
    }

    const u32 dst_char = s_keyboard_layout_mapping_buf[hit].dst_char;
    const int dst_len  = s_keyboard_layout_mapping_buf[hit].dst_len;

    // only destination lengths 1..4 emit bytes, anything else emits nothing

    if ((dst_len >= 1) && (dst_len <= 4))
    {
      for (int k = 0; k < dst_len; k++)
      {
        mapped_ptr[mapped_len++] = (dst_char >> (k * 8)) & 0xff;
      }
    }

    cur += try_len;
  }

  // copy the full 32-word result back over the input

  for (int i = 0; i < 32; i++)
  {
    w[i] = mapped_buf[i];
  }

  return mapped_len;
}

/**
 * vector functions
 */

/**
 * Widen the 16 bytes packed in in[0..3] to 16-bit units, written to
 * out1[0..3] (from in[0], in[1]) and out2[0..3] (from in[2], in[3]).
 *
 * In the generic path each input word yields two output words: the lower two
 * input bytes go to the even output word, the upper two to the odd one. Within
 * an output word each source byte sits in the upper byte of its 16-bit half
 * (bits 8..15 and 24..31) and the lower byte is zero.
 *
 * The NV and AMD/HIP paths use hc_byte_perm() with selector constants;
 * presumably they produce the same result as the generic path.
 *
 * NOTE(review): outputs are written from the highest index down, after the
 * input word they would overwrite has been read - this looks chosen so that
 * out1 may alias in; confirm against callers before reordering.
 */
DECLSPEC void make_utf16be (const u32x *in, u32x *out1, u32x *out2)
{
  #if defined IS_NV

  out2[3] = hc_byte_perm (in[3], 0, 0x3727);
  out2[2] = hc_byte_perm (in[3], 0, 0x1707);
  out2[1] = hc_byte_perm (in[2], 0, 0x3727);
  out2[0] = hc_byte_perm (in[2], 0, 0x1707);
  out1[3] = hc_byte_perm (in[1], 0, 0x3727);
  out1[2] = hc_byte_perm (in[1], 0, 0x1707);
  out1[1] = hc_byte_perm (in[0], 0, 0x3727);
  out1[0] = hc_byte_perm (in[0], 0, 0x1707);

  #elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM

  out2[3] = hc_byte_perm (in[3], 0, 0x03070207);
  out2[2] = hc_byte_perm (in[3], 0, 0x01070007);
  out2[1] = hc_byte_perm (in[2], 0, 0x03070207);
  out2[0] = hc_byte_perm (in[2], 0, 0x01070007);
  out1[3] = hc_byte_perm (in[1], 0, 0x03070207);
  out1[2] = hc_byte_perm (in[1], 0, 0x01070007);
  out1[1] = hc_byte_perm (in[0], 0, 0x03070207);
  out1[0] = hc_byte_perm (in[0], 0, 0x01070007);

  #else

  // odd outputs: input bytes 3 and 2; even outputs: input bytes 1 and 0

  out2[3] = ((in[3] >>  0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00);
  out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00);
  out2[1] = ((in[2] >>  0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00);
  out2[0] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00);
  out1[3] = ((in[1] >>  0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00);
  out1[2] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00);
  out1[1] = ((in[0] >>  0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00);
  out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00);

  #endif
}

/**
 * Widen the 16 bytes packed in in[0..3] to 16-bit units, written to
 * out1[0..3] (from in[0], in[1]) and out2[0..3] (from in[2], in[3]).
 *
 * In the generic path each input word yields two output words: the upper two
 * input bytes go to the even output word, the lower two to the odd one. Within
 * an output word each source byte sits in the upper byte of its 16-bit half
 * (bits 8..15 and 24..31) and the lower byte is zero.
 *
 * The NV and AMD/HIP paths use hc_byte_perm() with selector constants;
 * presumably they produce the same result as the generic path.
 *
 * NOTE(review): outputs are written from the highest index down, after the
 * input word they would overwrite has been read - this looks chosen so that
 * out1 may alias in; confirm against callers before reordering.
 */
DECLSPEC void make_utf16beN (const u32x *in, u32x *out1, u32x *out2)
{
  #if defined IS_NV

  out2[3] = hc_byte_perm (in[3], 0, 0x1707);
  out2[2] = hc_byte_perm (in[3], 0, 0x3727);
  out2[1] = hc_byte_perm (in[2], 0, 0x1707);
  out2[0] = hc_byte_perm (in[2], 0, 0x3727);
  out1[3] = hc_byte_perm (in[1], 0, 0x1707);
  out1[2] = hc_byte_perm (in[1], 0, 0x3727);
  out1[1] = hc_byte_perm (in[0], 0, 0x1707);
  out1[0] = hc_byte_perm (in[0], 0, 0x3727);

  #elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM

  out2[3] = hc_byte_perm (in[3], 0, 0x01070007);
  out2[2] = hc_byte_perm (in[3], 0, 0x03070207);
  out2[1] = hc_byte_perm (in[2], 0, 0x01070007);
  out2[0] = hc_byte_perm (in[2], 0, 0x03070207);
  out1[3] = hc_byte_perm (in[1], 0, 0x01070007);
  out1[2] = hc_byte_perm (in[1], 0, 0x03070207);
  out1[1] = hc_byte_perm (in[0], 0, 0x01070007);
  out1[0] = hc_byte_perm (in[0], 0, 0x03070207);

  #else

  // odd outputs: input bytes 1 and 0; even outputs: input bytes 3 and 2

  out2[3] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00);
  out2[2] = ((in[3] >>  0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00);
  out2[1] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00);
  out2[0] = ((in[2] >>  0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00);
  out1[3] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00);
  out1[2] = ((in[1] >>  0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00);
  out1[1] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00);
  out1[0] = ((in[0] >>  0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00);

  #endif
}

/**
 * Widen the 16 bytes packed in in[0..3] to 16-bit units, written to
 * out1[0..3] (from in[0], in[1]) and out2[0..3] (from in[2], in[3]).
 *
 * In the generic path each input word yields two output words: the lower two
 * input bytes go to the even output word, the upper two to the odd one. Within
 * an output word each source byte sits in the lower byte of its 16-bit half
 * (bits 0..7 and 16..23) and the upper byte is zero.
 *
 * The NV and AMD/HIP paths use hc_byte_perm() with selector constants;
 * presumably they produce the same result as the generic path.
 *
 * NOTE(review): outputs are written from the highest index down, after the
 * input word they would overwrite has been read - this looks chosen so that
 * out1 may alias in; confirm against callers before reordering.
 */
DECLSPEC void make_utf16le (const u32x *in, u32x *out1, u32x *out2)
{
  #if defined IS_NV

  out2[3] = hc_byte_perm (in[3], 0, 0x7372);
  out2[2] = hc_byte_perm (in[3], 0, 0x7170);
  out2[1] = hc_byte_perm (in[2], 0, 0x7372);
  out2[0] = hc_byte_perm (in[2], 0, 0x7170);
  out1[3] = hc_byte_perm (in[1], 0, 0x7372);
  out1[2] = hc_byte_perm (in[1], 0, 0x7170);
  out1[1] = hc_byte_perm (in[0], 0, 0x7372);
  out1[0] = hc_byte_perm (in[0], 0, 0x7170);

  #elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM

  out2[3] = hc_byte_perm (in[3], 0, 0x07030702);
  out2[2] = hc_byte_perm (in[3], 0, 0x07010700);
  out2[1] = hc_byte_perm (in[2], 0, 0x07030702);
  out2[0] = hc_byte_perm (in[2], 0, 0x07010700);
  out1[3] = hc_byte_perm (in[1], 0, 0x07030702);
  out1[2] = hc_byte_perm (in[1], 0, 0x07010700);
  out1[1] = hc_byte_perm (in[0], 0, 0x07030702);
  out1[0] = hc_byte_perm (in[0], 0, 0x07010700);

  #else

  // odd outputs: input bytes 3 and 2; even outputs: input bytes 1 and 0

  out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
  out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >>  0) & 0x000000FF);
  out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
  out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >>  0) & 0x000000FF);
  out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
  out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >>  0) & 0x000000FF);
  out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);
  out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >>  0) & 0x000000FF);

  #endif
}

/**
 * Widen the 16 bytes packed in in[0..3] to 16-bit units, written to
 * out1[0..3] (from in[0], in[1]) and out2[0..3] (from in[2], in[3]).
 *
 * In the generic path each input word yields two output words: the upper two
 * input bytes go to the even output word, the lower two to the odd one. Within
 * an output word each source byte sits in the lower byte of its 16-bit half
 * (bits 0..7 and 16..23) and the upper byte is zero.
 *
 * The NV and AMD/HIP paths use hc_byte_perm() with selector constants;
 * presumably they produce the same result as the generic path.
 *
 * NOTE(review): outputs are written from the highest index down, after the
 * input word they would overwrite has been read - this looks chosen so that
 * out1 may alias in; confirm against callers before reordering.
 */
DECLSPEC void make_utf16leN (const u32x *in, u32x *out1, u32x *out2)
{
  #if defined IS_NV

  out2[3] = hc_byte_perm (in[3], 0, 0x7170);
  out2[2] = hc_byte_perm (in[3], 0, 0x7372);
  out2[1] = hc_byte_perm (in[2], 0, 0x7170);
  out2[0] = hc_byte_perm (in[2], 0, 0x7372);
  out1[3] = hc_byte_perm (in[1], 0, 0x7170);
  out1[2] = hc_byte_perm (in[1], 0, 0x7372);
  out1[1] = hc_byte_perm (in[0], 0, 0x7170);
  out1[0] = hc_byte_perm (in[0], 0, 0x7372);

  #elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM

  out2[3] = hc_byte_perm (in[3], 0, 0x07010700);
  out2[2] = hc_byte_perm (in[3], 0, 0x07030702);
  out2[1] = hc_byte_perm (in[2], 0, 0x07010700);
  out2[0] = hc_byte_perm (in[2], 0, 0x07030702);
  out1[3] = hc_byte_perm (in[1], 0, 0x07010700);
  out1[2] = hc_byte_perm (in[1], 0, 0x07030702);
  out1[1] = hc_byte_perm (in[0], 0, 0x07010700);
  out1[0] = hc_byte_perm (in[0], 0, 0x07030702);

  #else

  // odd outputs: input bytes 1 and 0; even outputs: input bytes 3 and 2

  out2[3] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >>  0) & 0x000000FF);
  out2[2] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF);
  out2[1] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >>  0) & 0x000000FF);
  out2[0] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF);
  out1[3] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >>  0) & 0x000000FF);
  out1[2] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF);
  out1[1] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >>  0) & 0x000000FF);
  out1[0] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF);

  #endif
}

/**
 * Narrow eight words of 16-bit units (in1[0..3], in2[0..3]) to four packed
 * words out[0..3], keeping the upper byte of every 16-bit half.
 *
 * In the generic path each output word is built from two consecutive input
 * words: bytes 1 and 3 of the first become output bytes 0 and 1, bytes 1 and 3
 * of the second become output bytes 2 and 3. The lower byte of every 16-bit
 * half is dropped.
 *
 * The NV and AMD/HIP paths use hc_byte_perm() with selector constants;
 * presumably they produce the same result as the generic path.
 */
DECLSPEC void undo_utf16be (const u32x *in1, const u32x *in2, u32x *out)
{
  #if defined IS_NV

  out[0] = hc_byte_perm (in1[0], in1[1], 0x4602);
  out[1] = hc_byte_perm (in1[2], in1[3], 0x4602);
  out[2] = hc_byte_perm (in2[0], in2[1], 0x4602);
  out[3] = hc_byte_perm (in2[2], in2[3], 0x4602);

  #elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM

  out[0] = hc_byte_perm (in1[0], in1[1], 0x04060002);
  out[1] = hc_byte_perm (in1[2], in1[3], 0x04060002);
  out[2] = hc_byte_perm (in2[0], in2[1], 0x04060002);
  out[3] = hc_byte_perm (in2[2], in2[3], 0x04060002);

  #else

  // pick bytes 1 and 3 of each input word and pack them tightly

  out[0] = ((in1[0] & 0x0000ff00) >>  8) | ((in1[0] & 0xff000000) >> 16)
         | ((in1[1] & 0x0000ff00) <<  8) | ((in1[1] & 0xff000000) <<  0);
  out[1] = ((in1[2] & 0x0000ff00) >>  8) | ((in1[2] & 0xff000000) >> 16)
         | ((in1[3] & 0x0000ff00) <<  8) | ((in1[3] & 0xff000000) <<  0);
  out[2] = ((in2[0] & 0x0000ff00) >>  8) | ((in2[0] & 0xff000000) >> 16)
         | ((in2[1] & 0x0000ff00) <<  8) | ((in2[1] & 0xff000000) <<  0);
  out[3] = ((in2[2] & 0x0000ff00) >>  8) | ((in2[2] & 0xff000000) >> 16)
         | ((in2[3] & 0x0000ff00) <<  8) | ((in2[3] & 0xff000000) <<  0);

  #endif
}

/**
 * Narrow eight words of 16-bit units (in1[0..3], in2[0..3]) to four packed
 * words out[0..3], keeping the lower byte of every 16-bit half.
 *
 * In the generic path each output word is built from two consecutive input
 * words: bytes 0 and 2 of the first become output bytes 0 and 1, bytes 0 and 2
 * of the second become output bytes 2 and 3. The upper byte of every 16-bit
 * half is dropped.
 *
 * The NV and AMD/HIP paths use hc_byte_perm() with selector constants;
 * presumably they produce the same result as the generic path.
 */
DECLSPEC void undo_utf16le (const u32x *in1, const u32x *in2, u32x *out)
{
  #if defined IS_NV

  out[0] = hc_byte_perm (in1[0], in1[1], 0x6420);
  out[1] = hc_byte_perm (in1[2], in1[3], 0x6420);
  out[2] = hc_byte_perm (in2[0], in2[1], 0x6420);
  out[3] = hc_byte_perm (in2[2], in2[3], 0x6420);

  #elif (defined IS_AMD || defined IS_HIP) && defined HAS_VPERM

  out[0] = hc_byte_perm (in1[0], in1[1], 0x06040200);
  out[1] = hc_byte_perm (in1[2], in1[3], 0x06040200);
  out[2] = hc_byte_perm (in2[0], in2[1], 0x06040200);
  out[3] = hc_byte_perm (in2[2], in2[3], 0x06040200);

  #else

  // pick bytes 0 and 2 of each input word and pack them tightly

  out[0] = ((in1[0] & 0x000000ff) >>  0) | ((in1[0] & 0x00ff0000) >>  8)
         | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) <<  8);
  out[1] = ((in1[2] & 0x000000ff) >>  0) | ((in1[2] & 0x00ff0000) >>  8)
         | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) <<  8);
  out[2] = ((in2[0] & 0x000000ff) >>  0) | ((in2[0] & 0x00ff0000) >>  8)
         | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) <<  8);
  out[3] = ((in2[2] & 0x000000ff) >>  0) | ((in2[2] & 0x00ff0000) >>  8)
         | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) <<  8);

  #endif
}

/**
 * Build a 4-word byte-lane mask that selects a single byte position.
 *
 * Only the low 4 bits of offset are used: (offset & 15) / 4 selects the word,
 * offset & 3 selects the byte lane within it. The selected word gets 0xff in
 * that lane, the other three words are set to 0.
 */
DECLSPEC void set_mark_1x4 (u32 *v, const u32 offset)
{
  const u32 c = (offset & 15) / 4;

  // unsigned literal: 0xff << 24 would overflow a signed int, which ISO C leaves undefined

  const u32 r = 0xffu << ((offset & 3) * 8);

  v[0] = (c == 0) ? r : 0;
  v[1] = (c == 1) ? r : 0;
  v[2] = (c == 2) ? r : 0;
  v[3] = (c == 3) ? r : 0;
}

/**
 * OR the value v into r[0..3], restricted per word to the bits set in m[0..3].
 */
DECLSPEC void append_helper_1x4 (u32x *r, const u32 v, const u32 *m)
{
  #ifdef _unroll
  #pragma unroll
  #endif
  for (int i = 0; i < 4; i++)
  {
    const u32 bits = v & m[i];

    r[i] |= bits;
  }
}

/**
 * OR a 0x80 byte into the 4-word block w0 at byte position offset.
 * The position is reduced modulo 16 by set_mark_1x4().
 */
DECLSPEC void append_0x80_1x4 (u32x *w0, const u32 offset)
{
  u32 mark[4];

  set_mark_1x4 (mark, offset);

  // 0x80 in every lane; the mark keeps exactly one of them

  append_helper_1x4 (w0, 0x80808080, mark);
}

/**
 * OR a 0x80 byte at byte position offset into a buffer made of two 4-word
 * blocks (w0, w1).
 *
 * offset / 16 selects the block, set_mark_1x4() the byte within it. If the
 * block index is beyond the last block nothing is changed.
 */
DECLSPEC void append_0x80_2x4 (u32x *w0, u32x *w1, const u32 offset)
{
  const u32 block = offset / 16;

  u32 mark[4];

  set_mark_1x4 (mark, offset);

  // only the selected block receives a non-zero value

  const u32 val0 = (block == 0) ? 0x80808080 : 0;
  const u32 val1 = (block == 1) ? 0x80808080 : 0;

  append_helper_1x4 (w0, val0, mark);
  append_helper_1x4 (w1, val1, mark);
}

/**
 * OR a 0x80 byte at byte position offset into a buffer made of three 4-word
 * blocks (w0..w2).
 *
 * offset / 16 selects the block, set_mark_1x4() the byte within it. If the
 * block index is beyond the last block nothing is changed.
 */
DECLSPEC void append_0x80_3x4 (u32x *w0, u32x *w1, u32x *w2, const u32 offset)
{
  const u32 block = offset / 16;

  u32 mark[4];

  set_mark_1x4 (mark, offset);

  // only the selected block receives a non-zero value

  const u32 val0 = (block == 0) ? 0x80808080 : 0;
  const u32 val1 = (block == 1) ? 0x80808080 : 0;
  const u32 val2 = (block == 2) ? 0x80808080 : 0;

  append_helper_1x4 (w0, val0, mark);
  append_helper_1x4 (w1, val1, mark);
  append_helper_1x4 (w2, val2, mark);
}

/**
 * OR a 0x80 byte at byte position offset into a buffer made of four 4-word
 * blocks (w0..w3).
 *
 * offset / 16 selects the block, set_mark_1x4() the byte within it. If the
 * block index is beyond the last block nothing is changed.
 */
DECLSPEC void append_0x80_4x4 (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32 offset)
{
  const u32 block = offset / 16;

  u32 mark[4];

  set_mark_1x4 (mark, offset);

  // only the selected block receives a non-zero value

  const u32 val0 = (block == 0) ? 0x80808080 : 0;
  const u32 val1 = (block == 1) ? 0x80808080 : 0;
  const u32 val2 = (block == 2) ? 0x80808080 : 0;
  const u32 val3 = (block == 3) ? 0x80808080 : 0;

  append_helper_1x4 (w0, val0, mark);
  append_helper_1x4 (w1, val1, mark);
  append_helper_1x4 (w2, val2, mark);
  append_helper_1x4 (w3, val3, mark);
}

/**
 * OR a 0x80 byte at byte position offset into a buffer made of eight 4-word
 * blocks (w0..w7).
 *
 * offset / 16 selects the block, set_mark_1x4() the byte within it. If the
 * block index is beyond the last block nothing is changed.
 */
DECLSPEC void append_0x80_8x4 (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset)
{
  const u32 block = offset / 16;

  u32 mark[4];

  set_mark_1x4 (mark, offset);

  // only the selected block receives a non-zero value

  const u32 val0 = (block == 0) ? 0x80808080 : 0;
  const u32 val1 = (block == 1) ? 0x80808080 : 0;
  const u32 val2 = (block == 2) ? 0x80808080 : 0;
  const u32 val3 = (block == 3) ? 0x80808080 : 0;
  const u32 val4 = (block == 4) ? 0x80808080 : 0;
  const u32 val5 = (block == 5) ? 0x80808080 : 0;
  const u32 val6 = (block == 6) ? 0x80808080 : 0;
  const u32 val7 = (block == 7) ? 0x80808080 : 0;

  append_helper_1x4 (w0, val0, mark);
  append_helper_1x4 (w1, val1, mark);
  append_helper_1x4 (w2, val2, mark);
  append_helper_1x4 (w3, val3, mark);
  append_helper_1x4 (w4, val4, mark);
  append_helper_1x4 (w5, val5, mark);
  append_helper_1x4 (w6, val6, mark);
  append_helper_1x4 (w7, val7, mark);
}

/**
 * OR a 0x80 byte at byte position offset into a contiguous 16-word buffer w,
 * treated as four consecutive 4-word blocks.
 *
 * offset / 16 selects the block, set_mark_1x4() the byte within it. If the
 * block index is beyond the last block nothing is changed.
 */
DECLSPEC void append_0x80_1x16 (u32x *w, const u32 offset)
{
  const u32 block = offset / 16;

  u32 mark[4];

  set_mark_1x4 (mark, offset);

  // only the selected block receives a non-zero value

  const u32 val0 = (block == 0) ? 0x80808080 : 0;
  const u32 val1 = (block == 1) ? 0x80808080 : 0;
  const u32 val2 = (block == 2) ? 0x80808080 : 0;
  const u32 val3 = (block == 3) ? 0x80808080 : 0;

  append_helper_1x4 (w +  0, val0, mark);
  append_helper_1x4 (w +  4, val1, mark);
  append_helper_1x4 (w +  8, val2, mark);
  append_helper_1x4 (w + 12, val3, mark);
}

DECLSPEC void switch_buffer_by_offset_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case  1:
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV

  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  #if defined IS_NV
  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
  #endif

  #if (defined IS_AMD || defined IS_HIP)
  const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
  #endif

  switch (offset_switch)
  {
    case  0:
      w3[3] = hc_byte_perm (w3[2], w3[3], selector);
      w3[2] = hc_byte_perm (w3[1], w3[2], selector);
      w3[1] = hc_byte_perm (w3[0], w3[1], selector);
      w3[0] = hc_byte_perm (w2[3], w3[0], selector);
      w2[3] = hc_byte_perm (w2[2], w2[3], selector);
      w2[2] = hc_byte_perm (w2[1], w2[2], selector);
      w2[1] = hc_byte_perm (w2[0], w2[1], selector);
      w2[0] = hc_byte_perm (w1[3], w2[0], selector);
      w1[3] = hc_byte_perm (w1[2], w1[3], selector);
      w1[2] = hc_byte_perm (w1[1], w1[2], selector);
      w1[1] = hc_byte_perm (w1[0], w1[1], selector);
      w1[0] = hc_byte_perm (w0[3], w1[0], selector);
      w0[3] = hc_byte_perm (w0[2], w0[3], selector);
      w0[2] = hc_byte_perm (w0[1], w0[2], selector);
      w0[1] = hc_byte_perm (w0[0], w0[1], selector);
      w0[0] = hc_byte_perm (    0, w0[0], selector);

      break;

    case  1:
      w3[3] = hc_byte_perm (w3[1], w3[2], selector);
      w3[2] = hc_byte_perm (w3[0], w3[1], selector);
      w3[1] = hc_byte_perm (w2[3], w3[0], selector);
      w3[0] = hc_byte_perm (w2[2], w2[3], selector);
      w2[3] = hc_byte_perm (w2[1], w2[2], selector);
      w2[2] = hc_byte_perm (w2[0], w2[1], selector);
      w2[1] = hc_byte_perm (w1[3], w2[0], selector);
      w2[0] = hc_byte_perm (w1[2], w1[3], selector);
      w1[3] = hc_byte_perm (w1[1], w1[2], selector);
      w1[2] = hc_byte_perm (w1[0], w1[1], selector);
      w1[1] = hc_byte_perm (w0[3], w1[0], selector);
      w1[0] = hc_byte_perm (w0[2], w0[3], selector);
      w0[3] = hc_byte_perm (w0[1], w0[2], selector);
      w0[2] = hc_byte_perm (w0[0], w0[1], selector);
      w0[1] = hc_byte_perm (    0, w0[0], selector);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_byte_perm (w3[0], w3[1], selector);
      w3[2] = hc_byte_perm (w2[3], w3[0], selector);
      w3[1] = hc_byte_perm (w2[2], w2[3], selector);
      w3[0] = hc_byte_perm (w2[1], w2[2], selector);
      w2[3] = hc_byte_perm (w2[0], w2[1], selector);
      w2[2] = hc_byte_perm (w1[3], w2[0], selector);
      w2[1] = hc_byte_perm (w1[2], w1[3], selector);
      w2[0] = hc_byte_perm (w1[1], w1[2], selector);
      w1[3] = hc_byte_perm (w1[0], w1[1], selector);
      w1[2] = hc_byte_perm (w0[3], w1[0], selector);
      w1[1] = hc_byte_perm (w0[2], w0[3], selector);
      w1[0] = hc_byte_perm (w0[1], w0[2], selector);
      w0[3] = hc_byte_perm (w0[0], w0[1], selector);
      w0[2] = hc_byte_perm (    0, w0[0], selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_byte_perm (w2[3], w3[0], selector);
      w3[2] = hc_byte_perm (w2[2], w2[3], selector);
      w3[1] = hc_byte_perm (w2[1], w2[2], selector);
      w3[0] = hc_byte_perm (w2[0], w2[1], selector);
      w2[3] = hc_byte_perm (w1[3], w2[0], selector);
      w2[2] = hc_byte_perm (w1[2], w1[3], selector);
      w2[1] = hc_byte_perm (w1[1], w1[2], selector);
      w2[0] = hc_byte_perm (w1[0], w1[1], selector);
      w1[3] = hc_byte_perm (w0[3], w1[0], selector);
      w1[2] = hc_byte_perm (w0[2], w0[3], selector);
      w1[1] = hc_byte_perm (w0[1], w0[2], selector);
      w1[0] = hc_byte_perm (w0[0], w0[1], selector);
      w0[3] = hc_byte_perm (    0, w0[0], selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_byte_perm (w2[2], w2[3], selector);
      w3[2] = hc_byte_perm (w2[1], w2[2], selector);
      w3[1] = hc_byte_perm (w2[0], w2[1], selector);
      w3[0] = hc_byte_perm (w1[3], w2[0], selector);
      w2[3] = hc_byte_perm (w1[2], w1[3], selector);
      w2[2] = hc_byte_perm (w1[1], w1[2], selector);
      w2[1] = hc_byte_perm (w1[0], w1[1], selector);
      w2[0] = hc_byte_perm (w0[3], w1[0], selector);
      w1[3] = hc_byte_perm (w0[2], w0[3], selector);
      w1[2] = hc_byte_perm (w0[1], w0[2], selector);
      w1[1] = hc_byte_perm (w0[0], w0[1], selector);
      w1[0] = hc_byte_perm (    0, w0[0], selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_byte_perm (w2[1], w2[2], selector);
      w3[2] = hc_byte_perm (w2[0], w2[1], selector);
      w3[1] = hc_byte_perm (w1[3], w2[0], selector);
      w3[0] = hc_byte_perm (w1[2], w1[3], selector);
      w2[3] = hc_byte_perm (w1[1], w1[2], selector);
      w2[2] = hc_byte_perm (w1[0], w1[1], selector);
      w2[1] = hc_byte_perm (w0[3], w1[0], selector);
      w2[0] = hc_byte_perm (w0[2], w0[3], selector);
      w1[3] = hc_byte_perm (w0[1], w0[2], selector);
      w1[2] = hc_byte_perm (w0[0], w0[1], selector);
      w1[1] = hc_byte_perm (    0, w0[0], selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_byte_perm (w2[0], w2[1], selector);
      w3[2] = hc_byte_perm (w1[3], w2[0], selector);
      w3[1] = hc_byte_perm (w1[2], w1[3], selector);
      w3[0] = hc_byte_perm (w1[1], w1[2], selector);
      w2[3] = hc_byte_perm (w1[0], w1[1], selector);
      w2[2] = hc_byte_perm (w0[3], w1[0], selector);
      w2[1] = hc_byte_perm (w0[2], w0[3], selector);
      w2[0] = hc_byte_perm (w0[1], w0[2], selector);
      w1[3] = hc_byte_perm (w0[0], w0[1], selector);
      w1[2] = hc_byte_perm (    0, w0[0], selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_byte_perm (w1[3], w2[0], selector);
      w3[2] = hc_byte_perm (w1[2], w1[3], selector);
      w3[1] = hc_byte_perm (w1[1], w1[2], selector);
      w3[0] = hc_byte_perm (w1[0], w1[1], selector);
      w2[3] = hc_byte_perm (w0[3], w1[0], selector);
      w2[2] = hc_byte_perm (w0[2], w0[3], selector);
      w2[1] = hc_byte_perm (w0[1], w0[2], selector);
      w2[0] = hc_byte_perm (w0[0], w0[1], selector);
      w1[3] = hc_byte_perm (    0, w0[0], selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_byte_perm (w1[2], w1[3], selector);
      w3[2] = hc_byte_perm (w1[1], w1[2], selector);
      w3[1] = hc_byte_perm (w1[0], w1[1], selector);
      w3[0] = hc_byte_perm (w0[3], w1[0], selector);
      w2[3] = hc_byte_perm (w0[2], w0[3], selector);
      w2[2] = hc_byte_perm (w0[1], w0[2], selector);
      w2[1] = hc_byte_perm (w0[0], w0[1], selector);
      w2[0] = hc_byte_perm (    0, w0[0], selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_byte_perm (w1[1], w1[2], selector);
      w3[2] = hc_byte_perm (w1[0], w1[1], selector);
      w3[1] = hc_byte_perm (w0[3], w1[0], selector);
      w3[0] = hc_byte_perm (w0[2], w0[3], selector);
      w2[3] = hc_byte_perm (w0[1], w0[2], selector);
      w2[2] = hc_byte_perm (w0[0], w0[1], selector);
      w2[1] = hc_byte_perm (    0, w0[0], selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_byte_perm (w1[0], w1[1], selector);
      w3[2] = hc_byte_perm (w0[3], w1[0], selector);
      w3[1] = hc_byte_perm (w0[2], w0[3], selector);
      w3[0] = hc_byte_perm (w0[1], w0[2], selector);
      w2[3] = hc_byte_perm (w0[0], w0[1], selector);
      w2[2] = hc_byte_perm (    0, w0[0], selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_byte_perm (w0[3], w1[0], selector);
      w3[2] = hc_byte_perm (w0[2], w0[3], selector);
      w3[1] = hc_byte_perm (w0[1], w0[2], selector);
      w3[0] = hc_byte_perm (w0[0], w0[1], selector);
      w2[3] = hc_byte_perm (    0, w0[0], selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_byte_perm (w0[2], w0[3], selector);
      w3[2] = hc_byte_perm (w0[1], w0[2], selector);
      w3[1] = hc_byte_perm (w0[0], w0[1], selector);
      w3[0] = hc_byte_perm (    0, w0[0], selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_byte_perm (w0[1], w0[2], selector);
      w3[2] = hc_byte_perm (w0[0], w0[1], selector);
      w3[1] = hc_byte_perm (    0, w0[0], selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_byte_perm (w0[0], w0[1], selector);
      w3[2] = hc_byte_perm (    0, w0[0], selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_byte_perm (    0, w0[0], selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *c0, u32x *c1, u32x *c2, u32x *c3, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_bytealign (w3[3],     0, offset);
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case  1:
      c0[1] = hc_bytealign (w3[3],     0, offset);
      c0[0] = hc_bytealign (w3[2], w3[3], offset);
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_bytealign (w3[3],     0, offset);
      c0[1] = hc_bytealign (w3[2], w3[3], offset);
      c0[0] = hc_bytealign (w3[1], w3[2], offset);
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_bytealign (w3[3],     0, offset);
      c0[2] = hc_bytealign (w3[2], w3[3], offset);
      c0[1] = hc_bytealign (w3[1], w3[2], offset);
      c0[0] = hc_bytealign (w3[0], w3[1], offset);
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_bytealign (w3[3],     0, offset);
      c0[3] = hc_bytealign (w3[2], w3[3], offset);
      c0[2] = hc_bytealign (w3[1], w3[2], offset);
      c0[1] = hc_bytealign (w3[0], w3[1], offset);
      c0[0] = hc_bytealign (w2[3], w3[0], offset);
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_bytealign (w3[3],     0, offset);
      c1[0] = hc_bytealign (w3[2], w3[3], offset);
      c0[3] = hc_bytealign (w3[1], w3[2], offset);
      c0[2] = hc_bytealign (w3[0], w3[1], offset);
      c0[1] = hc_bytealign (w2[3], w3[0], offset);
      c0[0] = hc_bytealign (w2[2], w2[3], offset);
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_bytealign (w3[3],     0, offset);
      c1[1] = hc_bytealign (w3[2], w3[3], offset);
      c1[0] = hc_bytealign (w3[1], w3[2], offset);
      c0[3] = hc_bytealign (w3[0], w3[1], offset);
      c0[2] = hc_bytealign (w2[3], w3[0], offset);
      c0[1] = hc_bytealign (w2[2], w2[3], offset);
      c0[0] = hc_bytealign (w2[1], w2[2], offset);
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_bytealign (w3[3],     0, offset);
      c1[2] = hc_bytealign (w3[2], w3[3], offset);
      c1[1] = hc_bytealign (w3[1], w3[2], offset);
      c1[0] = hc_bytealign (w3[0], w3[1], offset);
      c0[3] = hc_bytealign (w2[3], w3[0], offset);
      c0[2] = hc_bytealign (w2[2], w2[3], offset);
      c0[1] = hc_bytealign (w2[1], w2[2], offset);
      c0[0] = hc_bytealign (w2[0], w2[1], offset);
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_bytealign (w3[3],     0, offset);
      c1[3] = hc_bytealign (w3[2], w3[3], offset);
      c1[2] = hc_bytealign (w3[1], w3[2], offset);
      c1[1] = hc_bytealign (w3[0], w3[1], offset);
      c1[0] = hc_bytealign (w2[3], w3[0], offset);
      c0[3] = hc_bytealign (w2[2], w2[3], offset);
      c0[2] = hc_bytealign (w2[1], w2[2], offset);
      c0[1] = hc_bytealign (w2[0], w2[1], offset);
      c0[0] = hc_bytealign (w1[3], w2[0], offset);
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_bytealign (w3[3],     0, offset);
      c2[0] = hc_bytealign (w3[2], w3[3], offset);
      c1[3] = hc_bytealign (w3[1], w3[2], offset);
      c1[2] = hc_bytealign (w3[0], w3[1], offset);
      c1[1] = hc_bytealign (w2[3], w3[0], offset);
      c1[0] = hc_bytealign (w2[2], w2[3], offset);
      c0[3] = hc_bytealign (w2[1], w2[2], offset);
      c0[2] = hc_bytealign (w2[0], w2[1], offset);
      c0[1] = hc_bytealign (w1[3], w2[0], offset);
      c0[0] = hc_bytealign (w1[2], w1[3], offset);
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_bytealign (w3[3],     0, offset);
      c2[1] = hc_bytealign (w3[2], w3[3], offset);
      c2[0] = hc_bytealign (w3[1], w3[2], offset);
      c1[3] = hc_bytealign (w3[0], w3[1], offset);
      c1[2] = hc_bytealign (w2[3], w3[0], offset);
      c1[1] = hc_bytealign (w2[2], w2[3], offset);
      c1[0] = hc_bytealign (w2[1], w2[2], offset);
      c0[3] = hc_bytealign (w2[0], w2[1], offset);
      c0[2] = hc_bytealign (w1[3], w2[0], offset);
      c0[1] = hc_bytealign (w1[2], w1[3], offset);
      c0[0] = hc_bytealign (w1[1], w1[2], offset);
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_bytealign (w3[3],     0, offset);
      c2[2] = hc_bytealign (w3[2], w3[3], offset);
      c2[1] = hc_bytealign (w3[1], w3[2], offset);
      c2[0] = hc_bytealign (w3[0], w3[1], offset);
      c1[3] = hc_bytealign (w2[3], w3[0], offset);
      c1[2] = hc_bytealign (w2[2], w2[3], offset);
      c1[1] = hc_bytealign (w2[1], w2[2], offset);
      c1[0] = hc_bytealign (w2[0], w2[1], offset);
      c0[3] = hc_bytealign (w1[3], w2[0], offset);
      c0[2] = hc_bytealign (w1[2], w1[3], offset);
      c0[1] = hc_bytealign (w1[1], w1[2], offset);
      c0[0] = hc_bytealign (w1[0], w1[1], offset);
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_bytealign (w3[3],     0, offset);
      c2[3] = hc_bytealign (w3[2], w3[3], offset);
      c2[2] = hc_bytealign (w3[1], w3[2], offset);
      c2[1] = hc_bytealign (w3[0], w3[1], offset);
      c2[0] = hc_bytealign (w2[3], w3[0], offset);
      c1[3] = hc_bytealign (w2[2], w2[3], offset);
      c1[2] = hc_bytealign (w2[1], w2[2], offset);
      c1[1] = hc_bytealign (w2[0], w2[1], offset);
      c1[0] = hc_bytealign (w1[3], w2[0], offset);
      c0[3] = hc_bytealign (w1[2], w1[3], offset);
      c0[2] = hc_bytealign (w1[1], w1[2], offset);
      c0[1] = hc_bytealign (w1[0], w1[1], offset);
      c0[0] = hc_bytealign (w0[3], w1[0], offset);
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_bytealign (w3[3],     0, offset);
      c3[0] = hc_bytealign (w3[2], w3[3], offset);
      c2[3] = hc_bytealign (w3[1], w3[2], offset);
      c2[2] = hc_bytealign (w3[0], w3[1], offset);
      c2[1] = hc_bytealign (w2[3], w3[0], offset);
      c2[0] = hc_bytealign (w2[2], w2[3], offset);
      c1[3] = hc_bytealign (w2[1], w2[2], offset);
      c1[2] = hc_bytealign (w2[0], w2[1], offset);
      c1[1] = hc_bytealign (w1[3], w2[0], offset);
      c1[0] = hc_bytealign (w1[2], w1[3], offset);
      c0[3] = hc_bytealign (w1[1], w1[2], offset);
      c0[2] = hc_bytealign (w1[0], w1[1], offset);
      c0[1] = hc_bytealign (w0[3], w1[0], offset);
      c0[0] = hc_bytealign (w0[2], w0[3], offset);
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_bytealign (w3[3],     0, offset);
      c3[1] = hc_bytealign (w3[2], w3[3], offset);
      c3[0] = hc_bytealign (w3[1], w3[2], offset);
      c2[3] = hc_bytealign (w3[0], w3[1], offset);
      c2[2] = hc_bytealign (w2[3], w3[0], offset);
      c2[1] = hc_bytealign (w2[2], w2[3], offset);
      c2[0] = hc_bytealign (w2[1], w2[2], offset);
      c1[3] = hc_bytealign (w2[0], w2[1], offset);
      c1[2] = hc_bytealign (w1[3], w2[0], offset);
      c1[1] = hc_bytealign (w1[2], w1[3], offset);
      c1[0] = hc_bytealign (w1[1], w1[2], offset);
      c0[3] = hc_bytealign (w1[0], w1[1], offset);
      c0[2] = hc_bytealign (w0[3], w1[0], offset);
      c0[1] = hc_bytealign (w0[2], w0[3], offset);
      c0[0] = hc_bytealign (w0[1], w0[2], offset);
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_bytealign (w3[3],     0, offset);
      c3[2] = hc_bytealign (w3[2], w3[3], offset);
      c3[1] = hc_bytealign (w3[1], w3[2], offset);
      c3[0] = hc_bytealign (w3[0], w3[1], offset);
      c2[3] = hc_bytealign (w2[3], w3[0], offset);
      c2[2] = hc_bytealign (w2[2], w2[3], offset);
      c2[1] = hc_bytealign (w2[1], w2[2], offset);
      c2[0] = hc_bytealign (w2[0], w2[1], offset);
      c1[3] = hc_bytealign (w1[3], w2[0], offset);
      c1[2] = hc_bytealign (w1[2], w1[3], offset);
      c1[1] = hc_bytealign (w1[1], w1[2], offset);
      c1[0] = hc_bytealign (w1[0], w1[1], offset);
      c0[3] = hc_bytealign (w0[3], w1[0], offset);
      c0[2] = hc_bytealign (w0[2], w0[3], offset);
      c0[1] = hc_bytealign (w0[1], w0[2], offset);
      c0[0] = hc_bytealign (w0[0], w0[1], offset);
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #ifdef IS_NV
  // atm only same code as for AMD, but could be improved
  switch (offset_switch)
  {
    case 0:
      c0[0] = hc_bytealign (w3[3],     0, offset);
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case 1:
      c0[1] = hc_bytealign (w3[3],     0, offset);
      c0[0] = hc_bytealign (w3[2], w3[3], offset);
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case 2:
      c0[2] = hc_bytealign (w3[3],     0, offset);
      c0[1] = hc_bytealign (w3[2], w3[3], offset);
      c0[0] = hc_bytealign (w3[1], w3[2], offset);
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 3:
      c0[3] = hc_bytealign (w3[3],     0, offset);
      c0[2] = hc_bytealign (w3[2], w3[3], offset);
      c0[1] = hc_bytealign (w3[1], w3[2], offset);
      c0[0] = hc_bytealign (w3[0], w3[1], offset);
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 4:
      c1[0] = hc_bytealign (w3[3],     0, offset);
      c0[3] = hc_bytealign (w3[2], w3[3], offset);
      c0[2] = hc_bytealign (w3[1], w3[2], offset);
      c0[1] = hc_bytealign (w3[0], w3[1], offset);
      c0[0] = hc_bytealign (w2[3], w3[0], offset);
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 5:
      c1[1] = hc_bytealign (w3[3],     0, offset);
      c1[0] = hc_bytealign (w3[2], w3[3], offset);
      c0[3] = hc_bytealign (w3[1], w3[2], offset);
      c0[2] = hc_bytealign (w3[0], w3[1], offset);
      c0[1] = hc_bytealign (w2[3], w3[0], offset);
      c0[0] = hc_bytealign (w2[2], w2[3], offset);
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 6:
      c1[2] = hc_bytealign (w3[3],     0, offset);
      c1[1] = hc_bytealign (w3[2], w3[3], offset);
      c1[0] = hc_bytealign (w3[1], w3[2], offset);
      c0[3] = hc_bytealign (w3[0], w3[1], offset);
      c0[2] = hc_bytealign (w2[3], w3[0], offset);
      c0[1] = hc_bytealign (w2[2], w2[3], offset);
      c0[0] = hc_bytealign (w2[1], w2[2], offset);
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 7:
      c1[3] = hc_bytealign (w3[3],     0, offset);
      c1[2] = hc_bytealign (w3[2], w3[3], offset);
      c1[1] = hc_bytealign (w3[1], w3[2], offset);
      c1[0] = hc_bytealign (w3[0], w3[1], offset);
      c0[3] = hc_bytealign (w2[3], w3[0], offset);
      c0[2] = hc_bytealign (w2[2], w2[3], offset);
      c0[1] = hc_bytealign (w2[1], w2[2], offset);
      c0[0] = hc_bytealign (w2[0], w2[1], offset);
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 8:
      c2[0] = hc_bytealign (w3[3],     0, offset);
      c1[3] = hc_bytealign (w3[2], w3[3], offset);
      c1[2] = hc_bytealign (w3[1], w3[2], offset);
      c1[1] = hc_bytealign (w3[0], w3[1], offset);
      c1[0] = hc_bytealign (w2[3], w3[0], offset);
      c0[3] = hc_bytealign (w2[2], w2[3], offset);
      c0[2] = hc_bytealign (w2[1], w2[2], offset);
      c0[1] = hc_bytealign (w2[0], w2[1], offset);
      c0[0] = hc_bytealign (w1[3], w2[0], offset);
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 9:
      c2[1] = hc_bytealign (w3[3],     0, offset);
      c2[0] = hc_bytealign (w3[2], w3[3], offset);
      c1[3] = hc_bytealign (w3[1], w3[2], offset);
      c1[2] = hc_bytealign (w3[0], w3[1], offset);
      c1[1] = hc_bytealign (w2[3], w3[0], offset);
      c1[0] = hc_bytealign (w2[2], w2[3], offset);
      c0[3] = hc_bytealign (w2[1], w2[2], offset);
      c0[2] = hc_bytealign (w2[0], w2[1], offset);
      c0[1] = hc_bytealign (w1[3], w2[0], offset);
      c0[0] = hc_bytealign (w1[2], w1[3], offset);
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_bytealign (w3[3],     0, offset);
      c2[1] = hc_bytealign (w3[2], w3[3], offset);
      c2[0] = hc_bytealign (w3[1], w3[2], offset);
      c1[3] = hc_bytealign (w3[0], w3[1], offset);
      c1[2] = hc_bytealign (w2[3], w3[0], offset);
      c1[1] = hc_bytealign (w2[2], w2[3], offset);
      c1[0] = hc_bytealign (w2[1], w2[2], offset);
      c0[3] = hc_bytealign (w2[0], w2[1], offset);
      c0[2] = hc_bytealign (w1[3], w2[0], offset);
      c0[1] = hc_bytealign (w1[2], w1[3], offset);
      c0[0] = hc_bytealign (w1[1], w1[2], offset);
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_bytealign (w3[3],     0, offset);
      c2[2] = hc_bytealign (w3[2], w3[3], offset);
      c2[1] = hc_bytealign (w3[1], w3[2], offset);
      c2[0] = hc_bytealign (w3[0], w3[1], offset);
      c1[3] = hc_bytealign (w2[3], w3[0], offset);
      c1[2] = hc_bytealign (w2[2], w2[3], offset);
      c1[1] = hc_bytealign (w2[1], w2[2], offset);
      c1[0] = hc_bytealign (w2[0], w2[1], offset);
      c0[3] = hc_bytealign (w1[3], w2[0], offset);
      c0[2] = hc_bytealign (w1[2], w1[3], offset);
      c0[1] = hc_bytealign (w1[1], w1[2], offset);
      c0[0] = hc_bytealign (w1[0], w1[1], offset);
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_bytealign (w3[3],     0, offset);
      c2[3] = hc_bytealign (w3[2], w3[3], offset);
      c2[2] = hc_bytealign (w3[1], w3[2], offset);
      c2[1] = hc_bytealign (w3[0], w3[1], offset);
      c2[0] = hc_bytealign (w2[3], w3[0], offset);
      c1[3] = hc_bytealign (w2[2], w2[3], offset);
      c1[2] = hc_bytealign (w2[1], w2[2], offset);
      c1[1] = hc_bytealign (w2[0], w2[1], offset);
      c1[0] = hc_bytealign (w1[3], w2[0], offset);
      c0[3] = hc_bytealign (w1[2], w1[3], offset);
      c0[2] = hc_bytealign (w1[1], w1[2], offset);
      c0[1] = hc_bytealign (w1[0], w1[1], offset);
      c0[0] = hc_bytealign (w0[3], w1[0], offset);
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_bytealign (w3[3],     0, offset);
      c3[0] = hc_bytealign (w3[2], w3[3], offset);
      c2[3] = hc_bytealign (w3[1], w3[2], offset);
      c2[2] = hc_bytealign (w3[0], w3[1], offset);
      c2[1] = hc_bytealign (w2[3], w3[0], offset);
      c2[0] = hc_bytealign (w2[2], w2[3], offset);
      c1[3] = hc_bytealign (w2[1], w2[2], offset);
      c1[2] = hc_bytealign (w2[0], w2[1], offset);
      c1[1] = hc_bytealign (w1[3], w2[0], offset);
      c1[0] = hc_bytealign (w1[2], w1[3], offset);
      c0[3] = hc_bytealign (w1[1], w1[2], offset);
      c0[2] = hc_bytealign (w1[0], w1[1], offset);
      c0[1] = hc_bytealign (w0[3], w1[0], offset);
      c0[0] = hc_bytealign (w0[2], w0[3], offset);
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_bytealign (w3[3],     0, offset);
      c3[1] = hc_bytealign (w3[2], w3[3], offset);
      c3[0] = hc_bytealign (w3[1], w3[2], offset);
      c2[3] = hc_bytealign (w3[0], w3[1], offset);
      c2[2] = hc_bytealign (w2[3], w3[0], offset);
      c2[1] = hc_bytealign (w2[2], w2[3], offset);
      c2[0] = hc_bytealign (w2[1], w2[2], offset);
      c1[3] = hc_bytealign (w2[0], w2[1], offset);
      c1[2] = hc_bytealign (w1[3], w2[0], offset);
      c1[1] = hc_bytealign (w1[2], w1[3], offset);
      c1[0] = hc_bytealign (w1[1], w1[2], offset);
      c0[3] = hc_bytealign (w1[0], w1[1], offset);
      c0[2] = hc_bytealign (w0[3], w1[0], offset);
      c0[1] = hc_bytealign (w0[2], w0[3], offset);
      c0[0] = hc_bytealign (w0[1], w0[2], offset);
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_bytealign (w3[3],     0, offset);
      c3[2] = hc_bytealign (w3[2], w3[3], offset);
      c3[1] = hc_bytealign (w3[1], w3[2], offset);
      c3[0] = hc_bytealign (w3[0], w3[1], offset);
      c2[3] = hc_bytealign (w2[3], w3[0], offset);
      c2[2] = hc_bytealign (w2[2], w2[3], offset);
      c2[1] = hc_bytealign (w2[1], w2[2], offset);
      c2[0] = hc_bytealign (w2[0], w2[1], offset);
      c1[3] = hc_bytealign (w1[3], w2[0], offset);
      c1[2] = hc_bytealign (w1[2], w1[3], offset);
      c1[1] = hc_bytealign (w1[1], w1[2], offset);
      c1[0] = hc_bytealign (w1[0], w1[1], offset);
      c0[3] = hc_bytealign (w0[3], w1[0], offset);
      c0[2] = hc_bytealign (w0[2], w0[3], offset);
      c0[1] = hc_bytealign (w0[1], w0[2], offset);
      c0[0] = hc_bytealign (w0[0], w0[1], offset);
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

/**
 * Shift the 16-word buffer made of w0[0..3], w1[0..3], w2[0..3], w3[0..3]
 * towards higher word indices by `offset` bytes, in place.
 *
 * w0..w3: the four 4-word quarters of the buffer, w0[0] being the lowest
 *         word and w3[3] the highest; all of them are read and overwritten.
 * offset: byte offset; offset / 4 selects the switch case (number of whole
 *         words to move), the remaining part is handled by the helper calls.
 *
 * In case k the k lowest words are set to 0, word k is built from (0, w0[0])
 * and every higher word is built from the two neighbouring source words that
 * sit k positions below it. Words that would move past w3[3] are not stored
 * anywhere by this function.
 *
 * There is no default case: for offset / 4 > 15 the buffer is left untouched.
 *
 * Exactly which implementation is compiled depends on the platform macros
 * (IS_AMD / IS_HIP / IS_NV / IS_GENERIC / HAS_VPERM); if neither #if below
 * matches, the function body only computes offset_switch.
 *
 * NOTE(review): the "_be" suffix presumably means the words hold big-endian
 * byte order - confirm against the callers.
 */
DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32 offset)
{
  // number of whole 32-bit words to shift by; used as the switch selector
  const int offset_switch = offset / 4;

  // Variant 1: built on hc_bytealign_be (), which receives the full offset.
  // NOTE(review): presumably hc_bytealign_be () only uses the low two bits of
  // its third argument - confirm against its definition.
  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
  switch (offset_switch)
  {
    // In every case the assignments run from w3[3] down to w0[0], so each
    // source word is read before it is overwritten. Do not reorder them.
    case  0:
      w3[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w3[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w3[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w2[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w2[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w1[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w1[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w0[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w0[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[0] = hc_bytealign_be (    0, w0[0], offset);

      break;

    case  1:
      w3[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w3[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w2[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w1[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w0[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[1] = hc_bytealign_be (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[2] = hc_bytealign_be (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[3] = hc_bytealign_be (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[0] = hc_bytealign_be (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[1] = hc_bytealign_be (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[2] = hc_bytealign_be (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[3] = hc_bytealign_be (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[0] = hc_bytealign_be (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[1] = hc_bytealign_be (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[2] = hc_bytealign_be (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[3] = hc_bytealign_be (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[0] = hc_bytealign_be (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[1] = hc_bytealign_be (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[2] = hc_bytealign_be (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_bytealign_be (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  // Variant 2: built on hc_byte_perm () with a precomputed byte selector.
  // Same word movement as variant 1, but the operand order is swapped:
  // the higher source word comes first, the lower one second.
  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV

  // NV: four selector nibbles taken from 0x76543210, starting (offset & 3)
  // nibbles up from the bottom
  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  // AMD/HIP: byte-wide selector entries, shifted by (offset & 3) bytes.
  // NOTE(review): l32_from_64_S () presumably returns the low 32 bits - confirm.
  #if (defined IS_AMD || defined IS_HIP)
  const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
  #endif

  switch (offset_switch)
  {
    // As above: assignments go from w3[3] down to w0[0] so that sources are
    // consumed before being overwritten.
    case  0:
      w3[3] = hc_byte_perm (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm (w0[0],     0, selector);

      break;

    case  1:
      w3[3] = hc_byte_perm (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      w3[3] = hc_byte_perm (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w3[3] = hc_byte_perm (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w3[3] = hc_byte_perm (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w3[3] = hc_byte_perm (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w3[3] = hc_byte_perm (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w3[3] = hc_byte_perm (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w3[3] = hc_byte_perm (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w3[3] = hc_byte_perm (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = hc_byte_perm (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = hc_byte_perm (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = hc_byte_perm (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = hc_byte_perm (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = hc_byte_perm (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = hc_byte_perm (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *c0, u32x *c1, u32x *c2, u32x *c3, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_bytealign_be (w3[3],     0, offset);
      w3[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w3[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w3[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w2[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w2[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w1[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w1[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w0[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w0[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[0] = hc_bytealign_be (    0, w0[0], offset);

      break;

    case  1:
      c0[1] = hc_bytealign_be (w3[3],     0, offset);
      c0[0] = hc_bytealign_be (w3[2], w3[3], offset);
      w3[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w3[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w2[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w1[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w0[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[1] = hc_bytealign_be (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_bytealign_be (w3[3],     0, offset);
      c0[1] = hc_bytealign_be (w3[2], w3[3], offset);
      c0[0] = hc_bytealign_be (w3[1], w3[2], offset);
      w3[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[2] = hc_bytealign_be (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_bytealign_be (w3[3],     0, offset);
      c0[2] = hc_bytealign_be (w3[2], w3[3], offset);
      c0[1] = hc_bytealign_be (w3[1], w3[2], offset);
      c0[0] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[3] = hc_bytealign_be (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_bytealign_be (w3[3],     0, offset);
      c0[3] = hc_bytealign_be (w3[2], w3[3], offset);
      c0[2] = hc_bytealign_be (w3[1], w3[2], offset);
      c0[1] = hc_bytealign_be (w3[0], w3[1], offset);
      c0[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[0] = hc_bytealign_be (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_bytealign_be (w3[3],     0, offset);
      c1[0] = hc_bytealign_be (w3[2], w3[3], offset);
      c0[3] = hc_bytealign_be (w3[1], w3[2], offset);
      c0[2] = hc_bytealign_be (w3[0], w3[1], offset);
      c0[1] = hc_bytealign_be (w2[3], w3[0], offset);
      c0[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[1] = hc_bytealign_be (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_bytealign_be (w3[3],     0, offset);
      c1[1] = hc_bytealign_be (w3[2], w3[3], offset);
      c1[0] = hc_bytealign_be (w3[1], w3[2], offset);
      c0[3] = hc_bytealign_be (w3[0], w3[1], offset);
      c0[2] = hc_bytealign_be (w2[3], w3[0], offset);
      c0[1] = hc_bytealign_be (w2[2], w2[3], offset);
      c0[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[2] = hc_bytealign_be (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_bytealign_be (w3[3],     0, offset);
      c1[2] = hc_bytealign_be (w3[2], w3[3], offset);
      c1[1] = hc_bytealign_be (w3[1], w3[2], offset);
      c1[0] = hc_bytealign_be (w3[0], w3[1], offset);
      c0[3] = hc_bytealign_be (w2[3], w3[0], offset);
      c0[2] = hc_bytealign_be (w2[2], w2[3], offset);
      c0[1] = hc_bytealign_be (w2[1], w2[2], offset);
      c0[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[3] = hc_bytealign_be (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_bytealign_be (w3[3],     0, offset);
      c1[3] = hc_bytealign_be (w3[2], w3[3], offset);
      c1[2] = hc_bytealign_be (w3[1], w3[2], offset);
      c1[1] = hc_bytealign_be (w3[0], w3[1], offset);
      c1[0] = hc_bytealign_be (w2[3], w3[0], offset);
      c0[3] = hc_bytealign_be (w2[2], w2[3], offset);
      c0[2] = hc_bytealign_be (w2[1], w2[2], offset);
      c0[1] = hc_bytealign_be (w2[0], w2[1], offset);
      c0[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[0] = hc_bytealign_be (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_bytealign_be (w3[3],     0, offset);
      c2[0] = hc_bytealign_be (w3[2], w3[3], offset);
      c1[3] = hc_bytealign_be (w3[1], w3[2], offset);
      c1[2] = hc_bytealign_be (w3[0], w3[1], offset);
      c1[1] = hc_bytealign_be (w2[3], w3[0], offset);
      c1[0] = hc_bytealign_be (w2[2], w2[3], offset);
      c0[3] = hc_bytealign_be (w2[1], w2[2], offset);
      c0[2] = hc_bytealign_be (w2[0], w2[1], offset);
      c0[1] = hc_bytealign_be (w1[3], w2[0], offset);
      c0[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[1] = hc_bytealign_be (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_bytealign_be (w3[3],     0, offset);
      c2[1] = hc_bytealign_be (w3[2], w3[3], offset);
      c2[0] = hc_bytealign_be (w3[1], w3[2], offset);
      c1[3] = hc_bytealign_be (w3[0], w3[1], offset);
      c1[2] = hc_bytealign_be (w2[3], w3[0], offset);
      c1[1] = hc_bytealign_be (w2[2], w2[3], offset);
      c1[0] = hc_bytealign_be (w2[1], w2[2], offset);
      c0[3] = hc_bytealign_be (w2[0], w2[1], offset);
      c0[2] = hc_bytealign_be (w1[3], w2[0], offset);
      c0[1] = hc_bytealign_be (w1[2], w1[3], offset);
      c0[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[2] = hc_bytealign_be (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_bytealign_be (w3[3],     0, offset);
      c2[2] = hc_bytealign_be (w3[2], w3[3], offset);
      c2[1] = hc_bytealign_be (w3[1], w3[2], offset);
      c2[0] = hc_bytealign_be (w3[0], w3[1], offset);
      c1[3] = hc_bytealign_be (w2[3], w3[0], offset);
      c1[2] = hc_bytealign_be (w2[2], w2[3], offset);
      c1[1] = hc_bytealign_be (w2[1], w2[2], offset);
      c1[0] = hc_bytealign_be (w2[0], w2[1], offset);
      c0[3] = hc_bytealign_be (w1[3], w2[0], offset);
      c0[2] = hc_bytealign_be (w1[2], w1[3], offset);
      c0[1] = hc_bytealign_be (w1[1], w1[2], offset);
      c0[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[3] = hc_bytealign_be (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_bytealign_be (w3[3],     0, offset);
      c2[3] = hc_bytealign_be (w3[2], w3[3], offset);
      c2[2] = hc_bytealign_be (w3[1], w3[2], offset);
      c2[1] = hc_bytealign_be (w3[0], w3[1], offset);
      c2[0] = hc_bytealign_be (w2[3], w3[0], offset);
      c1[3] = hc_bytealign_be (w2[2], w2[3], offset);
      c1[2] = hc_bytealign_be (w2[1], w2[2], offset);
      c1[1] = hc_bytealign_be (w2[0], w2[1], offset);
      c1[0] = hc_bytealign_be (w1[3], w2[0], offset);
      c0[3] = hc_bytealign_be (w1[2], w1[3], offset);
      c0[2] = hc_bytealign_be (w1[1], w1[2], offset);
      c0[1] = hc_bytealign_be (w1[0], w1[1], offset);
      c0[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[0] = hc_bytealign_be (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_bytealign_be (w3[3],     0, offset);
      c3[0] = hc_bytealign_be (w3[2], w3[3], offset);
      c2[3] = hc_bytealign_be (w3[1], w3[2], offset);
      c2[2] = hc_bytealign_be (w3[0], w3[1], offset);
      c2[1] = hc_bytealign_be (w2[3], w3[0], offset);
      c2[0] = hc_bytealign_be (w2[2], w2[3], offset);
      c1[3] = hc_bytealign_be (w2[1], w2[2], offset);
      c1[2] = hc_bytealign_be (w2[0], w2[1], offset);
      c1[1] = hc_bytealign_be (w1[3], w2[0], offset);
      c1[0] = hc_bytealign_be (w1[2], w1[3], offset);
      c0[3] = hc_bytealign_be (w1[1], w1[2], offset);
      c0[2] = hc_bytealign_be (w1[0], w1[1], offset);
      c0[1] = hc_bytealign_be (w0[3], w1[0], offset);
      c0[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[1] = hc_bytealign_be (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_bytealign_be (w3[3],     0, offset);
      c3[1] = hc_bytealign_be (w3[2], w3[3], offset);
      c3[0] = hc_bytealign_be (w3[1], w3[2], offset);
      c2[3] = hc_bytealign_be (w3[0], w3[1], offset);
      c2[2] = hc_bytealign_be (w2[3], w3[0], offset);
      c2[1] = hc_bytealign_be (w2[2], w2[3], offset);
      c2[0] = hc_bytealign_be (w2[1], w2[2], offset);
      c1[3] = hc_bytealign_be (w2[0], w2[1], offset);
      c1[2] = hc_bytealign_be (w1[3], w2[0], offset);
      c1[1] = hc_bytealign_be (w1[2], w1[3], offset);
      c1[0] = hc_bytealign_be (w1[1], w1[2], offset);
      c0[3] = hc_bytealign_be (w1[0], w1[1], offset);
      c0[2] = hc_bytealign_be (w0[3], w1[0], offset);
      c0[1] = hc_bytealign_be (w0[2], w0[3], offset);
      c0[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[2] = hc_bytealign_be (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_bytealign_be (w3[3],     0, offset);
      c3[2] = hc_bytealign_be (w3[2], w3[3], offset);
      c3[1] = hc_bytealign_be (w3[1], w3[2], offset);
      c3[0] = hc_bytealign_be (w3[0], w3[1], offset);
      c2[3] = hc_bytealign_be (w2[3], w3[0], offset);
      c2[2] = hc_bytealign_be (w2[2], w2[3], offset);
      c2[1] = hc_bytealign_be (w2[1], w2[2], offset);
      c2[0] = hc_bytealign_be (w2[0], w2[1], offset);
      c1[3] = hc_bytealign_be (w1[3], w2[0], offset);
      c1[2] = hc_bytealign_be (w1[2], w1[3], offset);
      c1[1] = hc_bytealign_be (w1[1], w1[2], offset);
      c1[0] = hc_bytealign_be (w1[0], w1[1], offset);
      c0[3] = hc_bytealign_be (w0[3], w1[0], offset);
      c0[2] = hc_bytealign_be (w0[2], w0[3], offset);
      c0[1] = hc_bytealign_be (w0[1], w0[2], offset);
      c0[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[3] = hc_bytealign_be (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if (defined IS_AMD || defined IS_HIP)
  const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
  #endif

  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_byte_perm (    0, w3[3], selector);
      w3[3] = hc_byte_perm (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm (w0[0],     0, selector);

      break;

    case  1:
      c0[1] = hc_byte_perm (    0, w3[3], selector);
      c0[0] = hc_byte_perm (w3[3], w3[2], selector);
      w3[3] = hc_byte_perm (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_byte_perm (    0, w3[3], selector);
      c0[1] = hc_byte_perm (w3[3], w3[2], selector);
      c0[0] = hc_byte_perm (w3[2], w3[1], selector);
      w3[3] = hc_byte_perm (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_byte_perm (    0, w3[3], selector);
      c0[2] = hc_byte_perm (w3[3], w3[2], selector);
      c0[1] = hc_byte_perm (w3[2], w3[1], selector);
      c0[0] = hc_byte_perm (w3[1], w3[0], selector);
      w3[3] = hc_byte_perm (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_byte_perm (    0, w3[3], selector);
      c0[3] = hc_byte_perm (w3[3], w3[2], selector);
      c0[2] = hc_byte_perm (w3[2], w3[1], selector);
      c0[1] = hc_byte_perm (w3[1], w3[0], selector);
      c0[0] = hc_byte_perm (w3[0], w2[3], selector);
      w3[3] = hc_byte_perm (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_byte_perm (    0, w3[3], selector);
      c1[0] = hc_byte_perm (w3[3], w3[2], selector);
      c0[3] = hc_byte_perm (w3[2], w3[1], selector);
      c0[2] = hc_byte_perm (w3[1], w3[0], selector);
      c0[1] = hc_byte_perm (w3[0], w2[3], selector);
      c0[0] = hc_byte_perm (w2[3], w2[2], selector);
      w3[3] = hc_byte_perm (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_byte_perm (    0, w3[3], selector);
      c1[1] = hc_byte_perm (w3[3], w3[2], selector);
      c1[0] = hc_byte_perm (w3[2], w3[1], selector);
      c0[3] = hc_byte_perm (w3[1], w3[0], selector);
      c0[2] = hc_byte_perm (w3[0], w2[3], selector);
      c0[1] = hc_byte_perm (w2[3], w2[2], selector);
      c0[0] = hc_byte_perm (w2[2], w2[1], selector);
      w3[3] = hc_byte_perm (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_byte_perm (    0, w3[3], selector);
      c1[2] = hc_byte_perm (w3[3], w3[2], selector);
      c1[1] = hc_byte_perm (w3[2], w3[1], selector);
      c1[0] = hc_byte_perm (w3[1], w3[0], selector);
      c0[3] = hc_byte_perm (w3[0], w2[3], selector);
      c0[2] = hc_byte_perm (w2[3], w2[2], selector);
      c0[1] = hc_byte_perm (w2[2], w2[1], selector);
      c0[0] = hc_byte_perm (w2[1], w2[0], selector);
      w3[3] = hc_byte_perm (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_byte_perm (    0, w3[3], selector);
      c1[3] = hc_byte_perm (w3[3], w3[2], selector);
      c1[2] = hc_byte_perm (w3[2], w3[1], selector);
      c1[1] = hc_byte_perm (w3[1], w3[0], selector);
      c1[0] = hc_byte_perm (w3[0], w2[3], selector);
      c0[3] = hc_byte_perm (w2[3], w2[2], selector);
      c0[2] = hc_byte_perm (w2[2], w2[1], selector);
      c0[1] = hc_byte_perm (w2[1], w2[0], selector);
      c0[0] = hc_byte_perm (w2[0], w1[3], selector);
      w3[3] = hc_byte_perm (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_byte_perm (    0, w3[3], selector);
      c2[0] = hc_byte_perm (w3[3], w3[2], selector);
      c1[3] = hc_byte_perm (w3[2], w3[1], selector);
      c1[2] = hc_byte_perm (w3[1], w3[0], selector);
      c1[1] = hc_byte_perm (w3[0], w2[3], selector);
      c1[0] = hc_byte_perm (w2[3], w2[2], selector);
      c0[3] = hc_byte_perm (w2[2], w2[1], selector);
      c0[2] = hc_byte_perm (w2[1], w2[0], selector);
      c0[1] = hc_byte_perm (w2[0], w1[3], selector);
      c0[0] = hc_byte_perm (w1[3], w1[2], selector);
      w3[3] = hc_byte_perm (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_byte_perm (    0, w3[3], selector);
      c2[1] = hc_byte_perm (w3[3], w3[2], selector);
      c2[0] = hc_byte_perm (w3[2], w3[1], selector);
      c1[3] = hc_byte_perm (w3[1], w3[0], selector);
      c1[2] = hc_byte_perm (w3[0], w2[3], selector);
      c1[1] = hc_byte_perm (w2[3], w2[2], selector);
      c1[0] = hc_byte_perm (w2[2], w2[1], selector);
      c0[3] = hc_byte_perm (w2[1], w2[0], selector);
      c0[2] = hc_byte_perm (w2[0], w1[3], selector);
      c0[1] = hc_byte_perm (w1[3], w1[2], selector);
      c0[0] = hc_byte_perm (w1[2], w1[1], selector);
      w3[3] = hc_byte_perm (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_byte_perm (    0, w3[3], selector);
      c2[2] = hc_byte_perm (w3[3], w3[2], selector);
      c2[1] = hc_byte_perm (w3[2], w3[1], selector);
      c2[0] = hc_byte_perm (w3[1], w3[0], selector);
      c1[3] = hc_byte_perm (w3[0], w2[3], selector);
      c1[2] = hc_byte_perm (w2[3], w2[2], selector);
      c1[1] = hc_byte_perm (w2[2], w2[1], selector);
      c1[0] = hc_byte_perm (w2[1], w2[0], selector);
      c0[3] = hc_byte_perm (w2[0], w1[3], selector);
      c0[2] = hc_byte_perm (w1[3], w1[2], selector);
      c0[1] = hc_byte_perm (w1[2], w1[1], selector);
      c0[0] = hc_byte_perm (w1[1], w1[0], selector);
      w3[3] = hc_byte_perm (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_byte_perm (    0, w3[3], selector);
      c2[3] = hc_byte_perm (w3[3], w3[2], selector);
      c2[2] = hc_byte_perm (w3[2], w3[1], selector);
      c2[1] = hc_byte_perm (w3[1], w3[0], selector);
      c2[0] = hc_byte_perm (w3[0], w2[3], selector);
      c1[3] = hc_byte_perm (w2[3], w2[2], selector);
      c1[2] = hc_byte_perm (w2[2], w2[1], selector);
      c1[1] = hc_byte_perm (w2[1], w2[0], selector);
      c1[0] = hc_byte_perm (w2[0], w1[3], selector);
      c0[3] = hc_byte_perm (w1[3], w1[2], selector);
      c0[2] = hc_byte_perm (w1[2], w1[1], selector);
      c0[1] = hc_byte_perm (w1[1], w1[0], selector);
      c0[0] = hc_byte_perm (w1[0], w0[3], selector);
      w3[3] = hc_byte_perm (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_byte_perm (    0, w3[3], selector);
      c3[0] = hc_byte_perm (w3[3], w3[2], selector);
      c2[3] = hc_byte_perm (w3[2], w3[1], selector);
      c2[2] = hc_byte_perm (w3[1], w3[0], selector);
      c2[1] = hc_byte_perm (w3[0], w2[3], selector);
      c2[0] = hc_byte_perm (w2[3], w2[2], selector);
      c1[3] = hc_byte_perm (w2[2], w2[1], selector);
      c1[2] = hc_byte_perm (w2[1], w2[0], selector);
      c1[1] = hc_byte_perm (w2[0], w1[3], selector);
      c1[0] = hc_byte_perm (w1[3], w1[2], selector);
      c0[3] = hc_byte_perm (w1[2], w1[1], selector);
      c0[2] = hc_byte_perm (w1[1], w1[0], selector);
      c0[1] = hc_byte_perm (w1[0], w0[3], selector);
      c0[0] = hc_byte_perm (w0[3], w0[2], selector);
      w3[3] = hc_byte_perm (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_byte_perm (    0, w3[3], selector);
      c3[1] = hc_byte_perm (w3[3], w3[2], selector);
      c3[0] = hc_byte_perm (w3[2], w3[1], selector);
      c2[3] = hc_byte_perm (w3[1], w3[0], selector);
      c2[2] = hc_byte_perm (w3[0], w2[3], selector);
      c2[1] = hc_byte_perm (w2[3], w2[2], selector);
      c2[0] = hc_byte_perm (w2[2], w2[1], selector);
      c1[3] = hc_byte_perm (w2[1], w2[0], selector);
      c1[2] = hc_byte_perm (w2[0], w1[3], selector);
      c1[1] = hc_byte_perm (w1[3], w1[2], selector);
      c1[0] = hc_byte_perm (w1[2], w1[1], selector);
      c0[3] = hc_byte_perm (w1[1], w1[0], selector);
      c0[2] = hc_byte_perm (w1[0], w0[3], selector);
      c0[1] = hc_byte_perm (w0[3], w0[2], selector);
      c0[0] = hc_byte_perm (w0[2], w0[1], selector);
      w3[3] = hc_byte_perm (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_byte_perm (    0, w3[3], selector);
      c3[2] = hc_byte_perm (w3[3], w3[2], selector);
      c3[1] = hc_byte_perm (w3[2], w3[1], selector);
      c3[0] = hc_byte_perm (w3[1], w3[0], selector);
      c2[3] = hc_byte_perm (w3[0], w2[3], selector);
      c2[2] = hc_byte_perm (w2[3], w2[2], selector);
      c2[1] = hc_byte_perm (w2[2], w2[1], selector);
      c2[0] = hc_byte_perm (w2[1], w2[0], selector);
      c1[3] = hc_byte_perm (w2[0], w1[3], selector);
      c1[2] = hc_byte_perm (w1[3], w1[2], selector);
      c1[1] = hc_byte_perm (w1[2], w1[1], selector);
      c1[0] = hc_byte_perm (w1[1], w1[0], selector);
      c0[3] = hc_byte_perm (w1[0], w0[3], selector);
      c0[2] = hc_byte_perm (w0[3], w0[2], selector);
      c0[1] = hc_byte_perm (w0[2], w0[1], selector);
      c0[0] = hc_byte_perm (w0[1], w0[0], selector);
      w3[3] = hc_byte_perm (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      w7[3] = hc_bytealign (w7[2], w7[3], offset);
      w7[2] = hc_bytealign (w7[1], w7[2], offset);
      w7[1] = hc_bytealign (w7[0], w7[1], offset);
      w7[0] = hc_bytealign (w6[3], w7[0], offset);
      w6[3] = hc_bytealign (w6[2], w6[3], offset);
      w6[2] = hc_bytealign (w6[1], w6[2], offset);
      w6[1] = hc_bytealign (w6[0], w6[1], offset);
      w6[0] = hc_bytealign (w5[3], w6[0], offset);
      w5[3] = hc_bytealign (w5[2], w5[3], offset);
      w5[2] = hc_bytealign (w5[1], w5[2], offset);
      w5[1] = hc_bytealign (w5[0], w5[1], offset);
      w5[0] = hc_bytealign (w4[3], w5[0], offset);
      w4[3] = hc_bytealign (w4[2], w4[3], offset);
      w4[2] = hc_bytealign (w4[1], w4[2], offset);
      w4[1] = hc_bytealign (w4[0], w4[1], offset);
      w4[0] = hc_bytealign (w3[3], w4[0], offset);
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case  1:
      w7[3] = hc_bytealign (w7[1], w7[2], offset);
      w7[2] = hc_bytealign (w7[0], w7[1], offset);
      w7[1] = hc_bytealign (w6[3], w7[0], offset);
      w7[0] = hc_bytealign (w6[2], w6[3], offset);
      w6[3] = hc_bytealign (w6[1], w6[2], offset);
      w6[2] = hc_bytealign (w6[0], w6[1], offset);
      w6[1] = hc_bytealign (w5[3], w6[0], offset);
      w6[0] = hc_bytealign (w5[2], w5[3], offset);
      w5[3] = hc_bytealign (w5[1], w5[2], offset);
      w5[2] = hc_bytealign (w5[0], w5[1], offset);
      w5[1] = hc_bytealign (w4[3], w5[0], offset);
      w5[0] = hc_bytealign (w4[2], w4[3], offset);
      w4[3] = hc_bytealign (w4[1], w4[2], offset);
      w4[2] = hc_bytealign (w4[0], w4[1], offset);
      w4[1] = hc_bytealign (w3[3], w4[0], offset);
      w4[0] = hc_bytealign (w3[2], w3[3], offset);
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w7[3] = hc_bytealign (w7[0], w7[1], offset);
      w7[2] = hc_bytealign (w6[3], w7[0], offset);
      w7[1] = hc_bytealign (w6[2], w6[3], offset);
      w7[0] = hc_bytealign (w6[1], w6[2], offset);
      w6[3] = hc_bytealign (w6[0], w6[1], offset);
      w6[2] = hc_bytealign (w5[3], w6[0], offset);
      w6[1] = hc_bytealign (w5[2], w5[3], offset);
      w6[0] = hc_bytealign (w5[1], w5[2], offset);
      w5[3] = hc_bytealign (w5[0], w5[1], offset);
      w5[2] = hc_bytealign (w4[3], w5[0], offset);
      w5[1] = hc_bytealign (w4[2], w4[3], offset);
      w5[0] = hc_bytealign (w4[1], w4[2], offset);
      w4[3] = hc_bytealign (w4[0], w4[1], offset);
      w4[2] = hc_bytealign (w3[3], w4[0], offset);
      w4[1] = hc_bytealign (w3[2], w3[3], offset);
      w4[0] = hc_bytealign (w3[1], w3[2], offset);
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w7[3] = hc_bytealign (w6[3], w7[0], offset);
      w7[2] = hc_bytealign (w6[2], w6[3], offset);
      w7[1] = hc_bytealign (w6[1], w6[2], offset);
      w7[0] = hc_bytealign (w6[0], w6[1], offset);
      w6[3] = hc_bytealign (w5[3], w6[0], offset);
      w6[2] = hc_bytealign (w5[2], w5[3], offset);
      w6[1] = hc_bytealign (w5[1], w5[2], offset);
      w6[0] = hc_bytealign (w5[0], w5[1], offset);
      w5[3] = hc_bytealign (w4[3], w5[0], offset);
      w5[2] = hc_bytealign (w4[2], w4[3], offset);
      w5[1] = hc_bytealign (w4[1], w4[2], offset);
      w5[0] = hc_bytealign (w4[0], w4[1], offset);
      w4[3] = hc_bytealign (w3[3], w4[0], offset);
      w4[2] = hc_bytealign (w3[2], w3[3], offset);
      w4[1] = hc_bytealign (w3[1], w3[2], offset);
      w4[0] = hc_bytealign (w3[0], w3[1], offset);
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w7[3] = hc_bytealign (w6[2], w6[3], offset);
      w7[2] = hc_bytealign (w6[1], w6[2], offset);
      w7[1] = hc_bytealign (w6[0], w6[1], offset);
      w7[0] = hc_bytealign (w5[3], w6[0], offset);
      w6[3] = hc_bytealign (w5[2], w5[3], offset);
      w6[2] = hc_bytealign (w5[1], w5[2], offset);
      w6[1] = hc_bytealign (w5[0], w5[1], offset);
      w6[0] = hc_bytealign (w4[3], w5[0], offset);
      w5[3] = hc_bytealign (w4[2], w4[3], offset);
      w5[2] = hc_bytealign (w4[1], w4[2], offset);
      w5[1] = hc_bytealign (w4[0], w4[1], offset);
      w5[0] = hc_bytealign (w3[3], w4[0], offset);
      w4[3] = hc_bytealign (w3[2], w3[3], offset);
      w4[2] = hc_bytealign (w3[1], w3[2], offset);
      w4[1] = hc_bytealign (w3[0], w3[1], offset);
      w4[0] = hc_bytealign (w2[3], w3[0], offset);
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w7[3] = hc_bytealign (w6[1], w6[2], offset);
      w7[2] = hc_bytealign (w6[0], w6[1], offset);
      w7[1] = hc_bytealign (w5[3], w6[0], offset);
      w7[0] = hc_bytealign (w5[2], w5[3], offset);
      w6[3] = hc_bytealign (w5[1], w5[2], offset);
      w6[2] = hc_bytealign (w5[0], w5[1], offset);
      w6[1] = hc_bytealign (w4[3], w5[0], offset);
      w6[0] = hc_bytealign (w4[2], w4[3], offset);
      w5[3] = hc_bytealign (w4[1], w4[2], offset);
      w5[2] = hc_bytealign (w4[0], w4[1], offset);
      w5[1] = hc_bytealign (w3[3], w4[0], offset);
      w5[0] = hc_bytealign (w3[2], w3[3], offset);
      w4[3] = hc_bytealign (w3[1], w3[2], offset);
      w4[2] = hc_bytealign (w3[0], w3[1], offset);
      w4[1] = hc_bytealign (w2[3], w3[0], offset);
      w4[0] = hc_bytealign (w2[2], w2[3], offset);
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w7[3] = hc_bytealign (w6[0], w6[1], offset);
      w7[2] = hc_bytealign (w5[3], w6[0], offset);
      w7[1] = hc_bytealign (w5[2], w5[3], offset);
      w7[0] = hc_bytealign (w5[1], w5[2], offset);
      w6[3] = hc_bytealign (w5[0], w5[1], offset);
      w6[2] = hc_bytealign (w4[3], w5[0], offset);
      w6[1] = hc_bytealign (w4[2], w4[3], offset);
      w6[0] = hc_bytealign (w4[1], w4[2], offset);
      w5[3] = hc_bytealign (w4[0], w4[1], offset);
      w5[2] = hc_bytealign (w3[3], w4[0], offset);
      w5[1] = hc_bytealign (w3[2], w3[3], offset);
      w5[0] = hc_bytealign (w3[1], w3[2], offset);
      w4[3] = hc_bytealign (w3[0], w3[1], offset);
      w4[2] = hc_bytealign (w2[3], w3[0], offset);
      w4[1] = hc_bytealign (w2[2], w2[3], offset);
      w4[0] = hc_bytealign (w2[1], w2[2], offset);
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w7[3] = hc_bytealign (w5[3], w6[0], offset);
      w7[2] = hc_bytealign (w5[2], w5[3], offset);
      w7[1] = hc_bytealign (w5[1], w5[2], offset);
      w7[0] = hc_bytealign (w5[0], w5[1], offset);
      w6[3] = hc_bytealign (w4[3], w5[0], offset);
      w6[2] = hc_bytealign (w4[2], w4[3], offset);
      w6[1] = hc_bytealign (w4[1], w4[2], offset);
      w6[0] = hc_bytealign (w4[0], w4[1], offset);
      w5[3] = hc_bytealign (w3[3], w4[0], offset);
      w5[2] = hc_bytealign (w3[2], w3[3], offset);
      w5[1] = hc_bytealign (w3[1], w3[2], offset);
      w5[0] = hc_bytealign (w3[0], w3[1], offset);
      w4[3] = hc_bytealign (w2[3], w3[0], offset);
      w4[2] = hc_bytealign (w2[2], w2[3], offset);
      w4[1] = hc_bytealign (w2[1], w2[2], offset);
      w4[0] = hc_bytealign (w2[0], w2[1], offset);
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w7[3] = hc_bytealign (w5[2], w5[3], offset);
      w7[2] = hc_bytealign (w5[1], w5[2], offset);
      w7[1] = hc_bytealign (w5[0], w5[1], offset);
      w7[0] = hc_bytealign (w4[3], w5[0], offset);
      w6[3] = hc_bytealign (w4[2], w4[3], offset);
      w6[2] = hc_bytealign (w4[1], w4[2], offset);
      w6[1] = hc_bytealign (w4[0], w4[1], offset);
      w6[0] = hc_bytealign (w3[3], w4[0], offset);
      w5[3] = hc_bytealign (w3[2], w3[3], offset);
      w5[2] = hc_bytealign (w3[1], w3[2], offset);
      w5[1] = hc_bytealign (w3[0], w3[1], offset);
      w5[0] = hc_bytealign (w2[3], w3[0], offset);
      w4[3] = hc_bytealign (w2[2], w2[3], offset);
      w4[2] = hc_bytealign (w2[1], w2[2], offset);
      w4[1] = hc_bytealign (w2[0], w2[1], offset);
      w4[0] = hc_bytealign (w1[3], w2[0], offset);
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w7[3] = hc_bytealign (w5[1], w5[2], offset);
      w7[2] = hc_bytealign (w5[0], w5[1], offset);
      w7[1] = hc_bytealign (w4[3], w5[0], offset);
      w7[0] = hc_bytealign (w4[2], w4[3], offset);
      w6[3] = hc_bytealign (w4[1], w4[2], offset);
      w6[2] = hc_bytealign (w4[0], w4[1], offset);
      w6[1] = hc_bytealign (w3[3], w4[0], offset);
      w6[0] = hc_bytealign (w3[2], w3[3], offset);
      w5[3] = hc_bytealign (w3[1], w3[2], offset);
      w5[2] = hc_bytealign (w3[0], w3[1], offset);
      w5[1] = hc_bytealign (w2[3], w3[0], offset);
      w5[0] = hc_bytealign (w2[2], w2[3], offset);
      w4[3] = hc_bytealign (w2[1], w2[2], offset);
      w4[2] = hc_bytealign (w2[0], w2[1], offset);
      w4[1] = hc_bytealign (w1[3], w2[0], offset);
      w4[0] = hc_bytealign (w1[2], w1[3], offset);
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w7[3] = hc_bytealign (w5[0], w5[1], offset);
      w7[2] = hc_bytealign (w4[3], w5[0], offset);
      w7[1] = hc_bytealign (w4[2], w4[3], offset);
      w7[0] = hc_bytealign (w4[1], w4[2], offset);
      w6[3] = hc_bytealign (w4[0], w4[1], offset);
      w6[2] = hc_bytealign (w3[3], w4[0], offset);
      w6[1] = hc_bytealign (w3[2], w3[3], offset);
      w6[0] = hc_bytealign (w3[1], w3[2], offset);
      w5[3] = hc_bytealign (w3[0], w3[1], offset);
      w5[2] = hc_bytealign (w2[3], w3[0], offset);
      w5[1] = hc_bytealign (w2[2], w2[3], offset);
      w5[0] = hc_bytealign (w2[1], w2[2], offset);
      w4[3] = hc_bytealign (w2[0], w2[1], offset);
      w4[2] = hc_bytealign (w1[3], w2[0], offset);
      w4[1] = hc_bytealign (w1[2], w1[3], offset);
      w4[0] = hc_bytealign (w1[1], w1[2], offset);
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w7[3] = hc_bytealign (w4[3], w5[0], offset);
      w7[2] = hc_bytealign (w4[2], w4[3], offset);
      w7[1] = hc_bytealign (w4[1], w4[2], offset);
      w7[0] = hc_bytealign (w4[0], w4[1], offset);
      w6[3] = hc_bytealign (w3[3], w4[0], offset);
      w6[2] = hc_bytealign (w3[2], w3[3], offset);
      w6[1] = hc_bytealign (w3[1], w3[2], offset);
      w6[0] = hc_bytealign (w3[0], w3[1], offset);
      w5[3] = hc_bytealign (w2[3], w3[0], offset);
      w5[2] = hc_bytealign (w2[2], w2[3], offset);
      w5[1] = hc_bytealign (w2[1], w2[2], offset);
      w5[0] = hc_bytealign (w2[0], w2[1], offset);
      w4[3] = hc_bytealign (w1[3], w2[0], offset);
      w4[2] = hc_bytealign (w1[2], w1[3], offset);
      w4[1] = hc_bytealign (w1[1], w1[2], offset);
      w4[0] = hc_bytealign (w1[0], w1[1], offset);
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w7[3] = hc_bytealign (w4[2], w4[3], offset);
      w7[2] = hc_bytealign (w4[1], w4[2], offset);
      w7[1] = hc_bytealign (w4[0], w4[1], offset);
      w7[0] = hc_bytealign (w3[3], w4[0], offset);
      w6[3] = hc_bytealign (w3[2], w3[3], offset);
      w6[2] = hc_bytealign (w3[1], w3[2], offset);
      w6[1] = hc_bytealign (w3[0], w3[1], offset);
      w6[0] = hc_bytealign (w2[3], w3[0], offset);
      w5[3] = hc_bytealign (w2[2], w2[3], offset);
      w5[2] = hc_bytealign (w2[1], w2[2], offset);
      w5[1] = hc_bytealign (w2[0], w2[1], offset);
      w5[0] = hc_bytealign (w1[3], w2[0], offset);
      w4[3] = hc_bytealign (w1[2], w1[3], offset);
      w4[2] = hc_bytealign (w1[1], w1[2], offset);
      w4[1] = hc_bytealign (w1[0], w1[1], offset);
      w4[0] = hc_bytealign (w0[3], w1[0], offset);
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w7[3] = hc_bytealign (w4[1], w4[2], offset);
      w7[2] = hc_bytealign (w4[0], w4[1], offset);
      w7[1] = hc_bytealign (w3[3], w4[0], offset);
      w7[0] = hc_bytealign (w3[2], w3[3], offset);
      w6[3] = hc_bytealign (w3[1], w3[2], offset);
      w6[2] = hc_bytealign (w3[0], w3[1], offset);
      w6[1] = hc_bytealign (w2[3], w3[0], offset);
      w6[0] = hc_bytealign (w2[2], w2[3], offset);
      w5[3] = hc_bytealign (w2[1], w2[2], offset);
      w5[2] = hc_bytealign (w2[0], w2[1], offset);
      w5[1] = hc_bytealign (w1[3], w2[0], offset);
      w5[0] = hc_bytealign (w1[2], w1[3], offset);
      w4[3] = hc_bytealign (w1[1], w1[2], offset);
      w4[2] = hc_bytealign (w1[0], w1[1], offset);
      w4[1] = hc_bytealign (w0[3], w1[0], offset);
      w4[0] = hc_bytealign (w0[2], w0[3], offset);
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w7[3] = hc_bytealign (w4[0], w4[1], offset);
      w7[2] = hc_bytealign (w3[3], w4[0], offset);
      w7[1] = hc_bytealign (w3[2], w3[3], offset);
      w7[0] = hc_bytealign (w3[1], w3[2], offset);
      w6[3] = hc_bytealign (w3[0], w3[1], offset);
      w6[2] = hc_bytealign (w2[3], w3[0], offset);
      w6[1] = hc_bytealign (w2[2], w2[3], offset);
      w6[0] = hc_bytealign (w2[1], w2[2], offset);
      w5[3] = hc_bytealign (w2[0], w2[1], offset);
      w5[2] = hc_bytealign (w1[3], w2[0], offset);
      w5[1] = hc_bytealign (w1[2], w1[3], offset);
      w5[0] = hc_bytealign (w1[1], w1[2], offset);
      w4[3] = hc_bytealign (w1[0], w1[1], offset);
      w4[2] = hc_bytealign (w0[3], w1[0], offset);
      w4[1] = hc_bytealign (w0[2], w0[3], offset);
      w4[0] = hc_bytealign (w0[1], w0[2], offset);
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w7[3] = hc_bytealign (w3[3], w4[0], offset);
      w7[2] = hc_bytealign (w3[2], w3[3], offset);
      w7[1] = hc_bytealign (w3[1], w3[2], offset);
      w7[0] = hc_bytealign (w3[0], w3[1], offset);
      w6[3] = hc_bytealign (w2[3], w3[0], offset);
      w6[2] = hc_bytealign (w2[2], w2[3], offset);
      w6[1] = hc_bytealign (w2[1], w2[2], offset);
      w6[0] = hc_bytealign (w2[0], w2[1], offset);
      w5[3] = hc_bytealign (w1[3], w2[0], offset);
      w5[2] = hc_bytealign (w1[2], w1[3], offset);
      w5[1] = hc_bytealign (w1[1], w1[2], offset);
      w5[0] = hc_bytealign (w1[0], w1[1], offset);
      w4[3] = hc_bytealign (w0[3], w1[0], offset);
      w4[2] = hc_bytealign (w0[2], w0[3], offset);
      w4[1] = hc_bytealign (w0[1], w0[2], offset);
      w4[0] = hc_bytealign (w0[0], w0[1], offset);
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      w7[3] = hc_bytealign (w3[2], w3[3], offset);
      w7[2] = hc_bytealign (w3[1], w3[2], offset);
      w7[1] = hc_bytealign (w3[0], w3[1], offset);
      w7[0] = hc_bytealign (w2[3], w3[0], offset);
      w6[3] = hc_bytealign (w2[2], w2[3], offset);
      w6[2] = hc_bytealign (w2[1], w2[2], offset);
      w6[1] = hc_bytealign (w2[0], w2[1], offset);
      w6[0] = hc_bytealign (w1[3], w2[0], offset);
      w5[3] = hc_bytealign (w1[2], w1[3], offset);
      w5[2] = hc_bytealign (w1[1], w1[2], offset);
      w5[1] = hc_bytealign (w1[0], w1[1], offset);
      w5[0] = hc_bytealign (w0[3], w1[0], offset);
      w4[3] = hc_bytealign (w0[2], w0[3], offset);
      w4[2] = hc_bytealign (w0[1], w0[2], offset);
      w4[1] = hc_bytealign (w0[0], w0[1], offset);
      w4[0] = hc_bytealign (    0, w0[0], offset);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      w7[3] = hc_bytealign (w3[1], w3[2], offset);
      w7[2] = hc_bytealign (w3[0], w3[1], offset);
      w7[1] = hc_bytealign (w2[3], w3[0], offset);
      w7[0] = hc_bytealign (w2[2], w2[3], offset);
      w6[3] = hc_bytealign (w2[1], w2[2], offset);
      w6[2] = hc_bytealign (w2[0], w2[1], offset);
      w6[1] = hc_bytealign (w1[3], w2[0], offset);
      w6[0] = hc_bytealign (w1[2], w1[3], offset);
      w5[3] = hc_bytealign (w1[1], w1[2], offset);
      w5[2] = hc_bytealign (w1[0], w1[1], offset);
      w5[1] = hc_bytealign (w0[3], w1[0], offset);
      w5[0] = hc_bytealign (w0[2], w0[3], offset);
      w4[3] = hc_bytealign (w0[1], w0[2], offset);
      w4[2] = hc_bytealign (w0[0], w0[1], offset);
      w4[1] = hc_bytealign (    0, w0[0], offset);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      w7[3] = hc_bytealign (w3[0], w3[1], offset);
      w7[2] = hc_bytealign (w2[3], w3[0], offset);
      w7[1] = hc_bytealign (w2[2], w2[3], offset);
      w7[0] = hc_bytealign (w2[1], w2[2], offset);
      w6[3] = hc_bytealign (w2[0], w2[1], offset);
      w6[2] = hc_bytealign (w1[3], w2[0], offset);
      w6[1] = hc_bytealign (w1[2], w1[3], offset);
      w6[0] = hc_bytealign (w1[1], w1[2], offset);
      w5[3] = hc_bytealign (w1[0], w1[1], offset);
      w5[2] = hc_bytealign (w0[3], w1[0], offset);
      w5[1] = hc_bytealign (w0[2], w0[3], offset);
      w5[0] = hc_bytealign (w0[1], w0[2], offset);
      w4[3] = hc_bytealign (w0[0], w0[1], offset);
      w4[2] = hc_bytealign (    0, w0[0], offset);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      w7[3] = hc_bytealign (w2[3], w3[0], offset);
      w7[2] = hc_bytealign (w2[2], w2[3], offset);
      w7[1] = hc_bytealign (w2[1], w2[2], offset);
      w7[0] = hc_bytealign (w2[0], w2[1], offset);
      w6[3] = hc_bytealign (w1[3], w2[0], offset);
      w6[2] = hc_bytealign (w1[2], w1[3], offset);
      w6[1] = hc_bytealign (w1[1], w1[2], offset);
      w6[0] = hc_bytealign (w1[0], w1[1], offset);
      w5[3] = hc_bytealign (w0[3], w1[0], offset);
      w5[2] = hc_bytealign (w0[2], w0[3], offset);
      w5[1] = hc_bytealign (w0[1], w0[2], offset);
      w5[0] = hc_bytealign (w0[0], w0[1], offset);
      w4[3] = hc_bytealign (    0, w0[0], offset);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      w7[3] = hc_bytealign (w2[2], w2[3], offset);
      w7[2] = hc_bytealign (w2[1], w2[2], offset);
      w7[1] = hc_bytealign (w2[0], w2[1], offset);
      w7[0] = hc_bytealign (w1[3], w2[0], offset);
      w6[3] = hc_bytealign (w1[2], w1[3], offset);
      w6[2] = hc_bytealign (w1[1], w1[2], offset);
      w6[1] = hc_bytealign (w1[0], w1[1], offset);
      w6[0] = hc_bytealign (w0[3], w1[0], offset);
      w5[3] = hc_bytealign (w0[2], w0[3], offset);
      w5[2] = hc_bytealign (w0[1], w0[2], offset);
      w5[1] = hc_bytealign (w0[0], w0[1], offset);
      w5[0] = hc_bytealign (    0, w0[0], offset);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      w7[3] = hc_bytealign (w2[1], w2[2], offset);
      w7[2] = hc_bytealign (w2[0], w2[1], offset);
      w7[1] = hc_bytealign (w1[3], w2[0], offset);
      w7[0] = hc_bytealign (w1[2], w1[3], offset);
      w6[3] = hc_bytealign (w1[1], w1[2], offset);
      w6[2] = hc_bytealign (w1[0], w1[1], offset);
      w6[1] = hc_bytealign (w0[3], w1[0], offset);
      w6[0] = hc_bytealign (w0[2], w0[3], offset);
      w5[3] = hc_bytealign (w0[1], w0[2], offset);
      w5[2] = hc_bytealign (w0[0], w0[1], offset);
      w5[1] = hc_bytealign (    0, w0[0], offset);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      w7[3] = hc_bytealign (w2[0], w2[1], offset);
      w7[2] = hc_bytealign (w1[3], w2[0], offset);
      w7[1] = hc_bytealign (w1[2], w1[3], offset);
      w7[0] = hc_bytealign (w1[1], w1[2], offset);
      w6[3] = hc_bytealign (w1[0], w1[1], offset);
      w6[2] = hc_bytealign (w0[3], w1[0], offset);
      w6[1] = hc_bytealign (w0[2], w0[3], offset);
      w6[0] = hc_bytealign (w0[1], w0[2], offset);
      w5[3] = hc_bytealign (w0[0], w0[1], offset);
      w5[2] = hc_bytealign (    0, w0[0], offset);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      w7[3] = hc_bytealign (w1[3], w2[0], offset);
      w7[2] = hc_bytealign (w1[2], w1[3], offset);
      w7[1] = hc_bytealign (w1[1], w1[2], offset);
      w7[0] = hc_bytealign (w1[0], w1[1], offset);
      w6[3] = hc_bytealign (w0[3], w1[0], offset);
      w6[2] = hc_bytealign (w0[2], w0[3], offset);
      w6[1] = hc_bytealign (w0[1], w0[2], offset);
      w6[0] = hc_bytealign (w0[0], w0[1], offset);
      w5[3] = hc_bytealign (    0, w0[0], offset);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      w7[3] = hc_bytealign (w1[2], w1[3], offset);
      w7[2] = hc_bytealign (w1[1], w1[2], offset);
      w7[1] = hc_bytealign (w1[0], w1[1], offset);
      w7[0] = hc_bytealign (w0[3], w1[0], offset);
      w6[3] = hc_bytealign (w0[2], w0[3], offset);
      w6[2] = hc_bytealign (w0[1], w0[2], offset);
      w6[1] = hc_bytealign (w0[0], w0[1], offset);
      w6[0] = hc_bytealign (    0, w0[0], offset);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      w7[3] = hc_bytealign (w1[1], w1[2], offset);
      w7[2] = hc_bytealign (w1[0], w1[1], offset);
      w7[1] = hc_bytealign (w0[3], w1[0], offset);
      w7[0] = hc_bytealign (w0[2], w0[3], offset);
      w6[3] = hc_bytealign (w0[1], w0[2], offset);
      w6[2] = hc_bytealign (w0[0], w0[1], offset);
      w6[1] = hc_bytealign (    0, w0[0], offset);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      w7[3] = hc_bytealign (w1[0], w1[1], offset);
      w7[2] = hc_bytealign (w0[3], w1[0], offset);
      w7[1] = hc_bytealign (w0[2], w0[3], offset);
      w7[0] = hc_bytealign (w0[1], w0[2], offset);
      w6[3] = hc_bytealign (w0[0], w0[1], offset);
      w6[2] = hc_bytealign (    0, w0[0], offset);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      w7[3] = hc_bytealign (w0[3], w1[0], offset);
      w7[2] = hc_bytealign (w0[2], w0[3], offset);
      w7[1] = hc_bytealign (w0[1], w0[2], offset);
      w7[0] = hc_bytealign (w0[0], w0[1], offset);
      w6[3] = hc_bytealign (    0, w0[0], offset);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      w7[3] = hc_bytealign (w0[2], w0[3], offset);
      w7[2] = hc_bytealign (w0[1], w0[2], offset);
      w7[1] = hc_bytealign (w0[0], w0[1], offset);
      w7[0] = hc_bytealign (    0, w0[0], offset);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      w7[3] = hc_bytealign (w0[1], w0[2], offset);
      w7[2] = hc_bytealign (w0[0], w0[1], offset);
      w7[1] = hc_bytealign (    0, w0[0], offset);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      w7[3] = hc_bytealign (w0[0], w0[1], offset);
      w7[2] = hc_bytealign (    0, w0[0], offset);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      w7[3] = hc_bytealign (    0, w0[0], offset);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV

  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  #if defined IS_NV
  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
  #endif

  #if (defined IS_AMD || defined IS_HIP)
  const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
  #endif

  switch (offset_switch)
  {
    case 0:
      w7[3] = hc_byte_perm (w7[2], w7[3], selector);
      w7[2] = hc_byte_perm (w7[1], w7[2], selector);
      w7[1] = hc_byte_perm (w7[0], w7[1], selector);
      w7[0] = hc_byte_perm (w6[3], w7[0], selector);
      w6[3] = hc_byte_perm (w6[2], w6[3], selector);
      w6[2] = hc_byte_perm (w6[1], w6[2], selector);
      w6[1] = hc_byte_perm (w6[0], w6[1], selector);
      w6[0] = hc_byte_perm (w5[3], w6[0], selector);
      w5[3] = hc_byte_perm (w5[2], w5[3], selector);
      w5[2] = hc_byte_perm (w5[1], w5[2], selector);
      w5[1] = hc_byte_perm (w5[0], w5[1], selector);
      w5[0] = hc_byte_perm (w4[3], w5[0], selector);
      w4[3] = hc_byte_perm (w4[2], w4[3], selector);
      w4[2] = hc_byte_perm (w4[1], w4[2], selector);
      w4[1] = hc_byte_perm (w4[0], w4[1], selector);
      w4[0] = hc_byte_perm (w3[3], w4[0], selector);
      w3[3] = hc_byte_perm (w3[2], w3[3], selector);
      w3[2] = hc_byte_perm (w3[1], w3[2], selector);
      w3[1] = hc_byte_perm (w3[0], w3[1], selector);
      w3[0] = hc_byte_perm (w2[3], w3[0], selector);
      w2[3] = hc_byte_perm (w2[2], w2[3], selector);
      w2[2] = hc_byte_perm (w2[1], w2[2], selector);
      w2[1] = hc_byte_perm (w2[0], w2[1], selector);
      w2[0] = hc_byte_perm (w1[3], w2[0], selector);
      w1[3] = hc_byte_perm (w1[2], w1[3], selector);
      w1[2] = hc_byte_perm (w1[1], w1[2], selector);
      w1[1] = hc_byte_perm (w1[0], w1[1], selector);
      w1[0] = hc_byte_perm (w0[3], w1[0], selector);
      w0[3] = hc_byte_perm (w0[2], w0[3], selector);
      w0[2] = hc_byte_perm (w0[1], w0[2], selector);
      w0[1] = hc_byte_perm (w0[0], w0[1], selector);
      w0[0] = hc_byte_perm (    0, w0[0], selector);
      break;

    case 1:
      w7[3] = hc_byte_perm (w7[1], w7[2], selector);
      w7[2] = hc_byte_perm (w7[0], w7[1], selector);
      w7[1] = hc_byte_perm (w6[3], w7[0], selector);
      w7[0] = hc_byte_perm (w6[2], w6[3], selector);
      w6[3] = hc_byte_perm (w6[1], w6[2], selector);
      w6[2] = hc_byte_perm (w6[0], w6[1], selector);
      w6[1] = hc_byte_perm (w5[3], w6[0], selector);
      w6[0] = hc_byte_perm (w5[2], w5[3], selector);
      w5[3] = hc_byte_perm (w5[1], w5[2], selector);
      w5[2] = hc_byte_perm (w5[0], w5[1], selector);
      w5[1] = hc_byte_perm (w4[3], w5[0], selector);
      w5[0] = hc_byte_perm (w4[2], w4[3], selector);
      w4[3] = hc_byte_perm (w4[1], w4[2], selector);
      w4[2] = hc_byte_perm (w4[0], w4[1], selector);
      w4[1] = hc_byte_perm (w3[3], w4[0], selector);
      w4[0] = hc_byte_perm (w3[2], w3[3], selector);
      w3[3] = hc_byte_perm (w3[1], w3[2], selector);
      w3[2] = hc_byte_perm (w3[0], w3[1], selector);
      w3[1] = hc_byte_perm (w2[3], w3[0], selector);
      w3[0] = hc_byte_perm (w2[2], w2[3], selector);
      w2[3] = hc_byte_perm (w2[1], w2[2], selector);
      w2[2] = hc_byte_perm (w2[0], w2[1], selector);
      w2[1] = hc_byte_perm (w1[3], w2[0], selector);
      w2[0] = hc_byte_perm (w1[2], w1[3], selector);
      w1[3] = hc_byte_perm (w1[1], w1[2], selector);
      w1[2] = hc_byte_perm (w1[0], w1[1], selector);
      w1[1] = hc_byte_perm (w0[3], w1[0], selector);
      w1[0] = hc_byte_perm (w0[2], w0[3], selector);
      w0[3] = hc_byte_perm (w0[1], w0[2], selector);
      w0[2] = hc_byte_perm (w0[0], w0[1], selector);
      w0[1] = hc_byte_perm (    0, w0[0], selector);
      w0[0] = 0;
      break;

    case 2:
      w7[3] = hc_byte_perm (w7[0], w7[1], selector);
      w7[2] = hc_byte_perm (w6[3], w7[0], selector);
      w7[1] = hc_byte_perm (w6[2], w6[3], selector);
      w7[0] = hc_byte_perm (w6[1], w6[2], selector);
      w6[3] = hc_byte_perm (w6[0], w6[1], selector);
      w6[2] = hc_byte_perm (w5[3], w6[0], selector);
      w6[1] = hc_byte_perm (w5[2], w5[3], selector);
      w6[0] = hc_byte_perm (w5[1], w5[2], selector);
      w5[3] = hc_byte_perm (w5[0], w5[1], selector);
      w5[2] = hc_byte_perm (w4[3], w5[0], selector);
      w5[1] = hc_byte_perm (w4[2], w4[3], selector);
      w5[0] = hc_byte_perm (w4[1], w4[2], selector);
      w4[3] = hc_byte_perm (w4[0], w4[1], selector);
      w4[2] = hc_byte_perm (w3[3], w4[0], selector);
      w4[1] = hc_byte_perm (w3[2], w3[3], selector);
      w4[0] = hc_byte_perm (w3[1], w3[2], selector);
      w3[3] = hc_byte_perm (w3[0], w3[1], selector);
      w3[2] = hc_byte_perm (w2[3], w3[0], selector);
      w3[1] = hc_byte_perm (w2[2], w2[3], selector);
      w3[0] = hc_byte_perm (w2[1], w2[2], selector);
      w2[3] = hc_byte_perm (w2[0], w2[1], selector);
      w2[2] = hc_byte_perm (w1[3], w2[0], selector);
      w2[1] = hc_byte_perm (w1[2], w1[3], selector);
      w2[0] = hc_byte_perm (w1[1], w1[2], selector);
      w1[3] = hc_byte_perm (w1[0], w1[1], selector);
      w1[2] = hc_byte_perm (w0[3], w1[0], selector);
      w1[1] = hc_byte_perm (w0[2], w0[3], selector);
      w1[0] = hc_byte_perm (w0[1], w0[2], selector);
      w0[3] = hc_byte_perm (w0[0], w0[1], selector);
      w0[2] = hc_byte_perm (    0, w0[0], selector);
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 3:
      w7[3] = hc_byte_perm (w6[3], w7[0], selector);
      w7[2] = hc_byte_perm (w6[2], w6[3], selector);
      w7[1] = hc_byte_perm (w6[1], w6[2], selector);
      w7[0] = hc_byte_perm (w6[0], w6[1], selector);
      w6[3] = hc_byte_perm (w5[3], w6[0], selector);
      w6[2] = hc_byte_perm (w5[2], w5[3], selector);
      w6[1] = hc_byte_perm (w5[1], w5[2], selector);
      w6[0] = hc_byte_perm (w5[0], w5[1], selector);
      w5[3] = hc_byte_perm (w4[3], w5[0], selector);
      w5[2] = hc_byte_perm (w4[2], w4[3], selector);
      w5[1] = hc_byte_perm (w4[1], w4[2], selector);
      w5[0] = hc_byte_perm (w4[0], w4[1], selector);
      w4[3] = hc_byte_perm (w3[3], w4[0], selector);
      w4[2] = hc_byte_perm (w3[2], w3[3], selector);
      w4[1] = hc_byte_perm (w3[1], w3[2], selector);
      w4[0] = hc_byte_perm (w3[0], w3[1], selector);
      w3[3] = hc_byte_perm (w2[3], w3[0], selector);
      w3[2] = hc_byte_perm (w2[2], w2[3], selector);
      w3[1] = hc_byte_perm (w2[1], w2[2], selector);
      w3[0] = hc_byte_perm (w2[0], w2[1], selector);
      w2[3] = hc_byte_perm (w1[3], w2[0], selector);
      w2[2] = hc_byte_perm (w1[2], w1[3], selector);
      w2[1] = hc_byte_perm (w1[1], w1[2], selector);
      w2[0] = hc_byte_perm (w1[0], w1[1], selector);
      w1[3] = hc_byte_perm (w0[3], w1[0], selector);
      w1[2] = hc_byte_perm (w0[2], w0[3], selector);
      w1[1] = hc_byte_perm (w0[1], w0[2], selector);
      w1[0] = hc_byte_perm (w0[0], w0[1], selector);
      w0[3] = hc_byte_perm (    0, w0[0], selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 4:
      w7[3] = hc_byte_perm (w6[2], w6[3], selector);
      w7[2] = hc_byte_perm (w6[1], w6[2], selector);
      w7[1] = hc_byte_perm (w6[0], w6[1], selector);
      w7[0] = hc_byte_perm (w5[3], w6[0], selector);
      w6[3] = hc_byte_perm (w5[2], w5[3], selector);
      w6[2] = hc_byte_perm (w5[1], w5[2], selector);
      w6[1] = hc_byte_perm (w5[0], w5[1], selector);
      w6[0] = hc_byte_perm (w4[3], w5[0], selector);
      w5[3] = hc_byte_perm (w4[2], w4[3], selector);
      w5[2] = hc_byte_perm (w4[1], w4[2], selector);
      w5[1] = hc_byte_perm (w4[0], w4[1], selector);
      w5[0] = hc_byte_perm (w3[3], w4[0], selector);
      w4[3] = hc_byte_perm (w3[2], w3[3], selector);
      w4[2] = hc_byte_perm (w3[1], w3[2], selector);
      w4[1] = hc_byte_perm (w3[0], w3[1], selector);
      w4[0] = hc_byte_perm (w2[3], w3[0], selector);
      w3[3] = hc_byte_perm (w2[2], w2[3], selector);
      w3[2] = hc_byte_perm (w2[1], w2[2], selector);
      w3[1] = hc_byte_perm (w2[0], w2[1], selector);
      w3[0] = hc_byte_perm (w1[3], w2[0], selector);
      w2[3] = hc_byte_perm (w1[2], w1[3], selector);
      w2[2] = hc_byte_perm (w1[1], w1[2], selector);
      w2[1] = hc_byte_perm (w1[0], w1[1], selector);
      w2[0] = hc_byte_perm (w0[3], w1[0], selector);
      w1[3] = hc_byte_perm (w0[2], w0[3], selector);
      w1[2] = hc_byte_perm (w0[1], w0[2], selector);
      w1[1] = hc_byte_perm (w0[0], w0[1], selector);
      w1[0] = hc_byte_perm (    0, w0[0], selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 5:
      w7[3] = hc_byte_perm (w6[1], w6[2], selector);
      w7[2] = hc_byte_perm (w6[0], w6[1], selector);
      w7[1] = hc_byte_perm (w5[3], w6[0], selector);
      w7[0] = hc_byte_perm (w5[2], w5[3], selector);
      w6[3] = hc_byte_perm (w5[1], w5[2], selector);
      w6[2] = hc_byte_perm (w5[0], w5[1], selector);
      w6[1] = hc_byte_perm (w4[3], w5[0], selector);
      w6[0] = hc_byte_perm (w4[2], w4[3], selector);
      w5[3] = hc_byte_perm (w4[1], w4[2], selector);
      w5[2] = hc_byte_perm (w4[0], w4[1], selector);
      w5[1] = hc_byte_perm (w3[3], w4[0], selector);
      w5[0] = hc_byte_perm (w3[2], w3[3], selector);
      w4[3] = hc_byte_perm (w3[1], w3[2], selector);
      w4[2] = hc_byte_perm (w3[0], w3[1], selector);
      w4[1] = hc_byte_perm (w2[3], w3[0], selector);
      w4[0] = hc_byte_perm (w2[2], w2[3], selector);
      w3[3] = hc_byte_perm (w2[1], w2[2], selector);
      w3[2] = hc_byte_perm (w2[0], w2[1], selector);
      w3[1] = hc_byte_perm (w1[3], w2[0], selector);
      w3[0] = hc_byte_perm (w1[2], w1[3], selector);
      w2[3] = hc_byte_perm (w1[1], w1[2], selector);
      w2[2] = hc_byte_perm (w1[0], w1[1], selector);
      w2[1] = hc_byte_perm (w0[3], w1[0], selector);
      w2[0] = hc_byte_perm (w0[2], w0[3], selector);
      w1[3] = hc_byte_perm (w0[1], w0[2], selector);
      w1[2] = hc_byte_perm (w0[0], w0[1], selector);
      w1[1] = hc_byte_perm (    0, w0[0], selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 6:
      w7[3] = hc_byte_perm (w6[0], w6[1], selector);
      w7[2] = hc_byte_perm (w5[3], w6[0], selector);
      w7[1] = hc_byte_perm (w5[2], w5[3], selector);
      w7[0] = hc_byte_perm (w5[1], w5[2], selector);
      w6[3] = hc_byte_perm (w5[0], w5[1], selector);
      w6[2] = hc_byte_perm (w4[3], w5[0], selector);
      w6[1] = hc_byte_perm (w4[2], w4[3], selector);
      w6[0] = hc_byte_perm (w4[1], w4[2], selector);
      w5[3] = hc_byte_perm (w4[0], w4[1], selector);
      w5[2] = hc_byte_perm (w3[3], w4[0], selector);
      w5[1] = hc_byte_perm (w3[2], w3[3], selector);
      w5[0] = hc_byte_perm (w3[1], w3[2], selector);
      w4[3] = hc_byte_perm (w3[0], w3[1], selector);
      w4[2] = hc_byte_perm (w2[3], w3[0], selector);
      w4[1] = hc_byte_perm (w2[2], w2[3], selector);
      w4[0] = hc_byte_perm (w2[1], w2[2], selector);
      w3[3] = hc_byte_perm (w2[0], w2[1], selector);
      w3[2] = hc_byte_perm (w1[3], w2[0], selector);
      w3[1] = hc_byte_perm (w1[2], w1[3], selector);
      w3[0] = hc_byte_perm (w1[1], w1[2], selector);
      w2[3] = hc_byte_perm (w1[0], w1[1], selector);
      w2[2] = hc_byte_perm (w0[3], w1[0], selector);
      w2[1] = hc_byte_perm (w0[2], w0[3], selector);
      w2[0] = hc_byte_perm (w0[1], w0[2], selector);
      w1[3] = hc_byte_perm (w0[0], w0[1], selector);
      w1[2] = hc_byte_perm (    0, w0[0], selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 7:
      w7[3] = hc_byte_perm (w5[3], w6[0], selector);
      w7[2] = hc_byte_perm (w5[2], w5[3], selector);
      w7[1] = hc_byte_perm (w5[1], w5[2], selector);
      w7[0] = hc_byte_perm (w5[0], w5[1], selector);
      w6[3] = hc_byte_perm (w4[3], w5[0], selector);
      w6[2] = hc_byte_perm (w4[2], w4[3], selector);
      w6[1] = hc_byte_perm (w4[1], w4[2], selector);
      w6[0] = hc_byte_perm (w4[0], w4[1], selector);
      w5[3] = hc_byte_perm (w3[3], w4[0], selector);
      w5[2] = hc_byte_perm (w3[2], w3[3], selector);
      w5[1] = hc_byte_perm (w3[1], w3[2], selector);
      w5[0] = hc_byte_perm (w3[0], w3[1], selector);
      w4[3] = hc_byte_perm (w2[3], w3[0], selector);
      w4[2] = hc_byte_perm (w2[2], w2[3], selector);
      w4[1] = hc_byte_perm (w2[1], w2[2], selector);
      w4[0] = hc_byte_perm (w2[0], w2[1], selector);
      w3[3] = hc_byte_perm (w1[3], w2[0], selector);
      w3[2] = hc_byte_perm (w1[2], w1[3], selector);
      w3[1] = hc_byte_perm (w1[1], w1[2], selector);
      w3[0] = hc_byte_perm (w1[0], w1[1], selector);
      w2[3] = hc_byte_perm (w0[3], w1[0], selector);
      w2[2] = hc_byte_perm (w0[2], w0[3], selector);
      w2[1] = hc_byte_perm (w0[1], w0[2], selector);
      w2[0] = hc_byte_perm (w0[0], w0[1], selector);
      w1[3] = hc_byte_perm (    0, w0[0], selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 8:
      w7[3] = hc_byte_perm (w5[2], w5[3], selector);
      w7[2] = hc_byte_perm (w5[1], w5[2], selector);
      w7[1] = hc_byte_perm (w5[0], w5[1], selector);
      w7[0] = hc_byte_perm (w4[3], w5[0], selector);
      w6[3] = hc_byte_perm (w4[2], w4[3], selector);
      w6[2] = hc_byte_perm (w4[1], w4[2], selector);
      w6[1] = hc_byte_perm (w4[0], w4[1], selector);
      w6[0] = hc_byte_perm (w3[3], w4[0], selector);
      w5[3] = hc_byte_perm (w3[2], w3[3], selector);
      w5[2] = hc_byte_perm (w3[1], w3[2], selector);
      w5[1] = hc_byte_perm (w3[0], w3[1], selector);
      w5[0] = hc_byte_perm (w2[3], w3[0], selector);
      w4[3] = hc_byte_perm (w2[2], w2[3], selector);
      w4[2] = hc_byte_perm (w2[1], w2[2], selector);
      w4[1] = hc_byte_perm (w2[0], w2[1], selector);
      w4[0] = hc_byte_perm (w1[3], w2[0], selector);
      w3[3] = hc_byte_perm (w1[2], w1[3], selector);
      w3[2] = hc_byte_perm (w1[1], w1[2], selector);
      w3[1] = hc_byte_perm (w1[0], w1[1], selector);
      w3[0] = hc_byte_perm (w0[3], w1[0], selector);
      w2[3] = hc_byte_perm (w0[2], w0[3], selector);
      w2[2] = hc_byte_perm (w0[1], w0[2], selector);
      w2[1] = hc_byte_perm (w0[0], w0[1], selector);
      w2[0] = hc_byte_perm (    0, w0[0], selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 9:
      w7[3] = hc_byte_perm (w5[1], w5[2], selector);
      w7[2] = hc_byte_perm (w5[0], w5[1], selector);
      w7[1] = hc_byte_perm (w4[3], w5[0], selector);
      w7[0] = hc_byte_perm (w4[2], w4[3], selector);
      w6[3] = hc_byte_perm (w4[1], w4[2], selector);
      w6[2] = hc_byte_perm (w4[0], w4[1], selector);
      w6[1] = hc_byte_perm (w3[3], w4[0], selector);
      w6[0] = hc_byte_perm (w3[2], w3[3], selector);
      w5[3] = hc_byte_perm (w3[1], w3[2], selector);
      w5[2] = hc_byte_perm (w3[0], w3[1], selector);
      w5[1] = hc_byte_perm (w2[3], w3[0], selector);
      w5[0] = hc_byte_perm (w2[2], w2[3], selector);
      w4[3] = hc_byte_perm (w2[1], w2[2], selector);
      w4[2] = hc_byte_perm (w2[0], w2[1], selector);
      w4[1] = hc_byte_perm (w1[3], w2[0], selector);
      w4[0] = hc_byte_perm (w1[2], w1[3], selector);
      w3[3] = hc_byte_perm (w1[1], w1[2], selector);
      w3[2] = hc_byte_perm (w1[0], w1[1], selector);
      w3[1] = hc_byte_perm (w0[3], w1[0], selector);
      w3[0] = hc_byte_perm (w0[2], w0[3], selector);
      w2[3] = hc_byte_perm (w0[1], w0[2], selector);
      w2[2] = hc_byte_perm (w0[0], w0[1], selector);
      w2[1] = hc_byte_perm (    0, w0[0], selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 10:
      w7[3] = hc_byte_perm (w5[0], w5[1], selector);
      w7[2] = hc_byte_perm (w4[3], w5[0], selector);
      w7[1] = hc_byte_perm (w4[2], w4[3], selector);
      w7[0] = hc_byte_perm (w4[1], w4[2], selector);
      w6[3] = hc_byte_perm (w4[0], w4[1], selector);
      w6[2] = hc_byte_perm (w3[3], w4[0], selector);
      w6[1] = hc_byte_perm (w3[2], w3[3], selector);
      w6[0] = hc_byte_perm (w3[1], w3[2], selector);
      w5[3] = hc_byte_perm (w3[0], w3[1], selector);
      w5[2] = hc_byte_perm (w2[3], w3[0], selector);
      w5[1] = hc_byte_perm (w2[2], w2[3], selector);
      w5[0] = hc_byte_perm (w2[1], w2[2], selector);
      w4[3] = hc_byte_perm (w2[0], w2[1], selector);
      w4[2] = hc_byte_perm (w1[3], w2[0], selector);
      w4[1] = hc_byte_perm (w1[2], w1[3], selector);
      w4[0] = hc_byte_perm (w1[1], w1[2], selector);
      w3[3] = hc_byte_perm (w1[0], w1[1], selector);
      w3[2] = hc_byte_perm (w0[3], w1[0], selector);
      w3[1] = hc_byte_perm (w0[2], w0[3], selector);
      w3[0] = hc_byte_perm (w0[1], w0[2], selector);
      w2[3] = hc_byte_perm (w0[0], w0[1], selector);
      w2[2] = hc_byte_perm (    0, w0[0], selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 11:
      w7[3] = hc_byte_perm (w4[3], w5[0], selector);
      w7[2] = hc_byte_perm (w4[2], w4[3], selector);
      w7[1] = hc_byte_perm (w4[1], w4[2], selector);
      w7[0] = hc_byte_perm (w4[0], w4[1], selector);
      w6[3] = hc_byte_perm (w3[3], w4[0], selector);
      w6[2] = hc_byte_perm (w3[2], w3[3], selector);
      w6[1] = hc_byte_perm (w3[1], w3[2], selector);
      w6[0] = hc_byte_perm (w3[0], w3[1], selector);
      w5[3] = hc_byte_perm (w2[3], w3[0], selector);
      w5[2] = hc_byte_perm (w2[2], w2[3], selector);
      w5[1] = hc_byte_perm (w2[1], w2[2], selector);
      w5[0] = hc_byte_perm (w2[0], w2[1], selector);
      w4[3] = hc_byte_perm (w1[3], w2[0], selector);
      w4[2] = hc_byte_perm (w1[2], w1[3], selector);
      w4[1] = hc_byte_perm (w1[1], w1[2], selector);
      w4[0] = hc_byte_perm (w1[0], w1[1], selector);
      w3[3] = hc_byte_perm (w0[3], w1[0], selector);
      w3[2] = hc_byte_perm (w0[2], w0[3], selector);
      w3[1] = hc_byte_perm (w0[1], w0[2], selector);
      w3[0] = hc_byte_perm (w0[0], w0[1], selector);
      w2[3] = hc_byte_perm (    0, w0[0], selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 12:
      w7[3] = hc_byte_perm (w4[2], w4[3], selector);
      w7[2] = hc_byte_perm (w4[1], w4[2], selector);
      w7[1] = hc_byte_perm (w4[0], w4[1], selector);
      w7[0] = hc_byte_perm (w3[3], w4[0], selector);
      w6[3] = hc_byte_perm (w3[2], w3[3], selector);
      w6[2] = hc_byte_perm (w3[1], w3[2], selector);
      w6[1] = hc_byte_perm (w3[0], w3[1], selector);
      w6[0] = hc_byte_perm (w2[3], w3[0], selector);
      w5[3] = hc_byte_perm (w2[2], w2[3], selector);
      w5[2] = hc_byte_perm (w2[1], w2[2], selector);
      w5[1] = hc_byte_perm (w2[0], w2[1], selector);
      w5[0] = hc_byte_perm (w1[3], w2[0], selector);
      w4[3] = hc_byte_perm (w1[2], w1[3], selector);
      w4[2] = hc_byte_perm (w1[1], w1[2], selector);
      w4[1] = hc_byte_perm (w1[0], w1[1], selector);
      w4[0] = hc_byte_perm (w0[3], w1[0], selector);
      w3[3] = hc_byte_perm (w0[2], w0[3], selector);
      w3[2] = hc_byte_perm (w0[1], w0[2], selector);
      w3[1] = hc_byte_perm (w0[0], w0[1], selector);
      w3[0] = hc_byte_perm (    0, w0[0], selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 13:
      w7[3] = hc_byte_perm (w4[1], w4[2], selector);
      w7[2] = hc_byte_perm (w4[0], w4[1], selector);
      w7[1] = hc_byte_perm (w3[3], w4[0], selector);
      w7[0] = hc_byte_perm (w3[2], w3[3], selector);
      w6[3] = hc_byte_perm (w3[1], w3[2], selector);
      w6[2] = hc_byte_perm (w3[0], w3[1], selector);
      w6[1] = hc_byte_perm (w2[3], w3[0], selector);
      w6[0] = hc_byte_perm (w2[2], w2[3], selector);
      w5[3] = hc_byte_perm (w2[1], w2[2], selector);
      w5[2] = hc_byte_perm (w2[0], w2[1], selector);
      w5[1] = hc_byte_perm (w1[3], w2[0], selector);
      w5[0] = hc_byte_perm (w1[2], w1[3], selector);
      w4[3] = hc_byte_perm (w1[1], w1[2], selector);
      w4[2] = hc_byte_perm (w1[0], w1[1], selector);
      w4[1] = hc_byte_perm (w0[3], w1[0], selector);
      w4[0] = hc_byte_perm (w0[2], w0[3], selector);
      w3[3] = hc_byte_perm (w0[1], w0[2], selector);
      w3[2] = hc_byte_perm (w0[0], w0[1], selector);
      w3[1] = hc_byte_perm (    0, w0[0], selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 14:
      w7[3] = hc_byte_perm (w4[0], w4[1], selector);
      w7[2] = hc_byte_perm (w3[3], w4[0], selector);
      w7[1] = hc_byte_perm (w3[2], w3[3], selector);
      w7[0] = hc_byte_perm (w3[1], w3[2], selector);
      w6[3] = hc_byte_perm (w3[0], w3[1], selector);
      w6[2] = hc_byte_perm (w2[3], w3[0], selector);
      w6[1] = hc_byte_perm (w2[2], w2[3], selector);
      w6[0] = hc_byte_perm (w2[1], w2[2], selector);
      w5[3] = hc_byte_perm (w2[0], w2[1], selector);
      w5[2] = hc_byte_perm (w1[3], w2[0], selector);
      w5[1] = hc_byte_perm (w1[2], w1[3], selector);
      w5[0] = hc_byte_perm (w1[1], w1[2], selector);
      w4[3] = hc_byte_perm (w1[0], w1[1], selector);
      w4[2] = hc_byte_perm (w0[3], w1[0], selector);
      w4[1] = hc_byte_perm (w0[2], w0[3], selector);
      w4[0] = hc_byte_perm (w0[1], w0[2], selector);
      w3[3] = hc_byte_perm (w0[0], w0[1], selector);
      w3[2] = hc_byte_perm (    0, w0[0], selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;

    case 15:
      w7[3] = hc_byte_perm (w3[3], w4[0], selector);
      w7[2] = hc_byte_perm (w3[2], w3[3], selector);
      w7[1] = hc_byte_perm (w3[1], w3[2], selector);
      w7[0] = hc_byte_perm (w3[0], w3[1], selector);
      w6[3] = hc_byte_perm (w2[3], w3[0], selector);
      w6[2] = hc_byte_perm (w2[2], w2[3], selector);
      w6[1] = hc_byte_perm (w2[1], w2[2], selector);
      w6[0] = hc_byte_perm (w2[0], w2[1], selector);
      w5[3] = hc_byte_perm (w1[3], w2[0], selector);
      w5[2] = hc_byte_perm (w1[2], w1[3], selector);
      w5[1] = hc_byte_perm (w1[1], w1[2], selector);
      w5[0] = hc_byte_perm (w1[0], w1[1], selector);
      w4[3] = hc_byte_perm (w0[3], w1[0], selector);
      w4[2] = hc_byte_perm (w0[2], w0[3], selector);
      w4[1] = hc_byte_perm (w0[1], w0[2], selector);
      w4[0] = hc_byte_perm (w0[0], w0[1], selector);
      w3[3] = hc_byte_perm (    0, w0[0], selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;
      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_8x4_carry_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, u32x *c0, u32x *c1, u32x *c2, u32x *c3, u32x *c4, u32x *c5, u32x *c6, u32x *c7, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_bytealign (w7[3],     0, offset);
      w7[3] = hc_bytealign (w7[2], w7[3], offset);
      w7[2] = hc_bytealign (w7[1], w7[2], offset);
      w7[1] = hc_bytealign (w7[0], w7[1], offset);
      w7[0] = hc_bytealign (w6[3], w7[0], offset);
      w6[3] = hc_bytealign (w6[2], w6[3], offset);
      w6[2] = hc_bytealign (w6[1], w6[2], offset);
      w6[1] = hc_bytealign (w6[0], w6[1], offset);
      w6[0] = hc_bytealign (w5[3], w6[0], offset);
      w5[3] = hc_bytealign (w5[2], w5[3], offset);
      w5[2] = hc_bytealign (w5[1], w5[2], offset);
      w5[1] = hc_bytealign (w5[0], w5[1], offset);
      w5[0] = hc_bytealign (w4[3], w5[0], offset);
      w4[3] = hc_bytealign (w4[2], w4[3], offset);
      w4[2] = hc_bytealign (w4[1], w4[2], offset);
      w4[1] = hc_bytealign (w4[0], w4[1], offset);
      w4[0] = hc_bytealign (w3[3], w4[0], offset);
      w3[3] = hc_bytealign (w3[2], w3[3], offset);
      w3[2] = hc_bytealign (w3[1], w3[2], offset);
      w3[1] = hc_bytealign (w3[0], w3[1], offset);
      w3[0] = hc_bytealign (w2[3], w3[0], offset);
      w2[3] = hc_bytealign (w2[2], w2[3], offset);
      w2[2] = hc_bytealign (w2[1], w2[2], offset);
      w2[1] = hc_bytealign (w2[0], w2[1], offset);
      w2[0] = hc_bytealign (w1[3], w2[0], offset);
      w1[3] = hc_bytealign (w1[2], w1[3], offset);
      w1[2] = hc_bytealign (w1[1], w1[2], offset);
      w1[1] = hc_bytealign (w1[0], w1[1], offset);
      w1[0] = hc_bytealign (w0[3], w1[0], offset);
      w0[3] = hc_bytealign (w0[2], w0[3], offset);
      w0[2] = hc_bytealign (w0[1], w0[2], offset);
      w0[1] = hc_bytealign (w0[0], w0[1], offset);
      w0[0] = hc_bytealign (    0, w0[0], offset);

      break;

    case  1:
      c0[1] = hc_bytealign (w7[3],     0, offset);
      c0[0] = hc_bytealign (w7[2], w7[3], offset);
      w7[3] = hc_bytealign (w7[1], w7[2], offset);
      w7[2] = hc_bytealign (w7[0], w7[1], offset);
      w7[1] = hc_bytealign (w6[3], w7[0], offset);
      w7[0] = hc_bytealign (w6[2], w6[3], offset);
      w6[3] = hc_bytealign (w6[1], w6[2], offset);
      w6[2] = hc_bytealign (w6[0], w6[1], offset);
      w6[1] = hc_bytealign (w5[3], w6[0], offset);
      w6[0] = hc_bytealign (w5[2], w5[3], offset);
      w5[3] = hc_bytealign (w5[1], w5[2], offset);
      w5[2] = hc_bytealign (w5[0], w5[1], offset);
      w5[1] = hc_bytealign (w4[3], w5[0], offset);
      w5[0] = hc_bytealign (w4[2], w4[3], offset);
      w4[3] = hc_bytealign (w4[1], w4[2], offset);
      w4[2] = hc_bytealign (w4[0], w4[1], offset);
      w4[1] = hc_bytealign (w3[3], w4[0], offset);
      w4[0] = hc_bytealign (w3[2], w3[3], offset);
      w3[3] = hc_bytealign (w3[1], w3[2], offset);
      w3[2] = hc_bytealign (w3[0], w3[1], offset);
      w3[1] = hc_bytealign (w2[3], w3[0], offset);
      w3[0] = hc_bytealign (w2[2], w2[3], offset);
      w2[3] = hc_bytealign (w2[1], w2[2], offset);
      w2[2] = hc_bytealign (w2[0], w2[1], offset);
      w2[1] = hc_bytealign (w1[3], w2[0], offset);
      w2[0] = hc_bytealign (w1[2], w1[3], offset);
      w1[3] = hc_bytealign (w1[1], w1[2], offset);
      w1[2] = hc_bytealign (w1[0], w1[1], offset);
      w1[1] = hc_bytealign (w0[3], w1[0], offset);
      w1[0] = hc_bytealign (w0[2], w0[3], offset);
      w0[3] = hc_bytealign (w0[1], w0[2], offset);
      w0[2] = hc_bytealign (w0[0], w0[1], offset);
      w0[1] = hc_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_bytealign (w7[3],     0, offset);
      c0[1] = hc_bytealign (w7[2], w7[3], offset);
      c0[0] = hc_bytealign (w7[1], w7[2], offset);
      w7[3] = hc_bytealign (w7[0], w7[1], offset);
      w7[2] = hc_bytealign (w6[3], w7[0], offset);
      w7[1] = hc_bytealign (w6[2], w6[3], offset);
      w7[0] = hc_bytealign (w6[1], w6[2], offset);
      w6[3] = hc_bytealign (w6[0], w6[1], offset);
      w6[2] = hc_bytealign (w5[3], w6[0], offset);
      w6[1] = hc_bytealign (w5[2], w5[3], offset);
      w6[0] = hc_bytealign (w5[1], w5[2], offset);
      w5[3] = hc_bytealign (w5[0], w5[1], offset);
      w5[2] = hc_bytealign (w4[3], w5[0], offset);
      w5[1] = hc_bytealign (w4[2], w4[3], offset);
      w5[0] = hc_bytealign (w4[1], w4[2], offset);
      w4[3] = hc_bytealign (w4[0], w4[1], offset);
      w4[2] = hc_bytealign (w3[3], w4[0], offset);
      w4[1] = hc_bytealign (w3[2], w3[3], offset);
      w4[0] = hc_bytealign (w3[1], w3[2], offset);
      w3[3] = hc_bytealign (w3[0], w3[1], offset);
      w3[2] = hc_bytealign (w2[3], w3[0], offset);
      w3[1] = hc_bytealign (w2[2], w2[3], offset);
      w3[0] = hc_bytealign (w2[1], w2[2], offset);
      w2[3] = hc_bytealign (w2[0], w2[1], offset);
      w2[2] = hc_bytealign (w1[3], w2[0], offset);
      w2[1] = hc_bytealign (w1[2], w1[3], offset);
      w2[0] = hc_bytealign (w1[1], w1[2], offset);
      w1[3] = hc_bytealign (w1[0], w1[1], offset);
      w1[2] = hc_bytealign (w0[3], w1[0], offset);
      w1[1] = hc_bytealign (w0[2], w0[3], offset);
      w1[0] = hc_bytealign (w0[1], w0[2], offset);
      w0[3] = hc_bytealign (w0[0], w0[1], offset);
      w0[2] = hc_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_bytealign (w7[3],     0, offset);
      c0[2] = hc_bytealign (w7[2], w7[3], offset);
      c0[1] = hc_bytealign (w7[1], w7[2], offset);
      c0[0] = hc_bytealign (w7[0], w7[1], offset);
      w7[3] = hc_bytealign (w6[3], w7[0], offset);
      w7[2] = hc_bytealign (w6[2], w6[3], offset);
      w7[1] = hc_bytealign (w6[1], w6[2], offset);
      w7[0] = hc_bytealign (w6[0], w6[1], offset);
      w6[3] = hc_bytealign (w5[3], w6[0], offset);
      w6[2] = hc_bytealign (w5[2], w5[3], offset);
      w6[1] = hc_bytealign (w5[1], w5[2], offset);
      w6[0] = hc_bytealign (w5[0], w5[1], offset);
      w5[3] = hc_bytealign (w4[3], w5[0], offset);
      w5[2] = hc_bytealign (w4[2], w4[3], offset);
      w5[1] = hc_bytealign (w4[1], w4[2], offset);
      w5[0] = hc_bytealign (w4[0], w4[1], offset);
      w4[3] = hc_bytealign (w3[3], w4[0], offset);
      w4[2] = hc_bytealign (w3[2], w3[3], offset);
      w4[1] = hc_bytealign (w3[1], w3[2], offset);
      w4[0] = hc_bytealign (w3[0], w3[1], offset);
      w3[3] = hc_bytealign (w2[3], w3[0], offset);
      w3[2] = hc_bytealign (w2[2], w2[3], offset);
      w3[1] = hc_bytealign (w2[1], w2[2], offset);
      w3[0] = hc_bytealign (w2[0], w2[1], offset);
      w2[3] = hc_bytealign (w1[3], w2[0], offset);
      w2[2] = hc_bytealign (w1[2], w1[3], offset);
      w2[1] = hc_bytealign (w1[1], w1[2], offset);
      w2[0] = hc_bytealign (w1[0], w1[1], offset);
      w1[3] = hc_bytealign (w0[3], w1[0], offset);
      w1[2] = hc_bytealign (w0[2], w0[3], offset);
      w1[1] = hc_bytealign (w0[1], w0[2], offset);
      w1[0] = hc_bytealign (w0[0], w0[1], offset);
      w0[3] = hc_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_bytealign (w7[3],     0, offset);
      c0[3] = hc_bytealign (w7[2], w7[3], offset);
      c0[2] = hc_bytealign (w7[1], w7[2], offset);
      c0[1] = hc_bytealign (w7[0], w7[1], offset);
      c0[0] = hc_bytealign (w6[3], w7[0], offset);
      w7[3] = hc_bytealign (w6[2], w6[3], offset);
      w7[2] = hc_bytealign (w6[1], w6[2], offset);
      w7[1] = hc_bytealign (w6[0], w6[1], offset);
      w7[0] = hc_bytealign (w5[3], w6[0], offset);
      w6[3] = hc_bytealign (w5[2], w5[3], offset);
      w6[2] = hc_bytealign (w5[1], w5[2], offset);
      w6[1] = hc_bytealign (w5[0], w5[1], offset);
      w6[0] = hc_bytealign (w4[3], w5[0], offset);
      w5[3] = hc_bytealign (w4[2], w4[3], offset);
      w5[2] = hc_bytealign (w4[1], w4[2], offset);
      w5[1] = hc_bytealign (w4[0], w4[1], offset);
      w5[0] = hc_bytealign (w3[3], w4[0], offset);
      w4[3] = hc_bytealign (w3[2], w3[3], offset);
      w4[2] = hc_bytealign (w3[1], w3[2], offset);
      w4[1] = hc_bytealign (w3[0], w3[1], offset);
      w4[0] = hc_bytealign (w2[3], w3[0], offset);
      w3[3] = hc_bytealign (w2[2], w2[3], offset);
      w3[2] = hc_bytealign (w2[1], w2[2], offset);
      w3[1] = hc_bytealign (w2[0], w2[1], offset);
      w3[0] = hc_bytealign (w1[3], w2[0], offset);
      w2[3] = hc_bytealign (w1[2], w1[3], offset);
      w2[2] = hc_bytealign (w1[1], w1[2], offset);
      w2[1] = hc_bytealign (w1[0], w1[1], offset);
      w2[0] = hc_bytealign (w0[3], w1[0], offset);
      w1[3] = hc_bytealign (w0[2], w0[3], offset);
      w1[2] = hc_bytealign (w0[1], w0[2], offset);
      w1[1] = hc_bytealign (w0[0], w0[1], offset);
      w1[0] = hc_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_bytealign (w7[3],     0, offset);
      c1[0] = hc_bytealign (w7[2], w7[3], offset);
      c0[3] = hc_bytealign (w7[1], w7[2], offset);
      c0[2] = hc_bytealign (w7[0], w7[1], offset);
      c0[1] = hc_bytealign (w6[3], w7[0], offset);
      c0[0] = hc_bytealign (w6[2], w6[3], offset);
      w7[3] = hc_bytealign (w6[1], w6[2], offset);
      w7[2] = hc_bytealign (w6[0], w6[1], offset);
      w7[1] = hc_bytealign (w5[3], w6[0], offset);
      w7[0] = hc_bytealign (w5[2], w5[3], offset);
      w6[3] = hc_bytealign (w5[1], w5[2], offset);
      w6[2] = hc_bytealign (w5[0], w5[1], offset);
      w6[1] = hc_bytealign (w4[3], w5[0], offset);
      w6[0] = hc_bytealign (w4[2], w4[3], offset);
      w5[3] = hc_bytealign (w4[1], w4[2], offset);
      w5[2] = hc_bytealign (w4[0], w4[1], offset);
      w5[1] = hc_bytealign (w3[3], w4[0], offset);
      w5[0] = hc_bytealign (w3[2], w3[3], offset);
      w4[3] = hc_bytealign (w3[1], w3[2], offset);
      w4[2] = hc_bytealign (w3[0], w3[1], offset);
      w4[1] = hc_bytealign (w2[3], w3[0], offset);
      w4[0] = hc_bytealign (w2[2], w2[3], offset);
      w3[3] = hc_bytealign (w2[1], w2[2], offset);
      w3[2] = hc_bytealign (w2[0], w2[1], offset);
      w3[1] = hc_bytealign (w1[3], w2[0], offset);
      w3[0] = hc_bytealign (w1[2], w1[3], offset);
      w2[3] = hc_bytealign (w1[1], w1[2], offset);
      w2[2] = hc_bytealign (w1[0], w1[1], offset);
      w2[1] = hc_bytealign (w0[3], w1[0], offset);
      w2[0] = hc_bytealign (w0[2], w0[3], offset);
      w1[3] = hc_bytealign (w0[1], w0[2], offset);
      w1[2] = hc_bytealign (w0[0], w0[1], offset);
      w1[1] = hc_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_bytealign (w7[3],     0, offset);
      c1[1] = hc_bytealign (w7[2], w7[3], offset);
      c1[0] = hc_bytealign (w7[1], w7[2], offset);
      c0[3] = hc_bytealign (w7[0], w7[1], offset);
      c0[2] = hc_bytealign (w6[3], w7[0], offset);
      c0[1] = hc_bytealign (w6[2], w6[3], offset);
      c0[0] = hc_bytealign (w6[1], w6[2], offset);
      w7[3] = hc_bytealign (w6[0], w6[1], offset);
      w7[2] = hc_bytealign (w5[3], w6[0], offset);
      w7[1] = hc_bytealign (w5[2], w5[3], offset);
      w7[0] = hc_bytealign (w5[1], w5[2], offset);
      w6[3] = hc_bytealign (w5[0], w5[1], offset);
      w6[2] = hc_bytealign (w4[3], w5[0], offset);
      w6[1] = hc_bytealign (w4[2], w4[3], offset);
      w6[0] = hc_bytealign (w4[1], w4[2], offset);
      w5[3] = hc_bytealign (w4[0], w4[1], offset);
      w5[2] = hc_bytealign (w3[3], w4[0], offset);
      w5[1] = hc_bytealign (w3[2], w3[3], offset);
      w5[0] = hc_bytealign (w3[1], w3[2], offset);
      w4[3] = hc_bytealign (w3[0], w3[1], offset);
      w4[2] = hc_bytealign (w2[3], w3[0], offset);
      w4[1] = hc_bytealign (w2[2], w2[3], offset);
      w4[0] = hc_bytealign (w2[1], w2[2], offset);
      w3[3] = hc_bytealign (w2[0], w2[1], offset);
      w3[2] = hc_bytealign (w1[3], w2[0], offset);
      w3[1] = hc_bytealign (w1[2], w1[3], offset);
      w3[0] = hc_bytealign (w1[1], w1[2], offset);
      w2[3] = hc_bytealign (w1[0], w1[1], offset);
      w2[2] = hc_bytealign (w0[3], w1[0], offset);
      w2[1] = hc_bytealign (w0[2], w0[3], offset);
      w2[0] = hc_bytealign (w0[1], w0[2], offset);
      w1[3] = hc_bytealign (w0[0], w0[1], offset);
      w1[2] = hc_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_bytealign (w7[3],     0, offset);
      c1[2] = hc_bytealign (w7[2], w7[3], offset);
      c1[1] = hc_bytealign (w7[1], w7[2], offset);
      c1[0] = hc_bytealign (w7[0], w7[1], offset);
      c0[3] = hc_bytealign (w6[3], w7[0], offset);
      c0[2] = hc_bytealign (w6[2], w6[3], offset);
      c0[1] = hc_bytealign (w6[1], w6[2], offset);
      c0[0] = hc_bytealign (w6[0], w6[1], offset);
      w7[3] = hc_bytealign (w5[3], w6[0], offset);
      w7[2] = hc_bytealign (w5[2], w5[3], offset);
      w7[1] = hc_bytealign (w5[1], w5[2], offset);
      w7[0] = hc_bytealign (w5[0], w5[1], offset);
      w6[3] = hc_bytealign (w4[3], w5[0], offset);
      w6[2] = hc_bytealign (w4[2], w4[3], offset);
      w6[1] = hc_bytealign (w4[1], w4[2], offset);
      w6[0] = hc_bytealign (w4[0], w4[1], offset);
      w5[3] = hc_bytealign (w3[3], w4[0], offset);
      w5[2] = hc_bytealign (w3[2], w3[3], offset);
      w5[1] = hc_bytealign (w3[1], w3[2], offset);
      w5[0] = hc_bytealign (w3[0], w3[1], offset);
      w4[3] = hc_bytealign (w2[3], w3[0], offset);
      w4[2] = hc_bytealign (w2[2], w2[3], offset);
      w4[1] = hc_bytealign (w2[1], w2[2], offset);
      w4[0] = hc_bytealign (w2[0], w2[1], offset);
      w3[3] = hc_bytealign (w1[3], w2[0], offset);
      w3[2] = hc_bytealign (w1[2], w1[3], offset);
      w3[1] = hc_bytealign (w1[1], w1[2], offset);
      w3[0] = hc_bytealign (w1[0], w1[1], offset);
      w2[3] = hc_bytealign (w0[3], w1[0], offset);
      w2[2] = hc_bytealign (w0[2], w0[3], offset);
      w2[1] = hc_bytealign (w0[1], w0[2], offset);
      w2[0] = hc_bytealign (w0[0], w0[1], offset);
      w1[3] = hc_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_bytealign (w7[3],     0, offset);
      c1[3] = hc_bytealign (w7[2], w7[3], offset);
      c1[2] = hc_bytealign (w7[1], w7[2], offset);
      c1[1] = hc_bytealign (w7[0], w7[1], offset);
      c1[0] = hc_bytealign (w6[3], w7[0], offset);
      c0[3] = hc_bytealign (w6[2], w6[3], offset);
      c0[2] = hc_bytealign (w6[1], w6[2], offset);
      c0[1] = hc_bytealign (w6[0], w6[1], offset);
      c0[0] = hc_bytealign (w5[3], w6[0], offset);
      w7[3] = hc_bytealign (w5[2], w5[3], offset);
      w7[2] = hc_bytealign (w5[1], w5[2], offset);
      w7[1] = hc_bytealign (w5[0], w5[1], offset);
      w7[0] = hc_bytealign (w4[3], w5[0], offset);
      w6[3] = hc_bytealign (w4[2], w4[3], offset);
      w6[2] = hc_bytealign (w4[1], w4[2], offset);
      w6[1] = hc_bytealign (w4[0], w4[1], offset);
      w6[0] = hc_bytealign (w3[3], w4[0], offset);
      w5[3] = hc_bytealign (w3[2], w3[3], offset);
      w5[2] = hc_bytealign (w3[1], w3[2], offset);
      w5[1] = hc_bytealign (w3[0], w3[1], offset);
      w5[0] = hc_bytealign (w2[3], w3[0], offset);
      w4[3] = hc_bytealign (w2[2], w2[3], offset);
      w4[2] = hc_bytealign (w2[1], w2[2], offset);
      w4[1] = hc_bytealign (w2[0], w2[1], offset);
      w4[0] = hc_bytealign (w1[3], w2[0], offset);
      w3[3] = hc_bytealign (w1[2], w1[3], offset);
      w3[2] = hc_bytealign (w1[1], w1[2], offset);
      w3[1] = hc_bytealign (w1[0], w1[1], offset);
      w3[0] = hc_bytealign (w0[3], w1[0], offset);
      w2[3] = hc_bytealign (w0[2], w0[3], offset);
      w2[2] = hc_bytealign (w0[1], w0[2], offset);
      w2[1] = hc_bytealign (w0[0], w0[1], offset);
      w2[0] = hc_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_bytealign (w7[3],     0, offset);
      c2[0] = hc_bytealign (w7[2], w7[3], offset);
      c1[3] = hc_bytealign (w7[1], w7[2], offset);
      c1[2] = hc_bytealign (w7[0], w7[1], offset);
      c1[1] = hc_bytealign (w6[3], w7[0], offset);
      c1[0] = hc_bytealign (w6[2], w6[3], offset);
      c0[3] = hc_bytealign (w6[1], w6[2], offset);
      c0[2] = hc_bytealign (w6[0], w6[1], offset);
      c0[1] = hc_bytealign (w5[3], w6[0], offset);
      c0[0] = hc_bytealign (w5[2], w5[3], offset);
      w7[3] = hc_bytealign (w5[1], w5[2], offset);
      w7[2] = hc_bytealign (w5[0], w5[1], offset);
      w7[1] = hc_bytealign (w4[3], w5[0], offset);
      w7[0] = hc_bytealign (w4[2], w4[3], offset);
      w6[3] = hc_bytealign (w4[1], w4[2], offset);
      w6[2] = hc_bytealign (w4[0], w4[1], offset);
      w6[1] = hc_bytealign (w3[3], w4[0], offset);
      w6[0] = hc_bytealign (w3[2], w3[3], offset);
      w5[3] = hc_bytealign (w3[1], w3[2], offset);
      w5[2] = hc_bytealign (w3[0], w3[1], offset);
      w5[1] = hc_bytealign (w2[3], w3[0], offset);
      w5[0] = hc_bytealign (w2[2], w2[3], offset);
      w4[3] = hc_bytealign (w2[1], w2[2], offset);
      w4[2] = hc_bytealign (w2[0], w2[1], offset);
      w4[1] = hc_bytealign (w1[3], w2[0], offset);
      w4[0] = hc_bytealign (w1[2], w1[3], offset);
      w3[3] = hc_bytealign (w1[1], w1[2], offset);
      w3[2] = hc_bytealign (w1[0], w1[1], offset);
      w3[1] = hc_bytealign (w0[3], w1[0], offset);
      w3[0] = hc_bytealign (w0[2], w0[3], offset);
      w2[3] = hc_bytealign (w0[1], w0[2], offset);
      w2[2] = hc_bytealign (w0[0], w0[1], offset);
      w2[1] = hc_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_bytealign (w7[3],     0, offset);
      c2[1] = hc_bytealign (w7[2], w7[3], offset);
      c2[0] = hc_bytealign (w7[1], w7[2], offset);
      c1[3] = hc_bytealign (w7[0], w7[1], offset);
      c1[2] = hc_bytealign (w6[3], w7[0], offset);
      c1[1] = hc_bytealign (w6[2], w6[3], offset);
      c1[0] = hc_bytealign (w6[1], w6[2], offset);
      c0[3] = hc_bytealign (w6[0], w6[1], offset);
      c0[2] = hc_bytealign (w5[3], w6[0], offset);
      c0[1] = hc_bytealign (w5[2], w5[3], offset);
      c0[0] = hc_bytealign (w5[1], w5[2], offset);
      w7[3] = hc_bytealign (w5[0], w5[1], offset);
      w7[2] = hc_bytealign (w4[3], w5[0], offset);
      w7[1] = hc_bytealign (w4[2], w4[3], offset);
      w7[0] = hc_bytealign (w4[1], w4[2], offset);
      w6[3] = hc_bytealign (w4[0], w4[1], offset);
      w6[2] = hc_bytealign (w3[3], w4[0], offset);
      w6[1] = hc_bytealign (w3[2], w3[3], offset);
      w6[0] = hc_bytealign (w3[1], w3[2], offset);
      w5[3] = hc_bytealign (w3[0], w3[1], offset);
      w5[2] = hc_bytealign (w2[3], w3[0], offset);
      w5[1] = hc_bytealign (w2[2], w2[3], offset);
      w5[0] = hc_bytealign (w2[1], w2[2], offset);
      w4[3] = hc_bytealign (w2[0], w2[1], offset);
      w4[2] = hc_bytealign (w1[3], w2[0], offset);
      w4[1] = hc_bytealign (w1[2], w1[3], offset);
      w4[0] = hc_bytealign (w1[1], w1[2], offset);
      w3[3] = hc_bytealign (w1[0], w1[1], offset);
      w3[2] = hc_bytealign (w0[3], w1[0], offset);
      w3[1] = hc_bytealign (w0[2], w0[3], offset);
      w3[0] = hc_bytealign (w0[1], w0[2], offset);
      w2[3] = hc_bytealign (w0[0], w0[1], offset);
      w2[2] = hc_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_bytealign (w7[3],     0, offset);
      c2[2] = hc_bytealign (w7[2], w7[3], offset);
      c2[1] = hc_bytealign (w7[1], w7[2], offset);
      c2[0] = hc_bytealign (w7[0], w7[1], offset);
      c1[3] = hc_bytealign (w6[3], w7[0], offset);
      c1[2] = hc_bytealign (w6[2], w6[3], offset);
      c1[1] = hc_bytealign (w6[1], w6[2], offset);
      c1[0] = hc_bytealign (w6[0], w6[1], offset);
      c0[3] = hc_bytealign (w5[3], w6[0], offset);
      c0[2] = hc_bytealign (w5[2], w5[3], offset);
      c0[1] = hc_bytealign (w5[1], w5[2], offset);
      c0[0] = hc_bytealign (w5[0], w5[1], offset);
      w7[3] = hc_bytealign (w4[3], w5[0], offset);
      w7[2] = hc_bytealign (w4[2], w4[3], offset);
      w7[1] = hc_bytealign (w4[1], w4[2], offset);
      w7[0] = hc_bytealign (w4[0], w4[1], offset);
      w6[3] = hc_bytealign (w3[3], w4[0], offset);
      w6[2] = hc_bytealign (w3[2], w3[3], offset);
      w6[1] = hc_bytealign (w3[1], w3[2], offset);
      w6[0] = hc_bytealign (w3[0], w3[1], offset);
      w5[3] = hc_bytealign (w2[3], w3[0], offset);
      w5[2] = hc_bytealign (w2[2], w2[3], offset);
      w5[1] = hc_bytealign (w2[1], w2[2], offset);
      w5[0] = hc_bytealign (w2[0], w2[1], offset);
      w4[3] = hc_bytealign (w1[3], w2[0], offset);
      w4[2] = hc_bytealign (w1[2], w1[3], offset);
      w4[1] = hc_bytealign (w1[1], w1[2], offset);
      w4[0] = hc_bytealign (w1[0], w1[1], offset);
      w3[3] = hc_bytealign (w0[3], w1[0], offset);
      w3[2] = hc_bytealign (w0[2], w0[3], offset);
      w3[1] = hc_bytealign (w0[1], w0[2], offset);
      w3[0] = hc_bytealign (w0[0], w0[1], offset);
      w2[3] = hc_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_bytealign (w7[3],     0, offset);
      c2[3] = hc_bytealign (w7[2], w7[3], offset);
      c2[2] = hc_bytealign (w7[1], w7[2], offset);
      c2[1] = hc_bytealign (w7[0], w7[1], offset);
      c2[0] = hc_bytealign (w6[3], w7[0], offset);
      c1[3] = hc_bytealign (w6[2], w6[3], offset);
      c1[2] = hc_bytealign (w6[1], w6[2], offset);
      c1[1] = hc_bytealign (w6[0], w6[1], offset);
      c1[0] = hc_bytealign (w5[3], w6[0], offset);
      c0[3] = hc_bytealign (w5[2], w5[3], offset);
      c0[2] = hc_bytealign (w5[1], w5[2], offset);
      c0[1] = hc_bytealign (w5[0], w5[1], offset);
      c0[0] = hc_bytealign (w4[3], w5[0], offset);
      w7[3] = hc_bytealign (w4[2], w4[3], offset);
      w7[2] = hc_bytealign (w4[1], w4[2], offset);
      w7[1] = hc_bytealign (w4[0], w4[1], offset);
      w7[0] = hc_bytealign (w3[3], w4[0], offset);
      w6[3] = hc_bytealign (w3[2], w3[3], offset);
      w6[2] = hc_bytealign (w3[1], w3[2], offset);
      w6[1] = hc_bytealign (w3[0], w3[1], offset);
      w6[0] = hc_bytealign (w2[3], w3[0], offset);
      w5[3] = hc_bytealign (w2[2], w2[3], offset);
      w5[2] = hc_bytealign (w2[1], w2[2], offset);
      w5[1] = hc_bytealign (w2[0], w2[1], offset);
      w5[0] = hc_bytealign (w1[3], w2[0], offset);
      w4[3] = hc_bytealign (w1[2], w1[3], offset);
      w4[2] = hc_bytealign (w1[1], w1[2], offset);
      w4[1] = hc_bytealign (w1[0], w1[1], offset);
      w4[0] = hc_bytealign (w0[3], w1[0], offset);
      w3[3] = hc_bytealign (w0[2], w0[3], offset);
      w3[2] = hc_bytealign (w0[1], w0[2], offset);
      w3[1] = hc_bytealign (w0[0], w0[1], offset);
      w3[0] = hc_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_bytealign (w7[3],     0, offset);
      c3[0] = hc_bytealign (w7[2], w7[3], offset);
      c2[3] = hc_bytealign (w7[1], w7[2], offset);
      c2[2] = hc_bytealign (w7[0], w7[1], offset);
      c2[1] = hc_bytealign (w6[3], w7[0], offset);
      c2[0] = hc_bytealign (w6[2], w6[3], offset);
      c1[3] = hc_bytealign (w6[1], w6[2], offset);
      c1[2] = hc_bytealign (w6[0], w6[1], offset);
      c1[1] = hc_bytealign (w5[3], w6[0], offset);
      c1[0] = hc_bytealign (w5[2], w5[3], offset);
      c0[3] = hc_bytealign (w5[1], w5[2], offset);
      c0[2] = hc_bytealign (w5[0], w5[1], offset);
      c0[1] = hc_bytealign (w4[3], w5[0], offset);
      c0[0] = hc_bytealign (w4[2], w4[3], offset);
      w7[3] = hc_bytealign (w4[1], w4[2], offset);
      w7[2] = hc_bytealign (w4[0], w4[1], offset);
      w7[1] = hc_bytealign (w3[3], w4[0], offset);
      w7[0] = hc_bytealign (w3[2], w3[3], offset);
      w6[3] = hc_bytealign (w3[1], w3[2], offset);
      w6[2] = hc_bytealign (w3[0], w3[1], offset);
      w6[1] = hc_bytealign (w2[3], w3[0], offset);
      w6[0] = hc_bytealign (w2[2], w2[3], offset);
      w5[3] = hc_bytealign (w2[1], w2[2], offset);
      w5[2] = hc_bytealign (w2[0], w2[1], offset);
      w5[1] = hc_bytealign (w1[3], w2[0], offset);
      w5[0] = hc_bytealign (w1[2], w1[3], offset);
      w4[3] = hc_bytealign (w1[1], w1[2], offset);
      w4[2] = hc_bytealign (w1[0], w1[1], offset);
      w4[1] = hc_bytealign (w0[3], w1[0], offset);
      w4[0] = hc_bytealign (w0[2], w0[3], offset);
      w3[3] = hc_bytealign (w0[1], w0[2], offset);
      w3[2] = hc_bytealign (w0[0], w0[1], offset);
      w3[1] = hc_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_bytealign (w7[3],     0, offset);
      c3[1] = hc_bytealign (w7[2], w7[3], offset);
      c3[0] = hc_bytealign (w7[1], w7[2], offset);
      c2[3] = hc_bytealign (w7[0], w7[1], offset);
      c2[2] = hc_bytealign (w6[3], w7[0], offset);
      c2[1] = hc_bytealign (w6[2], w6[3], offset);
      c2[0] = hc_bytealign (w6[1], w6[2], offset);
      c1[3] = hc_bytealign (w6[0], w6[1], offset);
      c1[2] = hc_bytealign (w5[3], w6[0], offset);
      c1[1] = hc_bytealign (w5[2], w5[3], offset);
      c1[0] = hc_bytealign (w5[1], w5[2], offset);
      c0[3] = hc_bytealign (w5[0], w5[1], offset);
      c0[2] = hc_bytealign (w4[3], w5[0], offset);
      c0[1] = hc_bytealign (w4[2], w4[3], offset);
      c0[0] = hc_bytealign (w4[1], w4[2], offset);
      w7[3] = hc_bytealign (w4[0], w4[1], offset);
      w7[2] = hc_bytealign (w3[3], w4[0], offset);
      w7[1] = hc_bytealign (w3[2], w3[3], offset);
      w7[0] = hc_bytealign (w3[1], w3[2], offset);
      w6[3] = hc_bytealign (w3[0], w3[1], offset);
      w6[2] = hc_bytealign (w2[3], w3[0], offset);
      w6[1] = hc_bytealign (w2[2], w2[3], offset);
      w6[0] = hc_bytealign (w2[1], w2[2], offset);
      w5[3] = hc_bytealign (w2[0], w2[1], offset);
      w5[2] = hc_bytealign (w1[3], w2[0], offset);
      w5[1] = hc_bytealign (w1[2], w1[3], offset);
      w5[0] = hc_bytealign (w1[1], w1[2], offset);
      w4[3] = hc_bytealign (w1[0], w1[1], offset);
      w4[2] = hc_bytealign (w0[3], w1[0], offset);
      w4[1] = hc_bytealign (w0[2], w0[3], offset);
      w4[0] = hc_bytealign (w0[1], w0[2], offset);
      w3[3] = hc_bytealign (w0[0], w0[1], offset);
      w3[2] = hc_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_bytealign (w7[3],     0, offset);
      c3[2] = hc_bytealign (w7[2], w7[3], offset);
      c3[1] = hc_bytealign (w7[1], w7[2], offset);
      c3[0] = hc_bytealign (w7[0], w7[1], offset);
      c2[3] = hc_bytealign (w6[3], w7[0], offset);
      c2[2] = hc_bytealign (w6[2], w6[3], offset);
      c2[1] = hc_bytealign (w6[1], w6[2], offset);
      c2[0] = hc_bytealign (w6[0], w6[1], offset);
      c1[3] = hc_bytealign (w5[3], w6[0], offset);
      c1[2] = hc_bytealign (w5[2], w5[3], offset);
      c1[1] = hc_bytealign (w5[1], w5[2], offset);
      c1[0] = hc_bytealign (w5[0], w5[1], offset);
      c0[3] = hc_bytealign (w4[3], w5[0], offset);
      c0[2] = hc_bytealign (w4[2], w4[3], offset);
      c0[1] = hc_bytealign (w4[1], w4[2], offset);
      c0[0] = hc_bytealign (w4[0], w4[1], offset);
      w7[3] = hc_bytealign (w3[3], w4[0], offset);
      w7[2] = hc_bytealign (w3[2], w3[3], offset);
      w7[1] = hc_bytealign (w3[1], w3[2], offset);
      w7[0] = hc_bytealign (w3[0], w3[1], offset);
      w6[3] = hc_bytealign (w2[3], w3[0], offset);
      w6[2] = hc_bytealign (w2[2], w2[3], offset);
      w6[1] = hc_bytealign (w2[1], w2[2], offset);
      w6[0] = hc_bytealign (w2[0], w2[1], offset);
      w5[3] = hc_bytealign (w1[3], w2[0], offset);
      w5[2] = hc_bytealign (w1[2], w1[3], offset);
      w5[1] = hc_bytealign (w1[1], w1[2], offset);
      w5[0] = hc_bytealign (w1[0], w1[1], offset);
      w4[3] = hc_bytealign (w0[3], w1[0], offset);
      w4[2] = hc_bytealign (w0[2], w0[3], offset);
      w4[1] = hc_bytealign (w0[1], w0[2], offset);
      w4[0] = hc_bytealign (w0[0], w0[1], offset);
      w3[3] = hc_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      c4[0] = hc_bytealign (w7[3],     0, offset);
      c3[3] = hc_bytealign (w7[2], w7[3], offset);
      c3[2] = hc_bytealign (w7[1], w7[2], offset);
      c3[1] = hc_bytealign (w7[0], w7[1], offset);
      c3[0] = hc_bytealign (w6[3], w7[0], offset);
      c2[3] = hc_bytealign (w6[2], w6[3], offset);
      c2[2] = hc_bytealign (w6[1], w6[2], offset);
      c2[1] = hc_bytealign (w6[0], w6[1], offset);
      c2[0] = hc_bytealign (w5[3], w6[0], offset);
      c1[3] = hc_bytealign (w5[2], w5[3], offset);
      c1[2] = hc_bytealign (w5[1], w5[2], offset);
      c1[1] = hc_bytealign (w5[0], w5[1], offset);
      c1[0] = hc_bytealign (w4[3], w5[0], offset);
      c0[3] = hc_bytealign (w4[2], w4[3], offset);
      c0[2] = hc_bytealign (w4[1], w4[2], offset);
      c0[1] = hc_bytealign (w4[0], w4[1], offset);
      c0[0] = hc_bytealign (w3[3], w4[0], offset);
      w7[3] = hc_bytealign (w3[2], w3[3], offset);
      w7[2] = hc_bytealign (w3[1], w3[2], offset);
      w7[1] = hc_bytealign (w3[0], w3[1], offset);
      w7[0] = hc_bytealign (w2[3], w3[0], offset);
      w6[3] = hc_bytealign (w2[2], w2[3], offset);
      w6[2] = hc_bytealign (w2[1], w2[2], offset);
      w6[1] = hc_bytealign (w2[0], w2[1], offset);
      w6[0] = hc_bytealign (w1[3], w2[0], offset);
      w5[3] = hc_bytealign (w1[2], w1[3], offset);
      w5[2] = hc_bytealign (w1[1], w1[2], offset);
      w5[1] = hc_bytealign (w1[0], w1[1], offset);
      w5[0] = hc_bytealign (w0[3], w1[0], offset);
      w4[3] = hc_bytealign (w0[2], w0[3], offset);
      w4[2] = hc_bytealign (w0[1], w0[2], offset);
      w4[1] = hc_bytealign (w0[0], w0[1], offset);
      w4[0] = hc_bytealign (    0, w0[0], offset);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      c4[1] = hc_bytealign (w7[3],     0, offset);
      c4[0] = hc_bytealign (w7[2], w7[3], offset);
      c3[3] = hc_bytealign (w7[1], w7[2], offset);
      c3[2] = hc_bytealign (w7[0], w7[1], offset);
      c3[1] = hc_bytealign (w6[3], w7[0], offset);
      c3[0] = hc_bytealign (w6[2], w6[3], offset);
      c2[3] = hc_bytealign (w6[1], w6[2], offset);
      c2[2] = hc_bytealign (w6[0], w6[1], offset);
      c2[1] = hc_bytealign (w5[3], w6[0], offset);
      c2[0] = hc_bytealign (w5[2], w5[3], offset);
      c1[3] = hc_bytealign (w5[1], w5[2], offset);
      c1[2] = hc_bytealign (w5[0], w5[1], offset);
      c1[1] = hc_bytealign (w4[3], w5[0], offset);
      c1[0] = hc_bytealign (w4[2], w4[3], offset);
      c0[3] = hc_bytealign (w4[1], w4[2], offset);
      c0[2] = hc_bytealign (w4[0], w4[1], offset);
      c0[1] = hc_bytealign (w3[3], w4[0], offset);
      c0[0] = hc_bytealign (w3[2], w3[3], offset);
      w7[3] = hc_bytealign (w3[1], w3[2], offset);
      w7[2] = hc_bytealign (w3[0], w3[1], offset);
      w7[1] = hc_bytealign (w2[3], w3[0], offset);
      w7[0] = hc_bytealign (w2[2], w2[3], offset);
      w6[3] = hc_bytealign (w2[1], w2[2], offset);
      w6[2] = hc_bytealign (w2[0], w2[1], offset);
      w6[1] = hc_bytealign (w1[3], w2[0], offset);
      w6[0] = hc_bytealign (w1[2], w1[3], offset);
      w5[3] = hc_bytealign (w1[1], w1[2], offset);
      w5[2] = hc_bytealign (w1[0], w1[1], offset);
      w5[1] = hc_bytealign (w0[3], w1[0], offset);
      w5[0] = hc_bytealign (w0[2], w0[3], offset);
      w4[3] = hc_bytealign (w0[1], w0[2], offset);
      w4[2] = hc_bytealign (w0[0], w0[1], offset);
      w4[1] = hc_bytealign (    0, w0[0], offset);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      c4[2] = hc_bytealign (w7[3],     0, offset);
      c4[1] = hc_bytealign (w7[2], w7[3], offset);
      c4[0] = hc_bytealign (w7[1], w7[2], offset);
      c3[3] = hc_bytealign (w7[0], w7[1], offset);
      c3[2] = hc_bytealign (w6[3], w7[0], offset);
      c3[1] = hc_bytealign (w6[2], w6[3], offset);
      c3[0] = hc_bytealign (w6[1], w6[2], offset);
      c2[3] = hc_bytealign (w6[0], w6[1], offset);
      c2[2] = hc_bytealign (w5[3], w6[0], offset);
      c2[1] = hc_bytealign (w5[2], w5[3], offset);
      c2[0] = hc_bytealign (w5[1], w5[2], offset);
      c1[3] = hc_bytealign (w5[0], w5[1], offset);
      c1[2] = hc_bytealign (w4[3], w5[0], offset);
      c1[1] = hc_bytealign (w4[2], w4[3], offset);
      c1[0] = hc_bytealign (w4[1], w4[2], offset);
      c0[3] = hc_bytealign (w4[0], w4[1], offset);
      c0[2] = hc_bytealign (w3[3], w4[0], offset);
      c0[1] = hc_bytealign (w3[2], w3[3], offset);
      c0[0] = hc_bytealign (w3[1], w3[2], offset);
      w7[3] = hc_bytealign (w3[0], w3[1], offset);
      w7[2] = hc_bytealign (w2[3], w3[0], offset);
      w7[1] = hc_bytealign (w2[2], w2[3], offset);
      w7[0] = hc_bytealign (w2[1], w2[2], offset);
      w6[3] = hc_bytealign (w2[0], w2[1], offset);
      w6[2] = hc_bytealign (w1[3], w2[0], offset);
      w6[1] = hc_bytealign (w1[2], w1[3], offset);
      w6[0] = hc_bytealign (w1[1], w1[2], offset);
      w5[3] = hc_bytealign (w1[0], w1[1], offset);
      w5[2] = hc_bytealign (w0[3], w1[0], offset);
      w5[1] = hc_bytealign (w0[2], w0[3], offset);
      w5[0] = hc_bytealign (w0[1], w0[2], offset);
      w4[3] = hc_bytealign (w0[0], w0[1], offset);
      w4[2] = hc_bytealign (    0, w0[0], offset);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      c4[3] = hc_bytealign (w7[3],     0, offset);
      c4[2] = hc_bytealign (w7[2], w7[3], offset);
      c4[1] = hc_bytealign (w7[1], w7[2], offset);
      c4[0] = hc_bytealign (w7[0], w7[1], offset);
      c3[3] = hc_bytealign (w6[3], w7[0], offset);
      c3[2] = hc_bytealign (w6[2], w6[3], offset);
      c3[1] = hc_bytealign (w6[1], w6[2], offset);
      c3[0] = hc_bytealign (w6[0], w6[1], offset);
      c2[3] = hc_bytealign (w5[3], w6[0], offset);
      c2[2] = hc_bytealign (w5[2], w5[3], offset);
      c2[1] = hc_bytealign (w5[1], w5[2], offset);
      c2[0] = hc_bytealign (w5[0], w5[1], offset);
      c1[3] = hc_bytealign (w4[3], w5[0], offset);
      c1[2] = hc_bytealign (w4[2], w4[3], offset);
      c1[1] = hc_bytealign (w4[1], w4[2], offset);
      c1[0] = hc_bytealign (w4[0], w4[1], offset);
      c0[3] = hc_bytealign (w3[3], w4[0], offset);
      c0[2] = hc_bytealign (w3[2], w3[3], offset);
      c0[1] = hc_bytealign (w3[1], w3[2], offset);
      c0[0] = hc_bytealign (w3[0], w3[1], offset);
      w7[3] = hc_bytealign (w2[3], w3[0], offset);
      w7[2] = hc_bytealign (w2[2], w2[3], offset);
      w7[1] = hc_bytealign (w2[1], w2[2], offset);
      w7[0] = hc_bytealign (w2[0], w2[1], offset);
      w6[3] = hc_bytealign (w1[3], w2[0], offset);
      w6[2] = hc_bytealign (w1[2], w1[3], offset);
      w6[1] = hc_bytealign (w1[1], w1[2], offset);
      w6[0] = hc_bytealign (w1[0], w1[1], offset);
      w5[3] = hc_bytealign (w0[3], w1[0], offset);
      w5[2] = hc_bytealign (w0[2], w0[3], offset);
      w5[1] = hc_bytealign (w0[1], w0[2], offset);
      w5[0] = hc_bytealign (w0[0], w0[1], offset);
      w4[3] = hc_bytealign (    0, w0[0], offset);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      c5[0] = hc_bytealign (w7[3],     0, offset);
      c4[3] = hc_bytealign (w7[2], w7[3], offset);
      c4[2] = hc_bytealign (w7[1], w7[2], offset);
      c4[1] = hc_bytealign (w7[0], w7[1], offset);
      c4[0] = hc_bytealign (w6[3], w7[0], offset);
      c3[3] = hc_bytealign (w6[2], w6[3], offset);
      c3[2] = hc_bytealign (w6[1], w6[2], offset);
      c3[1] = hc_bytealign (w6[0], w6[1], offset);
      c3[0] = hc_bytealign (w5[3], w6[0], offset);
      c2[3] = hc_bytealign (w5[2], w5[3], offset);
      c2[2] = hc_bytealign (w5[1], w5[2], offset);
      c2[1] = hc_bytealign (w5[0], w5[1], offset);
      c2[0] = hc_bytealign (w4[3], w5[0], offset);
      c1[3] = hc_bytealign (w4[2], w4[3], offset);
      c1[2] = hc_bytealign (w4[1], w4[2], offset);
      c1[1] = hc_bytealign (w4[0], w4[1], offset);
      c1[0] = hc_bytealign (w3[3], w4[0], offset);
      c0[3] = hc_bytealign (w3[2], w3[3], offset);
      c0[2] = hc_bytealign (w3[1], w3[2], offset);
      c0[1] = hc_bytealign (w3[0], w3[1], offset);
      c0[0] = hc_bytealign (w2[3], w3[0], offset);
      w7[3] = hc_bytealign (w2[2], w2[3], offset);
      w7[2] = hc_bytealign (w2[1], w2[2], offset);
      w7[1] = hc_bytealign (w2[0], w2[1], offset);
      w7[0] = hc_bytealign (w1[3], w2[0], offset);
      w6[3] = hc_bytealign (w1[2], w1[3], offset);
      w6[2] = hc_bytealign (w1[1], w1[2], offset);
      w6[1] = hc_bytealign (w1[0], w1[1], offset);
      w6[0] = hc_bytealign (w0[3], w1[0], offset);
      w5[3] = hc_bytealign (w0[2], w0[3], offset);
      w5[2] = hc_bytealign (w0[1], w0[2], offset);
      w5[1] = hc_bytealign (w0[0], w0[1], offset);
      w5[0] = hc_bytealign (    0, w0[0], offset);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      c5[1] = hc_bytealign (w7[3],     0, offset);
      c5[0] = hc_bytealign (w7[2], w7[3], offset);
      c4[3] = hc_bytealign (w7[1], w7[2], offset);
      c4[2] = hc_bytealign (w7[0], w7[1], offset);
      c4[1] = hc_bytealign (w6[3], w7[0], offset);
      c4[0] = hc_bytealign (w6[2], w6[3], offset);
      c3[3] = hc_bytealign (w6[1], w6[2], offset);
      c3[2] = hc_bytealign (w6[0], w6[1], offset);
      c3[1] = hc_bytealign (w5[3], w6[0], offset);
      c3[0] = hc_bytealign (w5[2], w5[3], offset);
      c2[3] = hc_bytealign (w5[1], w5[2], offset);
      c2[2] = hc_bytealign (w5[0], w5[1], offset);
      c2[1] = hc_bytealign (w4[3], w5[0], offset);
      c2[0] = hc_bytealign (w4[2], w4[3], offset);
      c1[3] = hc_bytealign (w4[1], w4[2], offset);
      c1[2] = hc_bytealign (w4[0], w4[1], offset);
      c1[1] = hc_bytealign (w3[3], w4[0], offset);
      c1[0] = hc_bytealign (w3[2], w3[3], offset);
      c0[3] = hc_bytealign (w3[1], w3[2], offset);
      c0[2] = hc_bytealign (w3[0], w3[1], offset);
      c0[1] = hc_bytealign (w2[3], w3[0], offset);
      c0[0] = hc_bytealign (w2[2], w2[3], offset);
      w7[3] = hc_bytealign (w2[1], w2[2], offset);
      w7[2] = hc_bytealign (w2[0], w2[1], offset);
      w7[1] = hc_bytealign (w1[3], w2[0], offset);
      w7[0] = hc_bytealign (w1[2], w1[3], offset);
      w6[3] = hc_bytealign (w1[1], w1[2], offset);
      w6[2] = hc_bytealign (w1[0], w1[1], offset);
      w6[1] = hc_bytealign (w0[3], w1[0], offset);
      w6[0] = hc_bytealign (w0[2], w0[3], offset);
      w5[3] = hc_bytealign (w0[1], w0[2], offset);
      w5[2] = hc_bytealign (w0[0], w0[1], offset);
      w5[1] = hc_bytealign (    0, w0[0], offset);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      c5[2] = hc_bytealign (w7[3],     0, offset);
      c5[1] = hc_bytealign (w7[2], w7[3], offset);
      c5[0] = hc_bytealign (w7[1], w7[2], offset);
      c4[3] = hc_bytealign (w7[0], w7[1], offset);
      c4[2] = hc_bytealign (w6[3], w7[0], offset);
      c4[1] = hc_bytealign (w6[2], w6[3], offset);
      c4[0] = hc_bytealign (w6[1], w6[2], offset);
      c3[3] = hc_bytealign (w6[0], w6[1], offset);
      c3[2] = hc_bytealign (w5[3], w6[0], offset);
      c3[1] = hc_bytealign (w5[2], w5[3], offset);
      c3[0] = hc_bytealign (w5[1], w5[2], offset);
      c2[3] = hc_bytealign (w5[0], w5[1], offset);
      c2[2] = hc_bytealign (w4[3], w5[0], offset);
      c2[1] = hc_bytealign (w4[2], w4[3], offset);
      c2[0] = hc_bytealign (w4[1], w4[2], offset);
      c1[3] = hc_bytealign (w4[0], w4[1], offset);
      c1[2] = hc_bytealign (w3[3], w4[0], offset);
      c1[1] = hc_bytealign (w3[2], w3[3], offset);
      c1[0] = hc_bytealign (w3[1], w3[2], offset);
      c0[3] = hc_bytealign (w3[0], w3[1], offset);
      c0[2] = hc_bytealign (w2[3], w3[0], offset);
      c0[1] = hc_bytealign (w2[2], w2[3], offset);
      c0[0] = hc_bytealign (w2[1], w2[2], offset);
      w7[3] = hc_bytealign (w2[0], w2[1], offset);
      w7[2] = hc_bytealign (w1[3], w2[0], offset);
      w7[1] = hc_bytealign (w1[2], w1[3], offset);
      w7[0] = hc_bytealign (w1[1], w1[2], offset);
      w6[3] = hc_bytealign (w1[0], w1[1], offset);
      w6[2] = hc_bytealign (w0[3], w1[0], offset);
      w6[1] = hc_bytealign (w0[2], w0[3], offset);
      w6[0] = hc_bytealign (w0[1], w0[2], offset);
      w5[3] = hc_bytealign (w0[0], w0[1], offset);
      w5[2] = hc_bytealign (    0, w0[0], offset);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      c5[3] = hc_bytealign (w7[3],     0, offset);
      c5[2] = hc_bytealign (w7[2], w7[3], offset);
      c5[1] = hc_bytealign (w7[1], w7[2], offset);
      c5[0] = hc_bytealign (w7[0], w7[1], offset);
      c4[3] = hc_bytealign (w6[3], w7[0], offset);
      c4[2] = hc_bytealign (w6[2], w6[3], offset);
      c4[1] = hc_bytealign (w6[1], w6[2], offset);
      c4[0] = hc_bytealign (w6[0], w6[1], offset);
      c3[3] = hc_bytealign (w5[3], w6[0], offset);
      c3[2] = hc_bytealign (w5[2], w5[3], offset);
      c3[1] = hc_bytealign (w5[1], w5[2], offset);
      c3[0] = hc_bytealign (w5[0], w5[1], offset);
      c2[3] = hc_bytealign (w4[3], w5[0], offset);
      c2[2] = hc_bytealign (w4[2], w4[3], offset);
      c2[1] = hc_bytealign (w4[1], w4[2], offset);
      c2[0] = hc_bytealign (w4[0], w4[1], offset);
      c1[3] = hc_bytealign (w3[3], w4[0], offset);
      c1[2] = hc_bytealign (w3[2], w3[3], offset);
      c1[1] = hc_bytealign (w3[1], w3[2], offset);
      c1[0] = hc_bytealign (w3[0], w3[1], offset);
      c0[3] = hc_bytealign (w2[3], w3[0], offset);
      c0[2] = hc_bytealign (w2[2], w2[3], offset);
      c0[1] = hc_bytealign (w2[1], w2[2], offset);
      c0[0] = hc_bytealign (w2[0], w2[1], offset);
      w7[3] = hc_bytealign (w1[3], w2[0], offset);
      w7[2] = hc_bytealign (w1[2], w1[3], offset);
      w7[1] = hc_bytealign (w1[1], w1[2], offset);
      w7[0] = hc_bytealign (w1[0], w1[1], offset);
      w6[3] = hc_bytealign (w0[3], w1[0], offset);
      w6[2] = hc_bytealign (w0[2], w0[3], offset);
      w6[1] = hc_bytealign (w0[1], w0[2], offset);
      w6[0] = hc_bytealign (w0[0], w0[1], offset);
      w5[3] = hc_bytealign (    0, w0[0], offset);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      c6[0] = hc_bytealign (w7[3],     0, offset);
      c5[3] = hc_bytealign (w7[2], w7[3], offset);
      c5[2] = hc_bytealign (w7[1], w7[2], offset);
      c5[1] = hc_bytealign (w7[0], w7[1], offset);
      c5[0] = hc_bytealign (w6[3], w7[0], offset);
      c4[3] = hc_bytealign (w6[2], w6[3], offset);
      c4[2] = hc_bytealign (w6[1], w6[2], offset);
      c4[1] = hc_bytealign (w6[0], w6[1], offset);
      c4[0] = hc_bytealign (w5[3], w6[0], offset);
      c3[3] = hc_bytealign (w5[2], w5[3], offset);
      c3[2] = hc_bytealign (w5[1], w5[2], offset);
      c3[1] = hc_bytealign (w5[0], w5[1], offset);
      c3[0] = hc_bytealign (w4[3], w5[0], offset);
      c2[3] = hc_bytealign (w4[2], w4[3], offset);
      c2[2] = hc_bytealign (w4[1], w4[2], offset);
      c2[1] = hc_bytealign (w4[0], w4[1], offset);
      c2[0] = hc_bytealign (w3[3], w4[0], offset);
      c1[3] = hc_bytealign (w3[2], w3[3], offset);
      c1[2] = hc_bytealign (w3[1], w3[2], offset);
      c1[1] = hc_bytealign (w3[0], w3[1], offset);
      c1[0] = hc_bytealign (w2[3], w3[0], offset);
      c0[3] = hc_bytealign (w2[2], w2[3], offset);
      c0[2] = hc_bytealign (w2[1], w2[2], offset);
      c0[1] = hc_bytealign (w2[0], w2[1], offset);
      c0[0] = hc_bytealign (w1[3], w2[0], offset);
      w7[3] = hc_bytealign (w1[2], w1[3], offset);
      w7[2] = hc_bytealign (w1[1], w1[2], offset);
      w7[1] = hc_bytealign (w1[0], w1[1], offset);
      w7[0] = hc_bytealign (w0[3], w1[0], offset);
      w6[3] = hc_bytealign (w0[2], w0[3], offset);
      w6[2] = hc_bytealign (w0[1], w0[2], offset);
      w6[1] = hc_bytealign (w0[0], w0[1], offset);
      w6[0] = hc_bytealign (    0, w0[0], offset);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      c6[1] = hc_bytealign (w7[3],     0, offset);
      c6[0] = hc_bytealign (w7[2], w7[3], offset);
      c5[3] = hc_bytealign (w7[1], w7[2], offset);
      c5[2] = hc_bytealign (w7[0], w7[1], offset);
      c5[1] = hc_bytealign (w6[3], w7[0], offset);
      c5[0] = hc_bytealign (w6[2], w6[3], offset);
      c4[3] = hc_bytealign (w6[1], w6[2], offset);
      c4[2] = hc_bytealign (w6[0], w6[1], offset);
      c4[1] = hc_bytealign (w5[3], w6[0], offset);
      c4[0] = hc_bytealign (w5[2], w5[3], offset);
      c3[3] = hc_bytealign (w5[1], w5[2], offset);
      c3[2] = hc_bytealign (w5[0], w5[1], offset);
      c3[1] = hc_bytealign (w4[3], w5[0], offset);
      c3[0] = hc_bytealign (w4[2], w4[3], offset);
      c2[3] = hc_bytealign (w4[1], w4[2], offset);
      c2[2] = hc_bytealign (w4[0], w4[1], offset);
      c2[1] = hc_bytealign (w3[3], w4[0], offset);
      c2[0] = hc_bytealign (w3[2], w3[3], offset);
      c1[3] = hc_bytealign (w3[1], w3[2], offset);
      c1[2] = hc_bytealign (w3[0], w3[1], offset);
      c1[1] = hc_bytealign (w2[3], w3[0], offset);
      c1[0] = hc_bytealign (w2[2], w2[3], offset);
      c0[3] = hc_bytealign (w2[1], w2[2], offset);
      c0[2] = hc_bytealign (w2[0], w2[1], offset);
      c0[1] = hc_bytealign (w1[3], w2[0], offset);
      c0[0] = hc_bytealign (w1[2], w1[3], offset);
      w7[3] = hc_bytealign (w1[1], w1[2], offset);
      w7[2] = hc_bytealign (w1[0], w1[1], offset);
      w7[1] = hc_bytealign (w0[3], w1[0], offset);
      w7[0] = hc_bytealign (w0[2], w0[3], offset);
      w6[3] = hc_bytealign (w0[1], w0[2], offset);
      w6[2] = hc_bytealign (w0[0], w0[1], offset);
      w6[1] = hc_bytealign (    0, w0[0], offset);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      c6[2] = hc_bytealign (w7[3],     0, offset);
      c6[1] = hc_bytealign (w7[2], w7[3], offset);
      c6[0] = hc_bytealign (w7[1], w7[2], offset);
      c5[3] = hc_bytealign (w7[0], w7[1], offset);
      c5[2] = hc_bytealign (w6[3], w7[0], offset);
      c5[1] = hc_bytealign (w6[2], w6[3], offset);
      c5[0] = hc_bytealign (w6[1], w6[2], offset);
      c4[3] = hc_bytealign (w6[0], w6[1], offset);
      c4[2] = hc_bytealign (w5[3], w6[0], offset);
      c4[1] = hc_bytealign (w5[2], w5[3], offset);
      c4[0] = hc_bytealign (w5[1], w5[2], offset);
      c3[3] = hc_bytealign (w5[0], w5[1], offset);
      c3[2] = hc_bytealign (w4[3], w5[0], offset);
      c3[1] = hc_bytealign (w4[2], w4[3], offset);
      c3[0] = hc_bytealign (w4[1], w4[2], offset);
      c2[3] = hc_bytealign (w4[0], w4[1], offset);
      c2[2] = hc_bytealign (w3[3], w4[0], offset);
      c2[1] = hc_bytealign (w3[2], w3[3], offset);
      c2[0] = hc_bytealign (w3[1], w3[2], offset);
      c1[3] = hc_bytealign (w3[0], w3[1], offset);
      c1[2] = hc_bytealign (w2[3], w3[0], offset);
      c1[1] = hc_bytealign (w2[2], w2[3], offset);
      c1[0] = hc_bytealign (w2[1], w2[2], offset);
      c0[3] = hc_bytealign (w2[0], w2[1], offset);
      c0[2] = hc_bytealign (w1[3], w2[0], offset);
      c0[1] = hc_bytealign (w1[2], w1[3], offset);
      c0[0] = hc_bytealign (w1[1], w1[2], offset);
      w7[3] = hc_bytealign (w1[0], w1[1], offset);
      w7[2] = hc_bytealign (w0[3], w1[0], offset);
      w7[1] = hc_bytealign (w0[2], w0[3], offset);
      w7[0] = hc_bytealign (w0[1], w0[2], offset);
      w6[3] = hc_bytealign (w0[0], w0[1], offset);
      w6[2] = hc_bytealign (    0, w0[0], offset);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      c6[3] = hc_bytealign (w7[3],     0, offset);
      c6[2] = hc_bytealign (w7[2], w7[3], offset);
      c6[1] = hc_bytealign (w7[1], w7[2], offset);
      c6[0] = hc_bytealign (w7[0], w7[1], offset);
      c5[3] = hc_bytealign (w6[3], w7[0], offset);
      c5[2] = hc_bytealign (w6[2], w6[3], offset);
      c5[1] = hc_bytealign (w6[1], w6[2], offset);
      c5[0] = hc_bytealign (w6[0], w6[1], offset);
      c4[3] = hc_bytealign (w5[3], w6[0], offset);
      c4[2] = hc_bytealign (w5[2], w5[3], offset);
      c4[1] = hc_bytealign (w5[1], w5[2], offset);
      c4[0] = hc_bytealign (w5[0], w5[1], offset);
      c3[3] = hc_bytealign (w4[3], w5[0], offset);
      c3[2] = hc_bytealign (w4[2], w4[3], offset);
      c3[1] = hc_bytealign (w4[1], w4[2], offset);
      c3[0] = hc_bytealign (w4[0], w4[1], offset);
      c2[3] = hc_bytealign (w3[3], w4[0], offset);
      c2[2] = hc_bytealign (w3[2], w3[3], offset);
      c2[1] = hc_bytealign (w3[1], w3[2], offset);
      c2[0] = hc_bytealign (w3[0], w3[1], offset);
      c1[3] = hc_bytealign (w2[3], w3[0], offset);
      c1[2] = hc_bytealign (w2[2], w2[3], offset);
      c1[1] = hc_bytealign (w2[1], w2[2], offset);
      c1[0] = hc_bytealign (w2[0], w2[1], offset);
      c0[3] = hc_bytealign (w1[3], w2[0], offset);
      c0[2] = hc_bytealign (w1[2], w1[3], offset);
      c0[1] = hc_bytealign (w1[1], w1[2], offset);
      c0[0] = hc_bytealign (w1[0], w1[1], offset);
      w7[3] = hc_bytealign (w0[3], w1[0], offset);
      w7[2] = hc_bytealign (w0[2], w0[3], offset);
      w7[1] = hc_bytealign (w0[1], w0[2], offset);
      w7[0] = hc_bytealign (w0[0], w0[1], offset);
      w6[3] = hc_bytealign (    0, w0[0], offset);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      c7[0] = hc_bytealign (w7[3],     0, offset);
      c6[3] = hc_bytealign (w7[2], w7[3], offset);
      c6[2] = hc_bytealign (w7[1], w7[2], offset);
      c6[1] = hc_bytealign (w7[0], w7[1], offset);
      c6[0] = hc_bytealign (w6[3], w7[0], offset);
      c5[3] = hc_bytealign (w6[2], w6[3], offset);
      c5[2] = hc_bytealign (w6[1], w6[2], offset);
      c5[1] = hc_bytealign (w6[0], w6[1], offset);
      c5[0] = hc_bytealign (w5[3], w6[0], offset);
      c4[3] = hc_bytealign (w5[2], w5[3], offset);
      c4[2] = hc_bytealign (w5[1], w5[2], offset);
      c4[1] = hc_bytealign (w5[0], w5[1], offset);
      c4[0] = hc_bytealign (w4[3], w5[0], offset);
      c3[3] = hc_bytealign (w4[2], w4[3], offset);
      c3[2] = hc_bytealign (w4[1], w4[2], offset);
      c3[1] = hc_bytealign (w4[0], w4[1], offset);
      c3[0] = hc_bytealign (w3[3], w4[0], offset);
      c2[3] = hc_bytealign (w3[2], w3[3], offset);
      c2[2] = hc_bytealign (w3[1], w3[2], offset);
      c2[1] = hc_bytealign (w3[0], w3[1], offset);
      c2[0] = hc_bytealign (w2[3], w3[0], offset);
      c1[3] = hc_bytealign (w2[2], w2[3], offset);
      c1[2] = hc_bytealign (w2[1], w2[2], offset);
      c1[1] = hc_bytealign (w2[0], w2[1], offset);
      c1[0] = hc_bytealign (w1[3], w2[0], offset);
      c0[3] = hc_bytealign (w1[2], w1[3], offset);
      c0[2] = hc_bytealign (w1[1], w1[2], offset);
      c0[1] = hc_bytealign (w1[0], w1[1], offset);
      c0[0] = hc_bytealign (w0[3], w1[0], offset);
      w7[3] = hc_bytealign (w0[2], w0[3], offset);
      w7[2] = hc_bytealign (w0[1], w0[2], offset);
      w7[1] = hc_bytealign (w0[0], w0[1], offset);
      w7[0] = hc_bytealign (    0, w0[0], offset);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      c7[1] = hc_bytealign (w7[3],     0, offset);
      c7[0] = hc_bytealign (w7[2], w7[3], offset);
      c6[3] = hc_bytealign (w7[1], w7[2], offset);
      c6[2] = hc_bytealign (w7[0], w7[1], offset);
      c6[1] = hc_bytealign (w6[3], w7[0], offset);
      c6[0] = hc_bytealign (w6[2], w6[3], offset);
      c5[3] = hc_bytealign (w6[1], w6[2], offset);
      c5[2] = hc_bytealign (w6[0], w6[1], offset);
      c5[1] = hc_bytealign (w5[3], w6[0], offset);
      c5[0] = hc_bytealign (w5[2], w5[3], offset);
      c4[3] = hc_bytealign (w5[1], w5[2], offset);
      c4[2] = hc_bytealign (w5[0], w5[1], offset);
      c4[1] = hc_bytealign (w4[3], w5[0], offset);
      c4[0] = hc_bytealign (w4[2], w4[3], offset);
      c3[3] = hc_bytealign (w4[1], w4[2], offset);
      c3[2] = hc_bytealign (w4[0], w4[1], offset);
      c3[1] = hc_bytealign (w3[3], w4[0], offset);
      c3[0] = hc_bytealign (w3[2], w3[3], offset);
      c2[3] = hc_bytealign (w3[1], w3[2], offset);
      c2[2] = hc_bytealign (w3[0], w3[1], offset);
      c2[1] = hc_bytealign (w2[3], w3[0], offset);
      c2[0] = hc_bytealign (w2[2], w2[3], offset);
      c1[3] = hc_bytealign (w2[1], w2[2], offset);
      c1[2] = hc_bytealign (w2[0], w2[1], offset);
      c1[1] = hc_bytealign (w1[3], w2[0], offset);
      c1[0] = hc_bytealign (w1[2], w1[3], offset);
      c0[3] = hc_bytealign (w1[1], w1[2], offset);
      c0[2] = hc_bytealign (w1[0], w1[1], offset);
      c0[1] = hc_bytealign (w0[3], w1[0], offset);
      c0[0] = hc_bytealign (w0[2], w0[3], offset);
      w7[3] = hc_bytealign (w0[1], w0[2], offset);
      w7[2] = hc_bytealign (w0[0], w0[1], offset);
      w7[1] = hc_bytealign (    0, w0[0], offset);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      c7[2] = hc_bytealign (w7[3],     0, offset);
      c7[1] = hc_bytealign (w7[2], w7[3], offset);
      c7[0] = hc_bytealign (w7[1], w7[2], offset);
      c6[3] = hc_bytealign (w7[0], w7[1], offset);
      c6[2] = hc_bytealign (w6[3], w7[0], offset);
      c6[1] = hc_bytealign (w6[2], w6[3], offset);
      c6[0] = hc_bytealign (w6[1], w6[2], offset);
      c5[3] = hc_bytealign (w6[0], w6[1], offset);
      c5[2] = hc_bytealign (w5[3], w6[0], offset);
      c5[1] = hc_bytealign (w5[2], w5[3], offset);
      c5[0] = hc_bytealign (w5[1], w5[2], offset);
      c4[3] = hc_bytealign (w5[0], w5[1], offset);
      c4[2] = hc_bytealign (w4[3], w5[0], offset);
      c4[1] = hc_bytealign (w4[2], w4[3], offset);
      c4[0] = hc_bytealign (w4[1], w4[2], offset);
      c3[3] = hc_bytealign (w4[0], w4[1], offset);
      c3[2] = hc_bytealign (w3[3], w4[0], offset);
      c3[1] = hc_bytealign (w3[2], w3[3], offset);
      c3[0] = hc_bytealign (w3[1], w3[2], offset);
      c2[3] = hc_bytealign (w3[0], w3[1], offset);
      c2[2] = hc_bytealign (w2[3], w3[0], offset);
      c2[1] = hc_bytealign (w2[2], w2[3], offset);
      c2[0] = hc_bytealign (w2[1], w2[2], offset);
      c1[3] = hc_bytealign (w2[0], w2[1], offset);
      c1[2] = hc_bytealign (w1[3], w2[0], offset);
      c1[1] = hc_bytealign (w1[2], w1[3], offset);
      c1[0] = hc_bytealign (w1[1], w1[2], offset);
      c0[3] = hc_bytealign (w1[0], w1[1], offset);
      c0[2] = hc_bytealign (w0[3], w1[0], offset);
      c0[1] = hc_bytealign (w0[2], w0[3], offset);
      c0[0] = hc_bytealign (w0[1], w0[2], offset);
      w7[3] = hc_bytealign (w0[0], w0[1], offset);
      w7[2] = hc_bytealign (    0, w0[0], offset);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      c7[3] = hc_bytealign (w7[3],     0, offset);
      c7[2] = hc_bytealign (w7[2], w7[3], offset);
      c7[1] = hc_bytealign (w7[1], w7[2], offset);
      c7[0] = hc_bytealign (w7[0], w7[1], offset);
      c6[3] = hc_bytealign (w6[3], w7[0], offset);
      c6[2] = hc_bytealign (w6[2], w6[3], offset);
      c6[1] = hc_bytealign (w6[1], w6[2], offset);
      c6[0] = hc_bytealign (w6[0], w6[1], offset);
      c5[3] = hc_bytealign (w5[3], w6[0], offset);
      c5[2] = hc_bytealign (w5[2], w5[3], offset);
      c5[1] = hc_bytealign (w5[1], w5[2], offset);
      c5[0] = hc_bytealign (w5[0], w5[1], offset);
      c4[3] = hc_bytealign (w4[3], w5[0], offset);
      c4[2] = hc_bytealign (w4[2], w4[3], offset);
      c4[1] = hc_bytealign (w4[1], w4[2], offset);
      c4[0] = hc_bytealign (w4[0], w4[1], offset);
      c3[3] = hc_bytealign (w3[3], w4[0], offset);
      c3[2] = hc_bytealign (w3[2], w3[3], offset);
      c3[1] = hc_bytealign (w3[1], w3[2], offset);
      c3[0] = hc_bytealign (w3[0], w3[1], offset);
      c2[3] = hc_bytealign (w2[3], w3[0], offset);
      c2[2] = hc_bytealign (w2[2], w2[3], offset);
      c2[1] = hc_bytealign (w2[1], w2[2], offset);
      c2[0] = hc_bytealign (w2[0], w2[1], offset);
      c1[3] = hc_bytealign (w1[3], w2[0], offset);
      c1[2] = hc_bytealign (w1[2], w1[3], offset);
      c1[1] = hc_bytealign (w1[1], w1[2], offset);
      c1[0] = hc_bytealign (w1[0], w1[1], offset);
      c0[3] = hc_bytealign (w0[3], w1[0], offset);
      c0[2] = hc_bytealign (w0[2], w0[3], offset);
      c0[1] = hc_bytealign (w0[1], w0[2], offset);
      c0[0] = hc_bytealign (w0[0], w0[1], offset);
      w7[3] = hc_bytealign (    0, w0[0], offset);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV

  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  #if defined IS_NV
  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
  #endif

  #if (defined IS_AMD || defined IS_HIP)
  const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
  #endif

  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_byte_perm (w7[3],     0, selector);
      w7[3] = hc_byte_perm (w7[2], w7[3], selector);
      w7[2] = hc_byte_perm (w7[1], w7[2], selector);
      w7[1] = hc_byte_perm (w7[0], w7[1], selector);
      w7[0] = hc_byte_perm (w6[3], w7[0], selector);
      w6[3] = hc_byte_perm (w6[2], w6[3], selector);
      w6[2] = hc_byte_perm (w6[1], w6[2], selector);
      w6[1] = hc_byte_perm (w6[0], w6[1], selector);
      w6[0] = hc_byte_perm (w5[3], w6[0], selector);
      w5[3] = hc_byte_perm (w5[2], w5[3], selector);
      w5[2] = hc_byte_perm (w5[1], w5[2], selector);
      w5[1] = hc_byte_perm (w5[0], w5[1], selector);
      w5[0] = hc_byte_perm (w4[3], w5[0], selector);
      w4[3] = hc_byte_perm (w4[2], w4[3], selector);
      w4[2] = hc_byte_perm (w4[1], w4[2], selector);
      w4[1] = hc_byte_perm (w4[0], w4[1], selector);
      w4[0] = hc_byte_perm (w3[3], w4[0], selector);
      w3[3] = hc_byte_perm (w3[2], w3[3], selector);
      w3[2] = hc_byte_perm (w3[1], w3[2], selector);
      w3[1] = hc_byte_perm (w3[0], w3[1], selector);
      w3[0] = hc_byte_perm (w2[3], w3[0], selector);
      w2[3] = hc_byte_perm (w2[2], w2[3], selector);
      w2[2] = hc_byte_perm (w2[1], w2[2], selector);
      w2[1] = hc_byte_perm (w2[0], w2[1], selector);
      w2[0] = hc_byte_perm (w1[3], w2[0], selector);
      w1[3] = hc_byte_perm (w1[2], w1[3], selector);
      w1[2] = hc_byte_perm (w1[1], w1[2], selector);
      w1[1] = hc_byte_perm (w1[0], w1[1], selector);
      w1[0] = hc_byte_perm (w0[3], w1[0], selector);
      w0[3] = hc_byte_perm (w0[2], w0[3], selector);
      w0[2] = hc_byte_perm (w0[1], w0[2], selector);
      w0[1] = hc_byte_perm (w0[0], w0[1], selector);
      w0[0] = hc_byte_perm (    0, w0[0], selector);

      break;

    case  1:
      c0[1] = hc_byte_perm (w7[3],     0, selector);
      c0[0] = hc_byte_perm (w7[2], w7[3], selector);
      w7[3] = hc_byte_perm (w7[1], w7[2], selector);
      w7[2] = hc_byte_perm (w7[0], w7[1], selector);
      w7[1] = hc_byte_perm (w6[3], w7[0], selector);
      w7[0] = hc_byte_perm (w6[2], w6[3], selector);
      w6[3] = hc_byte_perm (w6[1], w6[2], selector);
      w6[2] = hc_byte_perm (w6[0], w6[1], selector);
      w6[1] = hc_byte_perm (w5[3], w6[0], selector);
      w6[0] = hc_byte_perm (w5[2], w5[3], selector);
      w5[3] = hc_byte_perm (w5[1], w5[2], selector);
      w5[2] = hc_byte_perm (w5[0], w5[1], selector);
      w5[1] = hc_byte_perm (w4[3], w5[0], selector);
      w5[0] = hc_byte_perm (w4[2], w4[3], selector);
      w4[3] = hc_byte_perm (w4[1], w4[2], selector);
      w4[2] = hc_byte_perm (w4[0], w4[1], selector);
      w4[1] = hc_byte_perm (w3[3], w4[0], selector);
      w4[0] = hc_byte_perm (w3[2], w3[3], selector);
      w3[3] = hc_byte_perm (w3[1], w3[2], selector);
      w3[2] = hc_byte_perm (w3[0], w3[1], selector);
      w3[1] = hc_byte_perm (w2[3], w3[0], selector);
      w3[0] = hc_byte_perm (w2[2], w2[3], selector);
      w2[3] = hc_byte_perm (w2[1], w2[2], selector);
      w2[2] = hc_byte_perm (w2[0], w2[1], selector);
      w2[1] = hc_byte_perm (w1[3], w2[0], selector);
      w2[0] = hc_byte_perm (w1[2], w1[3], selector);
      w1[3] = hc_byte_perm (w1[1], w1[2], selector);
      w1[2] = hc_byte_perm (w1[0], w1[1], selector);
      w1[1] = hc_byte_perm (w0[3], w1[0], selector);
      w1[0] = hc_byte_perm (w0[2], w0[3], selector);
      w0[3] = hc_byte_perm (w0[1], w0[2], selector);
      w0[2] = hc_byte_perm (w0[0], w0[1], selector);
      w0[1] = hc_byte_perm (    0, w0[0], selector);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_byte_perm (w7[3],     0, selector);
      c0[1] = hc_byte_perm (w7[2], w7[3], selector);
      c0[0] = hc_byte_perm (w7[1], w7[2], selector);
      w7[3] = hc_byte_perm (w7[0], w7[1], selector);
      w7[2] = hc_byte_perm (w6[3], w7[0], selector);
      w7[1] = hc_byte_perm (w6[2], w6[3], selector);
      w7[0] = hc_byte_perm (w6[1], w6[2], selector);
      w6[3] = hc_byte_perm (w6[0], w6[1], selector);
      w6[2] = hc_byte_perm (w5[3], w6[0], selector);
      w6[1] = hc_byte_perm (w5[2], w5[3], selector);
      w6[0] = hc_byte_perm (w5[1], w5[2], selector);
      w5[3] = hc_byte_perm (w5[0], w5[1], selector);
      w5[2] = hc_byte_perm (w4[3], w5[0], selector);
      w5[1] = hc_byte_perm (w4[2], w4[3], selector);
      w5[0] = hc_byte_perm (w4[1], w4[2], selector);
      w4[3] = hc_byte_perm (w4[0], w4[1], selector);
      w4[2] = hc_byte_perm (w3[3], w4[0], selector);
      w4[1] = hc_byte_perm (w3[2], w3[3], selector);
      w4[0] = hc_byte_perm (w3[1], w3[2], selector);
      w3[3] = hc_byte_perm (w3[0], w3[1], selector);
      w3[2] = hc_byte_perm (w2[3], w3[0], selector);
      w3[1] = hc_byte_perm (w2[2], w2[3], selector);
      w3[0] = hc_byte_perm (w2[1], w2[2], selector);
      w2[3] = hc_byte_perm (w2[0], w2[1], selector);
      w2[2] = hc_byte_perm (w1[3], w2[0], selector);
      w2[1] = hc_byte_perm (w1[2], w1[3], selector);
      w2[0] = hc_byte_perm (w1[1], w1[2], selector);
      w1[3] = hc_byte_perm (w1[0], w1[1], selector);
      w1[2] = hc_byte_perm (w0[3], w1[0], selector);
      w1[1] = hc_byte_perm (w0[2], w0[3], selector);
      w1[0] = hc_byte_perm (w0[1], w0[2], selector);
      w0[3] = hc_byte_perm (w0[0], w0[1], selector);
      w0[2] = hc_byte_perm (    0, w0[0], selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_byte_perm (w7[3],     0, selector);
      c0[2] = hc_byte_perm (w7[2], w7[3], selector);
      c0[1] = hc_byte_perm (w7[1], w7[2], selector);
      c0[0] = hc_byte_perm (w7[0], w7[1], selector);
      w7[3] = hc_byte_perm (w6[3], w7[0], selector);
      w7[2] = hc_byte_perm (w6[2], w6[3], selector);
      w7[1] = hc_byte_perm (w6[1], w6[2], selector);
      w7[0] = hc_byte_perm (w6[0], w6[1], selector);
      w6[3] = hc_byte_perm (w5[3], w6[0], selector);
      w6[2] = hc_byte_perm (w5[2], w5[3], selector);
      w6[1] = hc_byte_perm (w5[1], w5[2], selector);
      w6[0] = hc_byte_perm (w5[0], w5[1], selector);
      w5[3] = hc_byte_perm (w4[3], w5[0], selector);
      w5[2] = hc_byte_perm (w4[2], w4[3], selector);
      w5[1] = hc_byte_perm (w4[1], w4[2], selector);
      w5[0] = hc_byte_perm (w4[0], w4[1], selector);
      w4[3] = hc_byte_perm (w3[3], w4[0], selector);
      w4[2] = hc_byte_perm (w3[2], w3[3], selector);
      w4[1] = hc_byte_perm (w3[1], w3[2], selector);
      w4[0] = hc_byte_perm (w3[0], w3[1], selector);
      w3[3] = hc_byte_perm (w2[3], w3[0], selector);
      w3[2] = hc_byte_perm (w2[2], w2[3], selector);
      w3[1] = hc_byte_perm (w2[1], w2[2], selector);
      w3[0] = hc_byte_perm (w2[0], w2[1], selector);
      w2[3] = hc_byte_perm (w1[3], w2[0], selector);
      w2[2] = hc_byte_perm (w1[2], w1[3], selector);
      w2[1] = hc_byte_perm (w1[1], w1[2], selector);
      w2[0] = hc_byte_perm (w1[0], w1[1], selector);
      w1[3] = hc_byte_perm (w0[3], w1[0], selector);
      w1[2] = hc_byte_perm (w0[2], w0[3], selector);
      w1[1] = hc_byte_perm (w0[1], w0[2], selector);
      w1[0] = hc_byte_perm (w0[0], w0[1], selector);
      w0[3] = hc_byte_perm (    0, w0[0], selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_byte_perm (w7[3],     0, selector);
      c0[3] = hc_byte_perm (w7[2], w7[3], selector);
      c0[2] = hc_byte_perm (w7[1], w7[2], selector);
      c0[1] = hc_byte_perm (w7[0], w7[1], selector);
      c0[0] = hc_byte_perm (w6[3], w7[0], selector);
      w7[3] = hc_byte_perm (w6[2], w6[3], selector);
      w7[2] = hc_byte_perm (w6[1], w6[2], selector);
      w7[1] = hc_byte_perm (w6[0], w6[1], selector);
      w7[0] = hc_byte_perm (w5[3], w6[0], selector);
      w6[3] = hc_byte_perm (w5[2], w5[3], selector);
      w6[2] = hc_byte_perm (w5[1], w5[2], selector);
      w6[1] = hc_byte_perm (w5[0], w5[1], selector);
      w6[0] = hc_byte_perm (w4[3], w5[0], selector);
      w5[3] = hc_byte_perm (w4[2], w4[3], selector);
      w5[2] = hc_byte_perm (w4[1], w4[2], selector);
      w5[1] = hc_byte_perm (w4[0], w4[1], selector);
      w5[0] = hc_byte_perm (w3[3], w4[0], selector);
      w4[3] = hc_byte_perm (w3[2], w3[3], selector);
      w4[2] = hc_byte_perm (w3[1], w3[2], selector);
      w4[1] = hc_byte_perm (w3[0], w3[1], selector);
      w4[0] = hc_byte_perm (w2[3], w3[0], selector);
      w3[3] = hc_byte_perm (w2[2], w2[3], selector);
      w3[2] = hc_byte_perm (w2[1], w2[2], selector);
      w3[1] = hc_byte_perm (w2[0], w2[1], selector);
      w3[0] = hc_byte_perm (w1[3], w2[0], selector);
      w2[3] = hc_byte_perm (w1[2], w1[3], selector);
      w2[2] = hc_byte_perm (w1[1], w1[2], selector);
      w2[1] = hc_byte_perm (w1[0], w1[1], selector);
      w2[0] = hc_byte_perm (w0[3], w1[0], selector);
      w1[3] = hc_byte_perm (w0[2], w0[3], selector);
      w1[2] = hc_byte_perm (w0[1], w0[2], selector);
      w1[1] = hc_byte_perm (w0[0], w0[1], selector);
      w1[0] = hc_byte_perm (    0, w0[0], selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_byte_perm (w7[3],     0, selector);
      c1[0] = hc_byte_perm (w7[2], w7[3], selector);
      c0[3] = hc_byte_perm (w7[1], w7[2], selector);
      c0[2] = hc_byte_perm (w7[0], w7[1], selector);
      c0[1] = hc_byte_perm (w6[3], w7[0], selector);
      c0[0] = hc_byte_perm (w6[2], w6[3], selector);
      w7[3] = hc_byte_perm (w6[1], w6[2], selector);
      w7[2] = hc_byte_perm (w6[0], w6[1], selector);
      w7[1] = hc_byte_perm (w5[3], w6[0], selector);
      w7[0] = hc_byte_perm (w5[2], w5[3], selector);
      w6[3] = hc_byte_perm (w5[1], w5[2], selector);
      w6[2] = hc_byte_perm (w5[0], w5[1], selector);
      w6[1] = hc_byte_perm (w4[3], w5[0], selector);
      w6[0] = hc_byte_perm (w4[2], w4[3], selector);
      w5[3] = hc_byte_perm (w4[1], w4[2], selector);
      w5[2] = hc_byte_perm (w4[0], w4[1], selector);
      w5[1] = hc_byte_perm (w3[3], w4[0], selector);
      w5[0] = hc_byte_perm (w3[2], w3[3], selector);
      w4[3] = hc_byte_perm (w3[1], w3[2], selector);
      w4[2] = hc_byte_perm (w3[0], w3[1], selector);
      w4[1] = hc_byte_perm (w2[3], w3[0], selector);
      w4[0] = hc_byte_perm (w2[2], w2[3], selector);
      w3[3] = hc_byte_perm (w2[1], w2[2], selector);
      w3[2] = hc_byte_perm (w2[0], w2[1], selector);
      w3[1] = hc_byte_perm (w1[3], w2[0], selector);
      w3[0] = hc_byte_perm (w1[2], w1[3], selector);
      w2[3] = hc_byte_perm (w1[1], w1[2], selector);
      w2[2] = hc_byte_perm (w1[0], w1[1], selector);
      w2[1] = hc_byte_perm (w0[3], w1[0], selector);
      w2[0] = hc_byte_perm (w0[2], w0[3], selector);
      w1[3] = hc_byte_perm (w0[1], w0[2], selector);
      w1[2] = hc_byte_perm (w0[0], w0[1], selector);
      w1[1] = hc_byte_perm (    0, w0[0], selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_byte_perm (w7[3],     0, selector);
      c1[1] = hc_byte_perm (w7[2], w7[3], selector);
      c1[0] = hc_byte_perm (w7[1], w7[2], selector);
      c0[3] = hc_byte_perm (w7[0], w7[1], selector);
      c0[2] = hc_byte_perm (w6[3], w7[0], selector);
      c0[1] = hc_byte_perm (w6[2], w6[3], selector);
      c0[0] = hc_byte_perm (w6[1], w6[2], selector);
      w7[3] = hc_byte_perm (w6[0], w6[1], selector);
      w7[2] = hc_byte_perm (w5[3], w6[0], selector);
      w7[1] = hc_byte_perm (w5[2], w5[3], selector);
      w7[0] = hc_byte_perm (w5[1], w5[2], selector);
      w6[3] = hc_byte_perm (w5[0], w5[1], selector);
      w6[2] = hc_byte_perm (w4[3], w5[0], selector);
      w6[1] = hc_byte_perm (w4[2], w4[3], selector);
      w6[0] = hc_byte_perm (w4[1], w4[2], selector);
      w5[3] = hc_byte_perm (w4[0], w4[1], selector);
      w5[2] = hc_byte_perm (w3[3], w4[0], selector);
      w5[1] = hc_byte_perm (w3[2], w3[3], selector);
      w5[0] = hc_byte_perm (w3[1], w3[2], selector);
      w4[3] = hc_byte_perm (w3[0], w3[1], selector);
      w4[2] = hc_byte_perm (w2[3], w3[0], selector);
      w4[1] = hc_byte_perm (w2[2], w2[3], selector);
      w4[0] = hc_byte_perm (w2[1], w2[2], selector);
      w3[3] = hc_byte_perm (w2[0], w2[1], selector);
      w3[2] = hc_byte_perm (w1[3], w2[0], selector);
      w3[1] = hc_byte_perm (w1[2], w1[3], selector);
      w3[0] = hc_byte_perm (w1[1], w1[2], selector);
      w2[3] = hc_byte_perm (w1[0], w1[1], selector);
      w2[2] = hc_byte_perm (w0[3], w1[0], selector);
      w2[1] = hc_byte_perm (w0[2], w0[3], selector);
      w2[0] = hc_byte_perm (w0[1], w0[2], selector);
      w1[3] = hc_byte_perm (w0[0], w0[1], selector);
      w1[2] = hc_byte_perm (    0, w0[0], selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_byte_perm (w7[3],     0, selector);
      c1[2] = hc_byte_perm (w7[2], w7[3], selector);
      c1[1] = hc_byte_perm (w7[1], w7[2], selector);
      c1[0] = hc_byte_perm (w7[0], w7[1], selector);
      c0[3] = hc_byte_perm (w6[3], w7[0], selector);
      c0[2] = hc_byte_perm (w6[2], w6[3], selector);
      c0[1] = hc_byte_perm (w6[1], w6[2], selector);
      c0[0] = hc_byte_perm (w6[0], w6[1], selector);
      w7[3] = hc_byte_perm (w5[3], w6[0], selector);
      w7[2] = hc_byte_perm (w5[2], w5[3], selector);
      w7[1] = hc_byte_perm (w5[1], w5[2], selector);
      w7[0] = hc_byte_perm (w5[0], w5[1], selector);
      w6[3] = hc_byte_perm (w4[3], w5[0], selector);
      w6[2] = hc_byte_perm (w4[2], w4[3], selector);
      w6[1] = hc_byte_perm (w4[1], w4[2], selector);
      w6[0] = hc_byte_perm (w4[0], w4[1], selector);
      w5[3] = hc_byte_perm (w3[3], w4[0], selector);
      w5[2] = hc_byte_perm (w3[2], w3[3], selector);
      w5[1] = hc_byte_perm (w3[1], w3[2], selector);
      w5[0] = hc_byte_perm (w3[0], w3[1], selector);
      w4[3] = hc_byte_perm (w2[3], w3[0], selector);
      w4[2] = hc_byte_perm (w2[2], w2[3], selector);
      w4[1] = hc_byte_perm (w2[1], w2[2], selector);
      w4[0] = hc_byte_perm (w2[0], w2[1], selector);
      w3[3] = hc_byte_perm (w1[3], w2[0], selector);
      w3[2] = hc_byte_perm (w1[2], w1[3], selector);
      w3[1] = hc_byte_perm (w1[1], w1[2], selector);
      w3[0] = hc_byte_perm (w1[0], w1[1], selector);
      w2[3] = hc_byte_perm (w0[3], w1[0], selector);
      w2[2] = hc_byte_perm (w0[2], w0[3], selector);
      w2[1] = hc_byte_perm (w0[1], w0[2], selector);
      w2[0] = hc_byte_perm (w0[0], w0[1], selector);
      w1[3] = hc_byte_perm (    0, w0[0], selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_byte_perm (w7[3],     0, selector);
      c1[3] = hc_byte_perm (w7[2], w7[3], selector);
      c1[2] = hc_byte_perm (w7[1], w7[2], selector);
      c1[1] = hc_byte_perm (w7[0], w7[1], selector);
      c1[0] = hc_byte_perm (w6[3], w7[0], selector);
      c0[3] = hc_byte_perm (w6[2], w6[3], selector);
      c0[2] = hc_byte_perm (w6[1], w6[2], selector);
      c0[1] = hc_byte_perm (w6[0], w6[1], selector);
      c0[0] = hc_byte_perm (w5[3], w6[0], selector);
      w7[3] = hc_byte_perm (w5[2], w5[3], selector);
      w7[2] = hc_byte_perm (w5[1], w5[2], selector);
      w7[1] = hc_byte_perm (w5[0], w5[1], selector);
      w7[0] = hc_byte_perm (w4[3], w5[0], selector);
      w6[3] = hc_byte_perm (w4[2], w4[3], selector);
      w6[2] = hc_byte_perm (w4[1], w4[2], selector);
      w6[1] = hc_byte_perm (w4[0], w4[1], selector);
      w6[0] = hc_byte_perm (w3[3], w4[0], selector);
      w5[3] = hc_byte_perm (w3[2], w3[3], selector);
      w5[2] = hc_byte_perm (w3[1], w3[2], selector);
      w5[1] = hc_byte_perm (w3[0], w3[1], selector);
      w5[0] = hc_byte_perm (w2[3], w3[0], selector);
      w4[3] = hc_byte_perm (w2[2], w2[3], selector);
      w4[2] = hc_byte_perm (w2[1], w2[2], selector);
      w4[1] = hc_byte_perm (w2[0], w2[1], selector);
      w4[0] = hc_byte_perm (w1[3], w2[0], selector);
      w3[3] = hc_byte_perm (w1[2], w1[3], selector);
      w3[2] = hc_byte_perm (w1[1], w1[2], selector);
      w3[1] = hc_byte_perm (w1[0], w1[1], selector);
      w3[0] = hc_byte_perm (w0[3], w1[0], selector);
      w2[3] = hc_byte_perm (w0[2], w0[3], selector);
      w2[2] = hc_byte_perm (w0[1], w0[2], selector);
      w2[1] = hc_byte_perm (w0[0], w0[1], selector);
      w2[0] = hc_byte_perm (    0, w0[0], selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_byte_perm (w7[3],     0, selector);
      c2[0] = hc_byte_perm (w7[2], w7[3], selector);
      c1[3] = hc_byte_perm (w7[1], w7[2], selector);
      c1[2] = hc_byte_perm (w7[0], w7[1], selector);
      c1[1] = hc_byte_perm (w6[3], w7[0], selector);
      c1[0] = hc_byte_perm (w6[2], w6[3], selector);
      c0[3] = hc_byte_perm (w6[1], w6[2], selector);
      c0[2] = hc_byte_perm (w6[0], w6[1], selector);
      c0[1] = hc_byte_perm (w5[3], w6[0], selector);
      c0[0] = hc_byte_perm (w5[2], w5[3], selector);
      w7[3] = hc_byte_perm (w5[1], w5[2], selector);
      w7[2] = hc_byte_perm (w5[0], w5[1], selector);
      w7[1] = hc_byte_perm (w4[3], w5[0], selector);
      w7[0] = hc_byte_perm (w4[2], w4[3], selector);
      w6[3] = hc_byte_perm (w4[1], w4[2], selector);
      w6[2] = hc_byte_perm (w4[0], w4[1], selector);
      w6[1] = hc_byte_perm (w3[3], w4[0], selector);
      w6[0] = hc_byte_perm (w3[2], w3[3], selector);
      w5[3] = hc_byte_perm (w3[1], w3[2], selector);
      w5[2] = hc_byte_perm (w3[0], w3[1], selector);
      w5[1] = hc_byte_perm (w2[3], w3[0], selector);
      w5[0] = hc_byte_perm (w2[2], w2[3], selector);
      w4[3] = hc_byte_perm (w2[1], w2[2], selector);
      w4[2] = hc_byte_perm (w2[0], w2[1], selector);
      w4[1] = hc_byte_perm (w1[3], w2[0], selector);
      w4[0] = hc_byte_perm (w1[2], w1[3], selector);
      w3[3] = hc_byte_perm (w1[1], w1[2], selector);
      w3[2] = hc_byte_perm (w1[0], w1[1], selector);
      w3[1] = hc_byte_perm (w0[3], w1[0], selector);
      w3[0] = hc_byte_perm (w0[2], w0[3], selector);
      w2[3] = hc_byte_perm (w0[1], w0[2], selector);
      w2[2] = hc_byte_perm (w0[0], w0[1], selector);
      w2[1] = hc_byte_perm (    0, w0[0], selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_byte_perm (w7[3],     0, selector);
      c2[1] = hc_byte_perm (w7[2], w7[3], selector);
      c2[0] = hc_byte_perm (w7[1], w7[2], selector);
      c1[3] = hc_byte_perm (w7[0], w7[1], selector);
      c1[2] = hc_byte_perm (w6[3], w7[0], selector);
      c1[1] = hc_byte_perm (w6[2], w6[3], selector);
      c1[0] = hc_byte_perm (w6[1], w6[2], selector);
      c0[3] = hc_byte_perm (w6[0], w6[1], selector);
      c0[2] = hc_byte_perm (w5[3], w6[0], selector);
      c0[1] = hc_byte_perm (w5[2], w5[3], selector);
      c0[0] = hc_byte_perm (w5[1], w5[2], selector);
      w7[3] = hc_byte_perm (w5[0], w5[1], selector);
      w7[2] = hc_byte_perm (w4[3], w5[0], selector);
      w7[1] = hc_byte_perm (w4[2], w4[3], selector);
      w7[0] = hc_byte_perm (w4[1], w4[2], selector);
      w6[3] = hc_byte_perm (w4[0], w4[1], selector);
      w6[2] = hc_byte_perm (w3[3], w4[0], selector);
      w6[1] = hc_byte_perm (w3[2], w3[3], selector);
      w6[0] = hc_byte_perm (w3[1], w3[2], selector);
      w5[3] = hc_byte_perm (w3[0], w3[1], selector);
      w5[2] = hc_byte_perm (w2[3], w3[0], selector);
      w5[1] = hc_byte_perm (w2[2], w2[3], selector);
      w5[0] = hc_byte_perm (w2[1], w2[2], selector);
      w4[3] = hc_byte_perm (w2[0], w2[1], selector);
      w4[2] = hc_byte_perm (w1[3], w2[0], selector);
      w4[1] = hc_byte_perm (w1[2], w1[3], selector);
      w4[0] = hc_byte_perm (w1[1], w1[2], selector);
      w3[3] = hc_byte_perm (w1[0], w1[1], selector);
      w3[2] = hc_byte_perm (w0[3], w1[0], selector);
      w3[1] = hc_byte_perm (w0[2], w0[3], selector);
      w3[0] = hc_byte_perm (w0[1], w0[2], selector);
      w2[3] = hc_byte_perm (w0[0], w0[1], selector);
      w2[2] = hc_byte_perm (    0, w0[0], selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_byte_perm (w7[3],     0, selector);
      c2[2] = hc_byte_perm (w7[2], w7[3], selector);
      c2[1] = hc_byte_perm (w7[1], w7[2], selector);
      c2[0] = hc_byte_perm (w7[0], w7[1], selector);
      c1[3] = hc_byte_perm (w6[3], w7[0], selector);
      c1[2] = hc_byte_perm (w6[2], w6[3], selector);
      c1[1] = hc_byte_perm (w6[1], w6[2], selector);
      c1[0] = hc_byte_perm (w6[0], w6[1], selector);
      c0[3] = hc_byte_perm (w5[3], w6[0], selector);
      c0[2] = hc_byte_perm (w5[2], w5[3], selector);
      c0[1] = hc_byte_perm (w5[1], w5[2], selector);
      c0[0] = hc_byte_perm (w5[0], w5[1], selector);
      w7[3] = hc_byte_perm (w4[3], w5[0], selector);
      w7[2] = hc_byte_perm (w4[2], w4[3], selector);
      w7[1] = hc_byte_perm (w4[1], w4[2], selector);
      w7[0] = hc_byte_perm (w4[0], w4[1], selector);
      w6[3] = hc_byte_perm (w3[3], w4[0], selector);
      w6[2] = hc_byte_perm (w3[2], w3[3], selector);
      w6[1] = hc_byte_perm (w3[1], w3[2], selector);
      w6[0] = hc_byte_perm (w3[0], w3[1], selector);
      w5[3] = hc_byte_perm (w2[3], w3[0], selector);
      w5[2] = hc_byte_perm (w2[2], w2[3], selector);
      w5[1] = hc_byte_perm (w2[1], w2[2], selector);
      w5[0] = hc_byte_perm (w2[0], w2[1], selector);
      w4[3] = hc_byte_perm (w1[3], w2[0], selector);
      w4[2] = hc_byte_perm (w1[2], w1[3], selector);
      w4[1] = hc_byte_perm (w1[1], w1[2], selector);
      w4[0] = hc_byte_perm (w1[0], w1[1], selector);
      w3[3] = hc_byte_perm (w0[3], w1[0], selector);
      w3[2] = hc_byte_perm (w0[2], w0[3], selector);
      w3[1] = hc_byte_perm (w0[1], w0[2], selector);
      w3[0] = hc_byte_perm (w0[0], w0[1], selector);
      w2[3] = hc_byte_perm (    0, w0[0], selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_byte_perm (w7[3],     0, selector);
      c2[3] = hc_byte_perm (w7[2], w7[3], selector);
      c2[2] = hc_byte_perm (w7[1], w7[2], selector);
      c2[1] = hc_byte_perm (w7[0], w7[1], selector);
      c2[0] = hc_byte_perm (w6[3], w7[0], selector);
      c1[3] = hc_byte_perm (w6[2], w6[3], selector);
      c1[2] = hc_byte_perm (w6[1], w6[2], selector);
      c1[1] = hc_byte_perm (w6[0], w6[1], selector);
      c1[0] = hc_byte_perm (w5[3], w6[0], selector);
      c0[3] = hc_byte_perm (w5[2], w5[3], selector);
      c0[2] = hc_byte_perm (w5[1], w5[2], selector);
      c0[1] = hc_byte_perm (w5[0], w5[1], selector);
      c0[0] = hc_byte_perm (w4[3], w5[0], selector);
      w7[3] = hc_byte_perm (w4[2], w4[3], selector);
      w7[2] = hc_byte_perm (w4[1], w4[2], selector);
      w7[1] = hc_byte_perm (w4[0], w4[1], selector);
      w7[0] = hc_byte_perm (w3[3], w4[0], selector);
      w6[3] = hc_byte_perm (w3[2], w3[3], selector);
      w6[2] = hc_byte_perm (w3[1], w3[2], selector);
      w6[1] = hc_byte_perm (w3[0], w3[1], selector);
      w6[0] = hc_byte_perm (w2[3], w3[0], selector);
      w5[3] = hc_byte_perm (w2[2], w2[3], selector);
      w5[2] = hc_byte_perm (w2[1], w2[2], selector);
      w5[1] = hc_byte_perm (w2[0], w2[1], selector);
      w5[0] = hc_byte_perm (w1[3], w2[0], selector);
      w4[3] = hc_byte_perm (w1[2], w1[3], selector);
      w4[2] = hc_byte_perm (w1[1], w1[2], selector);
      w4[1] = hc_byte_perm (w1[0], w1[1], selector);
      w4[0] = hc_byte_perm (w0[3], w1[0], selector);
      w3[3] = hc_byte_perm (w0[2], w0[3], selector);
      w3[2] = hc_byte_perm (w0[1], w0[2], selector);
      w3[1] = hc_byte_perm (w0[0], w0[1], selector);
      w3[0] = hc_byte_perm (    0, w0[0], selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_byte_perm (w7[3],     0, selector);
      c3[0] = hc_byte_perm (w7[2], w7[3], selector);
      c2[3] = hc_byte_perm (w7[1], w7[2], selector);
      c2[2] = hc_byte_perm (w7[0], w7[1], selector);
      c2[1] = hc_byte_perm (w6[3], w7[0], selector);
      c2[0] = hc_byte_perm (w6[2], w6[3], selector);
      c1[3] = hc_byte_perm (w6[1], w6[2], selector);
      c1[2] = hc_byte_perm (w6[0], w6[1], selector);
      c1[1] = hc_byte_perm (w5[3], w6[0], selector);
      c1[0] = hc_byte_perm (w5[2], w5[3], selector);
      c0[3] = hc_byte_perm (w5[1], w5[2], selector);
      c0[2] = hc_byte_perm (w5[0], w5[1], selector);
      c0[1] = hc_byte_perm (w4[3], w5[0], selector);
      c0[0] = hc_byte_perm (w4[2], w4[3], selector);
      w7[3] = hc_byte_perm (w4[1], w4[2], selector);
      w7[2] = hc_byte_perm (w4[0], w4[1], selector);
      w7[1] = hc_byte_perm (w3[3], w4[0], selector);
      w7[0] = hc_byte_perm (w3[2], w3[3], selector);
      w6[3] = hc_byte_perm (w3[1], w3[2], selector);
      w6[2] = hc_byte_perm (w3[0], w3[1], selector);
      w6[1] = hc_byte_perm (w2[3], w3[0], selector);
      w6[0] = hc_byte_perm (w2[2], w2[3], selector);
      w5[3] = hc_byte_perm (w2[1], w2[2], selector);
      w5[2] = hc_byte_perm (w2[0], w2[1], selector);
      w5[1] = hc_byte_perm (w1[3], w2[0], selector);
      w5[0] = hc_byte_perm (w1[2], w1[3], selector);
      w4[3] = hc_byte_perm (w1[1], w1[2], selector);
      w4[2] = hc_byte_perm (w1[0], w1[1], selector);
      w4[1] = hc_byte_perm (w0[3], w1[0], selector);
      w4[0] = hc_byte_perm (w0[2], w0[3], selector);
      w3[3] = hc_byte_perm (w0[1], w0[2], selector);
      w3[2] = hc_byte_perm (w0[0], w0[1], selector);
      w3[1] = hc_byte_perm (    0, w0[0], selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_byte_perm (w7[3],     0, selector);
      c3[1] = hc_byte_perm (w7[2], w7[3], selector);
      c3[0] = hc_byte_perm (w7[1], w7[2], selector);
      c2[3] = hc_byte_perm (w7[0], w7[1], selector);
      c2[2] = hc_byte_perm (w6[3], w7[0], selector);
      c2[1] = hc_byte_perm (w6[2], w6[3], selector);
      c2[0] = hc_byte_perm (w6[1], w6[2], selector);
      c1[3] = hc_byte_perm (w6[0], w6[1], selector);
      c1[2] = hc_byte_perm (w5[3], w6[0], selector);
      c1[1] = hc_byte_perm (w5[2], w5[3], selector);
      c1[0] = hc_byte_perm (w5[1], w5[2], selector);
      c0[3] = hc_byte_perm (w5[0], w5[1], selector);
      c0[2] = hc_byte_perm (w4[3], w5[0], selector);
      c0[1] = hc_byte_perm (w4[2], w4[3], selector);
      c0[0] = hc_byte_perm (w4[1], w4[2], selector);
      w7[3] = hc_byte_perm (w4[0], w4[1], selector);
      w7[2] = hc_byte_perm (w3[3], w4[0], selector);
      w7[1] = hc_byte_perm (w3[2], w3[3], selector);
      w7[0] = hc_byte_perm (w3[1], w3[2], selector);
      w6[3] = hc_byte_perm (w3[0], w3[1], selector);
      w6[2] = hc_byte_perm (w2[3], w3[0], selector);
      w6[1] = hc_byte_perm (w2[2], w2[3], selector);
      w6[0] = hc_byte_perm (w2[1], w2[2], selector);
      w5[3] = hc_byte_perm (w2[0], w2[1], selector);
      w5[2] = hc_byte_perm (w1[3], w2[0], selector);
      w5[1] = hc_byte_perm (w1[2], w1[3], selector);
      w5[0] = hc_byte_perm (w1[1], w1[2], selector);
      w4[3] = hc_byte_perm (w1[0], w1[1], selector);
      w4[2] = hc_byte_perm (w0[3], w1[0], selector);
      w4[1] = hc_byte_perm (w0[2], w0[3], selector);
      w4[0] = hc_byte_perm (w0[1], w0[2], selector);
      w3[3] = hc_byte_perm (w0[0], w0[1], selector);
      w3[2] = hc_byte_perm (    0, w0[0], selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_byte_perm (w7[3],     0, selector);
      c3[2] = hc_byte_perm (w7[2], w7[3], selector);
      c3[1] = hc_byte_perm (w7[1], w7[2], selector);
      c3[0] = hc_byte_perm (w7[0], w7[1], selector);
      c2[3] = hc_byte_perm (w6[3], w7[0], selector);
      c2[2] = hc_byte_perm (w6[2], w6[3], selector);
      c2[1] = hc_byte_perm (w6[1], w6[2], selector);
      c2[0] = hc_byte_perm (w6[0], w6[1], selector);
      c1[3] = hc_byte_perm (w5[3], w6[0], selector);
      c1[2] = hc_byte_perm (w5[2], w5[3], selector);
      c1[1] = hc_byte_perm (w5[1], w5[2], selector);
      c1[0] = hc_byte_perm (w5[0], w5[1], selector);
      c0[3] = hc_byte_perm (w4[3], w5[0], selector);
      c0[2] = hc_byte_perm (w4[2], w4[3], selector);
      c0[1] = hc_byte_perm (w4[1], w4[2], selector);
      c0[0] = hc_byte_perm (w4[0], w4[1], selector);
      w7[3] = hc_byte_perm (w3[3], w4[0], selector);
      w7[2] = hc_byte_perm (w3[2], w3[3], selector);
      w7[1] = hc_byte_perm (w3[1], w3[2], selector);
      w7[0] = hc_byte_perm (w3[0], w3[1], selector);
      w6[3] = hc_byte_perm (w2[3], w3[0], selector);
      w6[2] = hc_byte_perm (w2[2], w2[3], selector);
      w6[1] = hc_byte_perm (w2[1], w2[2], selector);
      w6[0] = hc_byte_perm (w2[0], w2[1], selector);
      w5[3] = hc_byte_perm (w1[3], w2[0], selector);
      w5[2] = hc_byte_perm (w1[2], w1[3], selector);
      w5[1] = hc_byte_perm (w1[1], w1[2], selector);
      w5[0] = hc_byte_perm (w1[0], w1[1], selector);
      w4[3] = hc_byte_perm (w0[3], w1[0], selector);
      w4[2] = hc_byte_perm (w0[2], w0[3], selector);
      w4[1] = hc_byte_perm (w0[1], w0[2], selector);
      w4[0] = hc_byte_perm (w0[0], w0[1], selector);
      w3[3] = hc_byte_perm (    0, w0[0], selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      c4[0] = hc_byte_perm (w7[3],     0, selector);
      c3[3] = hc_byte_perm (w7[2], w7[3], selector);
      c3[2] = hc_byte_perm (w7[1], w7[2], selector);
      c3[1] = hc_byte_perm (w7[0], w7[1], selector);
      c3[0] = hc_byte_perm (w6[3], w7[0], selector);
      c2[3] = hc_byte_perm (w6[2], w6[3], selector);
      c2[2] = hc_byte_perm (w6[1], w6[2], selector);
      c2[1] = hc_byte_perm (w6[0], w6[1], selector);
      c2[0] = hc_byte_perm (w5[3], w6[0], selector);
      c1[3] = hc_byte_perm (w5[2], w5[3], selector);
      c1[2] = hc_byte_perm (w5[1], w5[2], selector);
      c1[1] = hc_byte_perm (w5[0], w5[1], selector);
      c1[0] = hc_byte_perm (w4[3], w5[0], selector);
      c0[3] = hc_byte_perm (w4[2], w4[3], selector);
      c0[2] = hc_byte_perm (w4[1], w4[2], selector);
      c0[1] = hc_byte_perm (w4[0], w4[1], selector);
      c0[0] = hc_byte_perm (w3[3], w4[0], selector);
      w7[3] = hc_byte_perm (w3[2], w3[3], selector);
      w7[2] = hc_byte_perm (w3[1], w3[2], selector);
      w7[1] = hc_byte_perm (w3[0], w3[1], selector);
      w7[0] = hc_byte_perm (w2[3], w3[0], selector);
      w6[3] = hc_byte_perm (w2[2], w2[3], selector);
      w6[2] = hc_byte_perm (w2[1], w2[2], selector);
      w6[1] = hc_byte_perm (w2[0], w2[1], selector);
      w6[0] = hc_byte_perm (w1[3], w2[0], selector);
      w5[3] = hc_byte_perm (w1[2], w1[3], selector);
      w5[2] = hc_byte_perm (w1[1], w1[2], selector);
      w5[1] = hc_byte_perm (w1[0], w1[1], selector);
      w5[0] = hc_byte_perm (w0[3], w1[0], selector);
      w4[3] = hc_byte_perm (w0[2], w0[3], selector);
      w4[2] = hc_byte_perm (w0[1], w0[2], selector);
      w4[1] = hc_byte_perm (w0[0], w0[1], selector);
      w4[0] = hc_byte_perm (    0, w0[0], selector);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      c4[1] = hc_byte_perm (w7[3],     0, selector);
      c4[0] = hc_byte_perm (w7[2], w7[3], selector);
      c3[3] = hc_byte_perm (w7[1], w7[2], selector);
      c3[2] = hc_byte_perm (w7[0], w7[1], selector);
      c3[1] = hc_byte_perm (w6[3], w7[0], selector);
      c3[0] = hc_byte_perm (w6[2], w6[3], selector);
      c2[3] = hc_byte_perm (w6[1], w6[2], selector);
      c2[2] = hc_byte_perm (w6[0], w6[1], selector);
      c2[1] = hc_byte_perm (w5[3], w6[0], selector);
      c2[0] = hc_byte_perm (w5[2], w5[3], selector);
      c1[3] = hc_byte_perm (w5[1], w5[2], selector);
      c1[2] = hc_byte_perm (w5[0], w5[1], selector);
      c1[1] = hc_byte_perm (w4[3], w5[0], selector);
      c1[0] = hc_byte_perm (w4[2], w4[3], selector);
      c0[3] = hc_byte_perm (w4[1], w4[2], selector);
      c0[2] = hc_byte_perm (w4[0], w4[1], selector);
      c0[1] = hc_byte_perm (w3[3], w4[0], selector);
      c0[0] = hc_byte_perm (w3[2], w3[3], selector);
      w7[3] = hc_byte_perm (w3[1], w3[2], selector);
      w7[2] = hc_byte_perm (w3[0], w3[1], selector);
      w7[1] = hc_byte_perm (w2[3], w3[0], selector);
      w7[0] = hc_byte_perm (w2[2], w2[3], selector);
      w6[3] = hc_byte_perm (w2[1], w2[2], selector);
      w6[2] = hc_byte_perm (w2[0], w2[1], selector);
      w6[1] = hc_byte_perm (w1[3], w2[0], selector);
      w6[0] = hc_byte_perm (w1[2], w1[3], selector);
      w5[3] = hc_byte_perm (w1[1], w1[2], selector);
      w5[2] = hc_byte_perm (w1[0], w1[1], selector);
      w5[1] = hc_byte_perm (w0[3], w1[0], selector);
      w5[0] = hc_byte_perm (w0[2], w0[3], selector);
      w4[3] = hc_byte_perm (w0[1], w0[2], selector);
      w4[2] = hc_byte_perm (w0[0], w0[1], selector);
      w4[1] = hc_byte_perm (    0, w0[0], selector);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      c4[2] = hc_byte_perm (w7[3],     0, selector);
      c4[1] = hc_byte_perm (w7[2], w7[3], selector);
      c4[0] = hc_byte_perm (w7[1], w7[2], selector);
      c3[3] = hc_byte_perm (w7[0], w7[1], selector);
      c3[2] = hc_byte_perm (w6[3], w7[0], selector);
      c3[1] = hc_byte_perm (w6[2], w6[3], selector);
      c3[0] = hc_byte_perm (w6[1], w6[2], selector);
      c2[3] = hc_byte_perm (w6[0], w6[1], selector);
      c2[2] = hc_byte_perm (w5[3], w6[0], selector);
      c2[1] = hc_byte_perm (w5[2], w5[3], selector);
      c2[0] = hc_byte_perm (w5[1], w5[2], selector);
      c1[3] = hc_byte_perm (w5[0], w5[1], selector);
      c1[2] = hc_byte_perm (w4[3], w5[0], selector);
      c1[1] = hc_byte_perm (w4[2], w4[3], selector);
      c1[0] = hc_byte_perm (w4[1], w4[2], selector);
      c0[3] = hc_byte_perm (w4[0], w4[1], selector);
      c0[2] = hc_byte_perm (w3[3], w4[0], selector);
      c0[1] = hc_byte_perm (w3[2], w3[3], selector);
      c0[0] = hc_byte_perm (w3[1], w3[2], selector);
      w7[3] = hc_byte_perm (w3[0], w3[1], selector);
      w7[2] = hc_byte_perm (w2[3], w3[0], selector);
      w7[1] = hc_byte_perm (w2[2], w2[3], selector);
      w7[0] = hc_byte_perm (w2[1], w2[2], selector);
      w6[3] = hc_byte_perm (w2[0], w2[1], selector);
      w6[2] = hc_byte_perm (w1[3], w2[0], selector);
      w6[1] = hc_byte_perm (w1[2], w1[3], selector);
      w6[0] = hc_byte_perm (w1[1], w1[2], selector);
      w5[3] = hc_byte_perm (w1[0], w1[1], selector);
      w5[2] = hc_byte_perm (w0[3], w1[0], selector);
      w5[1] = hc_byte_perm (w0[2], w0[3], selector);
      w5[0] = hc_byte_perm (w0[1], w0[2], selector);
      w4[3] = hc_byte_perm (w0[0], w0[1], selector);
      w4[2] = hc_byte_perm (    0, w0[0], selector);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      c4[3] = hc_byte_perm (w7[3],     0, selector);
      c4[2] = hc_byte_perm (w7[2], w7[3], selector);
      c4[1] = hc_byte_perm (w7[1], w7[2], selector);
      c4[0] = hc_byte_perm (w7[0], w7[1], selector);
      c3[3] = hc_byte_perm (w6[3], w7[0], selector);
      c3[2] = hc_byte_perm (w6[2], w6[3], selector);
      c3[1] = hc_byte_perm (w6[1], w6[2], selector);
      c3[0] = hc_byte_perm (w6[0], w6[1], selector);
      c2[3] = hc_byte_perm (w5[3], w6[0], selector);
      c2[2] = hc_byte_perm (w5[2], w5[3], selector);
      c2[1] = hc_byte_perm (w5[1], w5[2], selector);
      c2[0] = hc_byte_perm (w5[0], w5[1], selector);
      c1[3] = hc_byte_perm (w4[3], w5[0], selector);
      c1[2] = hc_byte_perm (w4[2], w4[3], selector);
      c1[1] = hc_byte_perm (w4[1], w4[2], selector);
      c1[0] = hc_byte_perm (w4[0], w4[1], selector);
      c0[3] = hc_byte_perm (w3[3], w4[0], selector);
      c0[2] = hc_byte_perm (w3[2], w3[3], selector);
      c0[1] = hc_byte_perm (w3[1], w3[2], selector);
      c0[0] = hc_byte_perm (w3[0], w3[1], selector);
      w7[3] = hc_byte_perm (w2[3], w3[0], selector);
      w7[2] = hc_byte_perm (w2[2], w2[3], selector);
      w7[1] = hc_byte_perm (w2[1], w2[2], selector);
      w7[0] = hc_byte_perm (w2[0], w2[1], selector);
      w6[3] = hc_byte_perm (w1[3], w2[0], selector);
      w6[2] = hc_byte_perm (w1[2], w1[3], selector);
      w6[1] = hc_byte_perm (w1[1], w1[2], selector);
      w6[0] = hc_byte_perm (w1[0], w1[1], selector);
      w5[3] = hc_byte_perm (w0[3], w1[0], selector);
      w5[2] = hc_byte_perm (w0[2], w0[3], selector);
      w5[1] = hc_byte_perm (w0[1], w0[2], selector);
      w5[0] = hc_byte_perm (w0[0], w0[1], selector);
      w4[3] = hc_byte_perm (    0, w0[0], selector);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      c5[0] = hc_byte_perm (w7[3],     0, selector);
      c4[3] = hc_byte_perm (w7[2], w7[3], selector);
      c4[2] = hc_byte_perm (w7[1], w7[2], selector);
      c4[1] = hc_byte_perm (w7[0], w7[1], selector);
      c4[0] = hc_byte_perm (w6[3], w7[0], selector);
      c3[3] = hc_byte_perm (w6[2], w6[3], selector);
      c3[2] = hc_byte_perm (w6[1], w6[2], selector);
      c3[1] = hc_byte_perm (w6[0], w6[1], selector);
      c3[0] = hc_byte_perm (w5[3], w6[0], selector);
      c2[3] = hc_byte_perm (w5[2], w5[3], selector);
      c2[2] = hc_byte_perm (w5[1], w5[2], selector);
      c2[1] = hc_byte_perm (w5[0], w5[1], selector);
      c2[0] = hc_byte_perm (w4[3], w5[0], selector);
      c1[3] = hc_byte_perm (w4[2], w4[3], selector);
      c1[2] = hc_byte_perm (w4[1], w4[2], selector);
      c1[1] = hc_byte_perm (w4[0], w4[1], selector);
      c1[0] = hc_byte_perm (w3[3], w4[0], selector);
      c0[3] = hc_byte_perm (w3[2], w3[3], selector);
      c0[2] = hc_byte_perm (w3[1], w3[2], selector);
      c0[1] = hc_byte_perm (w3[0], w3[1], selector);
      c0[0] = hc_byte_perm (w2[3], w3[0], selector);
      w7[3] = hc_byte_perm (w2[2], w2[3], selector);
      w7[2] = hc_byte_perm (w2[1], w2[2], selector);
      w7[1] = hc_byte_perm (w2[0], w2[1], selector);
      w7[0] = hc_byte_perm (w1[3], w2[0], selector);
      w6[3] = hc_byte_perm (w1[2], w1[3], selector);
      w6[2] = hc_byte_perm (w1[1], w1[2], selector);
      w6[1] = hc_byte_perm (w1[0], w1[1], selector);
      w6[0] = hc_byte_perm (w0[3], w1[0], selector);
      w5[3] = hc_byte_perm (w0[2], w0[3], selector);
      w5[2] = hc_byte_perm (w0[1], w0[2], selector);
      w5[1] = hc_byte_perm (w0[0], w0[1], selector);
      w5[0] = hc_byte_perm (    0, w0[0], selector);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      c5[1] = hc_byte_perm (w7[3],     0, selector);
      c5[0] = hc_byte_perm (w7[2], w7[3], selector);
      c4[3] = hc_byte_perm (w7[1], w7[2], selector);
      c4[2] = hc_byte_perm (w7[0], w7[1], selector);
      c4[1] = hc_byte_perm (w6[3], w7[0], selector);
      c4[0] = hc_byte_perm (w6[2], w6[3], selector);
      c3[3] = hc_byte_perm (w6[1], w6[2], selector);
      c3[2] = hc_byte_perm (w6[0], w6[1], selector);
      c3[1] = hc_byte_perm (w5[3], w6[0], selector);
      c3[0] = hc_byte_perm (w5[2], w5[3], selector);
      c2[3] = hc_byte_perm (w5[1], w5[2], selector);
      c2[2] = hc_byte_perm (w5[0], w5[1], selector);
      c2[1] = hc_byte_perm (w4[3], w5[0], selector);
      c2[0] = hc_byte_perm (w4[2], w4[3], selector);
      c1[3] = hc_byte_perm (w4[1], w4[2], selector);
      c1[2] = hc_byte_perm (w4[0], w4[1], selector);
      c1[1] = hc_byte_perm (w3[3], w4[0], selector);
      c1[0] = hc_byte_perm (w3[2], w3[3], selector);
      c0[3] = hc_byte_perm (w3[1], w3[2], selector);
      c0[2] = hc_byte_perm (w3[0], w3[1], selector);
      c0[1] = hc_byte_perm (w2[3], w3[0], selector);
      c0[0] = hc_byte_perm (w2[2], w2[3], selector);
      w7[3] = hc_byte_perm (w2[1], w2[2], selector);
      w7[2] = hc_byte_perm (w2[0], w2[1], selector);
      w7[1] = hc_byte_perm (w1[3], w2[0], selector);
      w7[0] = hc_byte_perm (w1[2], w1[3], selector);
      w6[3] = hc_byte_perm (w1[1], w1[2], selector);
      w6[2] = hc_byte_perm (w1[0], w1[1], selector);
      w6[1] = hc_byte_perm (w0[3], w1[0], selector);
      w6[0] = hc_byte_perm (w0[2], w0[3], selector);
      w5[3] = hc_byte_perm (w0[1], w0[2], selector);
      w5[2] = hc_byte_perm (w0[0], w0[1], selector);
      w5[1] = hc_byte_perm (    0, w0[0], selector);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      c5[2] = hc_byte_perm (w7[3],     0, selector);
      c5[1] = hc_byte_perm (w7[2], w7[3], selector);
      c5[0] = hc_byte_perm (w7[1], w7[2], selector);
      c4[3] = hc_byte_perm (w7[0], w7[1], selector);
      c4[2] = hc_byte_perm (w6[3], w7[0], selector);
      c4[1] = hc_byte_perm (w6[2], w6[3], selector);
      c4[0] = hc_byte_perm (w6[1], w6[2], selector);
      c3[3] = hc_byte_perm (w6[0], w6[1], selector);
      c3[2] = hc_byte_perm (w5[3], w6[0], selector);
      c3[1] = hc_byte_perm (w5[2], w5[3], selector);
      c3[0] = hc_byte_perm (w5[1], w5[2], selector);
      c2[3] = hc_byte_perm (w5[0], w5[1], selector);
      c2[2] = hc_byte_perm (w4[3], w5[0], selector);
      c2[1] = hc_byte_perm (w4[2], w4[3], selector);
      c2[0] = hc_byte_perm (w4[1], w4[2], selector);
      c1[3] = hc_byte_perm (w4[0], w4[1], selector);
      c1[2] = hc_byte_perm (w3[3], w4[0], selector);
      c1[1] = hc_byte_perm (w3[2], w3[3], selector);
      c1[0] = hc_byte_perm (w3[1], w3[2], selector);
      c0[3] = hc_byte_perm (w3[0], w3[1], selector);
      c0[2] = hc_byte_perm (w2[3], w3[0], selector);
      c0[1] = hc_byte_perm (w2[2], w2[3], selector);
      c0[0] = hc_byte_perm (w2[1], w2[2], selector);
      w7[3] = hc_byte_perm (w2[0], w2[1], selector);
      w7[2] = hc_byte_perm (w1[3], w2[0], selector);
      w7[1] = hc_byte_perm (w1[2], w1[3], selector);
      w7[0] = hc_byte_perm (w1[1], w1[2], selector);
      w6[3] = hc_byte_perm (w1[0], w1[1], selector);
      w6[2] = hc_byte_perm (w0[3], w1[0], selector);
      w6[1] = hc_byte_perm (w0[2], w0[3], selector);
      w6[0] = hc_byte_perm (w0[1], w0[2], selector);
      w5[3] = hc_byte_perm (w0[0], w0[1], selector);
      w5[2] = hc_byte_perm (    0, w0[0], selector);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      c5[3] = hc_byte_perm (w7[3],     0, selector);
      c5[2] = hc_byte_perm (w7[2], w7[3], selector);
      c5[1] = hc_byte_perm (w7[1], w7[2], selector);
      c5[0] = hc_byte_perm (w7[0], w7[1], selector);
      c4[3] = hc_byte_perm (w6[3], w7[0], selector);
      c4[2] = hc_byte_perm (w6[2], w6[3], selector);
      c4[1] = hc_byte_perm (w6[1], w6[2], selector);
      c4[0] = hc_byte_perm (w6[0], w6[1], selector);
      c3[3] = hc_byte_perm (w5[3], w6[0], selector);
      c3[2] = hc_byte_perm (w5[2], w5[3], selector);
      c3[1] = hc_byte_perm (w5[1], w5[2], selector);
      c3[0] = hc_byte_perm (w5[0], w5[1], selector);
      c2[3] = hc_byte_perm (w4[3], w5[0], selector);
      c2[2] = hc_byte_perm (w4[2], w4[3], selector);
      c2[1] = hc_byte_perm (w4[1], w4[2], selector);
      c2[0] = hc_byte_perm (w4[0], w4[1], selector);
      c1[3] = hc_byte_perm (w3[3], w4[0], selector);
      c1[2] = hc_byte_perm (w3[2], w3[3], selector);
      c1[1] = hc_byte_perm (w3[1], w3[2], selector);
      c1[0] = hc_byte_perm (w3[0], w3[1], selector);
      c0[3] = hc_byte_perm (w2[3], w3[0], selector);
      c0[2] = hc_byte_perm (w2[2], w2[3], selector);
      c0[1] = hc_byte_perm (w2[1], w2[2], selector);
      c0[0] = hc_byte_perm (w2[0], w2[1], selector);
      w7[3] = hc_byte_perm (w1[3], w2[0], selector);
      w7[2] = hc_byte_perm (w1[2], w1[3], selector);
      w7[1] = hc_byte_perm (w1[1], w1[2], selector);
      w7[0] = hc_byte_perm (w1[0], w1[1], selector);
      w6[3] = hc_byte_perm (w0[3], w1[0], selector);
      w6[2] = hc_byte_perm (w0[2], w0[3], selector);
      w6[1] = hc_byte_perm (w0[1], w0[2], selector);
      w6[0] = hc_byte_perm (w0[0], w0[1], selector);
      w5[3] = hc_byte_perm (    0, w0[0], selector);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      c6[0] = hc_byte_perm (w7[3],     0, selector);
      c5[3] = hc_byte_perm (w7[2], w7[3], selector);
      c5[2] = hc_byte_perm (w7[1], w7[2], selector);
      c5[1] = hc_byte_perm (w7[0], w7[1], selector);
      c5[0] = hc_byte_perm (w6[3], w7[0], selector);
      c4[3] = hc_byte_perm (w6[2], w6[3], selector);
      c4[2] = hc_byte_perm (w6[1], w6[2], selector);
      c4[1] = hc_byte_perm (w6[0], w6[1], selector);
      c4[0] = hc_byte_perm (w5[3], w6[0], selector);
      c3[3] = hc_byte_perm (w5[2], w5[3], selector);
      c3[2] = hc_byte_perm (w5[1], w5[2], selector);
      c3[1] = hc_byte_perm (w5[0], w5[1], selector);
      c3[0] = hc_byte_perm (w4[3], w5[0], selector);
      c2[3] = hc_byte_perm (w4[2], w4[3], selector);
      c2[2] = hc_byte_perm (w4[1], w4[2], selector);
      c2[1] = hc_byte_perm (w4[0], w4[1], selector);
      c2[0] = hc_byte_perm (w3[3], w4[0], selector);
      c1[3] = hc_byte_perm (w3[2], w3[3], selector);
      c1[2] = hc_byte_perm (w3[1], w3[2], selector);
      c1[1] = hc_byte_perm (w3[0], w3[1], selector);
      c1[0] = hc_byte_perm (w2[3], w3[0], selector);
      c0[3] = hc_byte_perm (w2[2], w2[3], selector);
      c0[2] = hc_byte_perm (w2[1], w2[2], selector);
      c0[1] = hc_byte_perm (w2[0], w2[1], selector);
      c0[0] = hc_byte_perm (w1[3], w2[0], selector);
      w7[3] = hc_byte_perm (w1[2], w1[3], selector);
      w7[2] = hc_byte_perm (w1[1], w1[2], selector);
      w7[1] = hc_byte_perm (w1[0], w1[1], selector);
      w7[0] = hc_byte_perm (w0[3], w1[0], selector);
      w6[3] = hc_byte_perm (w0[2], w0[3], selector);
      w6[2] = hc_byte_perm (w0[1], w0[2], selector);
      w6[1] = hc_byte_perm (w0[0], w0[1], selector);
      w6[0] = hc_byte_perm (    0, w0[0], selector);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      c6[1] = hc_byte_perm (w7[3],     0, selector);
      c6[0] = hc_byte_perm (w7[2], w7[3], selector);
      c5[3] = hc_byte_perm (w7[1], w7[2], selector);
      c5[2] = hc_byte_perm (w7[0], w7[1], selector);
      c5[1] = hc_byte_perm (w6[3], w7[0], selector);
      c5[0] = hc_byte_perm (w6[2], w6[3], selector);
      c4[3] = hc_byte_perm (w6[1], w6[2], selector);
      c4[2] = hc_byte_perm (w6[0], w6[1], selector);
      c4[1] = hc_byte_perm (w5[3], w6[0], selector);
      c4[0] = hc_byte_perm (w5[2], w5[3], selector);
      c3[3] = hc_byte_perm (w5[1], w5[2], selector);
      c3[2] = hc_byte_perm (w5[0], w5[1], selector);
      c3[1] = hc_byte_perm (w4[3], w5[0], selector);
      c3[0] = hc_byte_perm (w4[2], w4[3], selector);
      c2[3] = hc_byte_perm (w4[1], w4[2], selector);
      c2[2] = hc_byte_perm (w4[0], w4[1], selector);
      c2[1] = hc_byte_perm (w3[3], w4[0], selector);
      c2[0] = hc_byte_perm (w3[2], w3[3], selector);
      c1[3] = hc_byte_perm (w3[1], w3[2], selector);
      c1[2] = hc_byte_perm (w3[0], w3[1], selector);
      c1[1] = hc_byte_perm (w2[3], w3[0], selector);
      c1[0] = hc_byte_perm (w2[2], w2[3], selector);
      c0[3] = hc_byte_perm (w2[1], w2[2], selector);
      c0[2] = hc_byte_perm (w2[0], w2[1], selector);
      c0[1] = hc_byte_perm (w1[3], w2[0], selector);
      c0[0] = hc_byte_perm (w1[2], w1[3], selector);
      w7[3] = hc_byte_perm (w1[1], w1[2], selector);
      w7[2] = hc_byte_perm (w1[0], w1[1], selector);
      w7[1] = hc_byte_perm (w0[3], w1[0], selector);
      w7[0] = hc_byte_perm (w0[2], w0[3], selector);
      w6[3] = hc_byte_perm (w0[1], w0[2], selector);
      w6[2] = hc_byte_perm (w0[0], w0[1], selector);
      w6[1] = hc_byte_perm (    0, w0[0], selector);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      c6[2] = hc_byte_perm (w7[3],     0, selector);
      c6[1] = hc_byte_perm (w7[2], w7[3], selector);
      c6[0] = hc_byte_perm (w7[1], w7[2], selector);
      c5[3] = hc_byte_perm (w7[0], w7[1], selector);
      c5[2] = hc_byte_perm (w6[3], w7[0], selector);
      c5[1] = hc_byte_perm (w6[2], w6[3], selector);
      c5[0] = hc_byte_perm (w6[1], w6[2], selector);
      c4[3] = hc_byte_perm (w6[0], w6[1], selector);
      c4[2] = hc_byte_perm (w5[3], w6[0], selector);
      c4[1] = hc_byte_perm (w5[2], w5[3], selector);
      c4[0] = hc_byte_perm (w5[1], w5[2], selector);
      c3[3] = hc_byte_perm (w5[0], w5[1], selector);
      c3[2] = hc_byte_perm (w4[3], w5[0], selector);
      c3[1] = hc_byte_perm (w4[2], w4[3], selector);
      c3[0] = hc_byte_perm (w4[1], w4[2], selector);
      c2[3] = hc_byte_perm (w4[0], w4[1], selector);
      c2[2] = hc_byte_perm (w3[3], w4[0], selector);
      c2[1] = hc_byte_perm (w3[2], w3[3], selector);
      c2[0] = hc_byte_perm (w3[1], w3[2], selector);
      c1[3] = hc_byte_perm (w3[0], w3[1], selector);
      c1[2] = hc_byte_perm (w2[3], w3[0], selector);
      c1[1] = hc_byte_perm (w2[2], w2[3], selector);
      c1[0] = hc_byte_perm (w2[1], w2[2], selector);
      c0[3] = hc_byte_perm (w2[0], w2[1], selector);
      c0[2] = hc_byte_perm (w1[3], w2[0], selector);
      c0[1] = hc_byte_perm (w1[2], w1[3], selector);
      c0[0] = hc_byte_perm (w1[1], w1[2], selector);
      w7[3] = hc_byte_perm (w1[0], w1[1], selector);
      w7[2] = hc_byte_perm (w0[3], w1[0], selector);
      w7[1] = hc_byte_perm (w0[2], w0[3], selector);
      w7[0] = hc_byte_perm (w0[1], w0[2], selector);
      w6[3] = hc_byte_perm (w0[0], w0[1], selector);
      w6[2] = hc_byte_perm (    0, w0[0], selector);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      c6[3] = hc_byte_perm (w7[3],     0, selector);
      c6[2] = hc_byte_perm (w7[2], w7[3], selector);
      c6[1] = hc_byte_perm (w7[1], w7[2], selector);
      c6[0] = hc_byte_perm (w7[0], w7[1], selector);
      c5[3] = hc_byte_perm (w6[3], w7[0], selector);
      c5[2] = hc_byte_perm (w6[2], w6[3], selector);
      c5[1] = hc_byte_perm (w6[1], w6[2], selector);
      c5[0] = hc_byte_perm (w6[0], w6[1], selector);
      c4[3] = hc_byte_perm (w5[3], w6[0], selector);
      c4[2] = hc_byte_perm (w5[2], w5[3], selector);
      c4[1] = hc_byte_perm (w5[1], w5[2], selector);
      c4[0] = hc_byte_perm (w5[0], w5[1], selector);
      c3[3] = hc_byte_perm (w4[3], w5[0], selector);
      c3[2] = hc_byte_perm (w4[2], w4[3], selector);
      c3[1] = hc_byte_perm (w4[1], w4[2], selector);
      c3[0] = hc_byte_perm (w4[0], w4[1], selector);
      c2[3] = hc_byte_perm (w3[3], w4[0], selector);
      c2[2] = hc_byte_perm (w3[2], w3[3], selector);
      c2[1] = hc_byte_perm (w3[1], w3[2], selector);
      c2[0] = hc_byte_perm (w3[0], w3[1], selector);
      c1[3] = hc_byte_perm (w2[3], w3[0], selector);
      c1[2] = hc_byte_perm (w2[2], w2[3], selector);
      c1[1] = hc_byte_perm (w2[1], w2[2], selector);
      c1[0] = hc_byte_perm (w2[0], w2[1], selector);
      c0[3] = hc_byte_perm (w1[3], w2[0], selector);
      c0[2] = hc_byte_perm (w1[2], w1[3], selector);
      c0[1] = hc_byte_perm (w1[1], w1[2], selector);
      c0[0] = hc_byte_perm (w1[0], w1[1], selector);
      w7[3] = hc_byte_perm (w0[3], w1[0], selector);
      w7[2] = hc_byte_perm (w0[2], w0[3], selector);
      w7[1] = hc_byte_perm (w0[1], w0[2], selector);
      w7[0] = hc_byte_perm (w0[0], w0[1], selector);
      w6[3] = hc_byte_perm (    0, w0[0], selector);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      c7[0] = hc_byte_perm (w7[3],     0, selector);
      c6[3] = hc_byte_perm (w7[2], w7[3], selector);
      c6[2] = hc_byte_perm (w7[1], w7[2], selector);
      c6[1] = hc_byte_perm (w7[0], w7[1], selector);
      c6[0] = hc_byte_perm (w6[3], w7[0], selector);
      c5[3] = hc_byte_perm (w6[2], w6[3], selector);
      c5[2] = hc_byte_perm (w6[1], w6[2], selector);
      c5[1] = hc_byte_perm (w6[0], w6[1], selector);
      c5[0] = hc_byte_perm (w5[3], w6[0], selector);
      c4[3] = hc_byte_perm (w5[2], w5[3], selector);
      c4[2] = hc_byte_perm (w5[1], w5[2], selector);
      c4[1] = hc_byte_perm (w5[0], w5[1], selector);
      c4[0] = hc_byte_perm (w4[3], w5[0], selector);
      c3[3] = hc_byte_perm (w4[2], w4[3], selector);
      c3[2] = hc_byte_perm (w4[1], w4[2], selector);
      c3[1] = hc_byte_perm (w4[0], w4[1], selector);
      c3[0] = hc_byte_perm (w3[3], w4[0], selector);
      c2[3] = hc_byte_perm (w3[2], w3[3], selector);
      c2[2] = hc_byte_perm (w3[1], w3[2], selector);
      c2[1] = hc_byte_perm (w3[0], w3[1], selector);
      c2[0] = hc_byte_perm (w2[3], w3[0], selector);
      c1[3] = hc_byte_perm (w2[2], w2[3], selector);
      c1[2] = hc_byte_perm (w2[1], w2[2], selector);
      c1[1] = hc_byte_perm (w2[0], w2[1], selector);
      c1[0] = hc_byte_perm (w1[3], w2[0], selector);
      c0[3] = hc_byte_perm (w1[2], w1[3], selector);
      c0[2] = hc_byte_perm (w1[1], w1[2], selector);
      c0[1] = hc_byte_perm (w1[0], w1[1], selector);
      c0[0] = hc_byte_perm (w0[3], w1[0], selector);
      w7[3] = hc_byte_perm (w0[2], w0[3], selector);
      w7[2] = hc_byte_perm (w0[1], w0[2], selector);
      w7[1] = hc_byte_perm (w0[0], w0[1], selector);
      w7[0] = hc_byte_perm (    0, w0[0], selector);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      c7[1] = hc_byte_perm (w7[3],     0, selector);
      c7[0] = hc_byte_perm (w7[2], w7[3], selector);
      c6[3] = hc_byte_perm (w7[1], w7[2], selector);
      c6[2] = hc_byte_perm (w7[0], w7[1], selector);
      c6[1] = hc_byte_perm (w6[3], w7[0], selector);
      c6[0] = hc_byte_perm (w6[2], w6[3], selector);
      c5[3] = hc_byte_perm (w6[1], w6[2], selector);
      c5[2] = hc_byte_perm (w6[0], w6[1], selector);
      c5[1] = hc_byte_perm (w5[3], w6[0], selector);
      c5[0] = hc_byte_perm (w5[2], w5[3], selector);
      c4[3] = hc_byte_perm (w5[1], w5[2], selector);
      c4[2] = hc_byte_perm (w5[0], w5[1], selector);
      c4[1] = hc_byte_perm (w4[3], w5[0], selector);
      c4[0] = hc_byte_perm (w4[2], w4[3], selector);
      c3[3] = hc_byte_perm (w4[1], w4[2], selector);
      c3[2] = hc_byte_perm (w4[0], w4[1], selector);
      c3[1] = hc_byte_perm (w3[3], w4[0], selector);
      c3[0] = hc_byte_perm (w3[2], w3[3], selector);
      c2[3] = hc_byte_perm (w3[1], w3[2], selector);
      c2[2] = hc_byte_perm (w3[0], w3[1], selector);
      c2[1] = hc_byte_perm (w2[3], w3[0], selector);
      c2[0] = hc_byte_perm (w2[2], w2[3], selector);
      c1[3] = hc_byte_perm (w2[1], w2[2], selector);
      c1[2] = hc_byte_perm (w2[0], w2[1], selector);
      c1[1] = hc_byte_perm (w1[3], w2[0], selector);
      c1[0] = hc_byte_perm (w1[2], w1[3], selector);
      c0[3] = hc_byte_perm (w1[1], w1[2], selector);
      c0[2] = hc_byte_perm (w1[0], w1[1], selector);
      c0[1] = hc_byte_perm (w0[3], w1[0], selector);
      c0[0] = hc_byte_perm (w0[2], w0[3], selector);
      w7[3] = hc_byte_perm (w0[1], w0[2], selector);
      w7[2] = hc_byte_perm (w0[0], w0[1], selector);
      w7[1] = hc_byte_perm (    0, w0[0], selector);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      c7[2] = hc_byte_perm (w7[3],     0, selector);
      c7[1] = hc_byte_perm (w7[2], w7[3], selector);
      c7[0] = hc_byte_perm (w7[1], w7[2], selector);
      c6[3] = hc_byte_perm (w7[0], w7[1], selector);
      c6[2] = hc_byte_perm (w6[3], w7[0], selector);
      c6[1] = hc_byte_perm (w6[2], w6[3], selector);
      c6[0] = hc_byte_perm (w6[1], w6[2], selector);
      c5[3] = hc_byte_perm (w6[0], w6[1], selector);
      c5[2] = hc_byte_perm (w5[3], w6[0], selector);
      c5[1] = hc_byte_perm (w5[2], w5[3], selector);
      c5[0] = hc_byte_perm (w5[1], w5[2], selector);
      c4[3] = hc_byte_perm (w5[0], w5[1], selector);
      c4[2] = hc_byte_perm (w4[3], w5[0], selector);
      c4[1] = hc_byte_perm (w4[2], w4[3], selector);
      c4[0] = hc_byte_perm (w4[1], w4[2], selector);
      c3[3] = hc_byte_perm (w4[0], w4[1], selector);
      c3[2] = hc_byte_perm (w3[3], w4[0], selector);
      c3[1] = hc_byte_perm (w3[2], w3[3], selector);
      c3[0] = hc_byte_perm (w3[1], w3[2], selector);
      c2[3] = hc_byte_perm (w3[0], w3[1], selector);
      c2[2] = hc_byte_perm (w2[3], w3[0], selector);
      c2[1] = hc_byte_perm (w2[2], w2[3], selector);
      c2[0] = hc_byte_perm (w2[1], w2[2], selector);
      c1[3] = hc_byte_perm (w2[0], w2[1], selector);
      c1[2] = hc_byte_perm (w1[3], w2[0], selector);
      c1[1] = hc_byte_perm (w1[2], w1[3], selector);
      c1[0] = hc_byte_perm (w1[1], w1[2], selector);
      c0[3] = hc_byte_perm (w1[0], w1[1], selector);
      c0[2] = hc_byte_perm (w0[3], w1[0], selector);
      c0[1] = hc_byte_perm (w0[2], w0[3], selector);
      c0[0] = hc_byte_perm (w0[1], w0[2], selector);
      w7[3] = hc_byte_perm (w0[0], w0[1], selector);
      w7[2] = hc_byte_perm (    0, w0[0], selector);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      c7[3] = hc_byte_perm (w7[3],     0, selector);
      c7[2] = hc_byte_perm (w7[2], w7[3], selector);
      c7[1] = hc_byte_perm (w7[1], w7[2], selector);
      c7[0] = hc_byte_perm (w7[0], w7[1], selector);
      c6[3] = hc_byte_perm (w6[3], w7[0], selector);
      c6[2] = hc_byte_perm (w6[2], w6[3], selector);
      c6[1] = hc_byte_perm (w6[1], w6[2], selector);
      c6[0] = hc_byte_perm (w6[0], w6[1], selector);
      c5[3] = hc_byte_perm (w5[3], w6[0], selector);
      c5[2] = hc_byte_perm (w5[2], w5[3], selector);
      c5[1] = hc_byte_perm (w5[1], w5[2], selector);
      c5[0] = hc_byte_perm (w5[0], w5[1], selector);
      c4[3] = hc_byte_perm (w4[3], w5[0], selector);
      c4[2] = hc_byte_perm (w4[2], w4[3], selector);
      c4[1] = hc_byte_perm (w4[1], w4[2], selector);
      c4[0] = hc_byte_perm (w4[0], w4[1], selector);
      c3[3] = hc_byte_perm (w3[3], w4[0], selector);
      c3[2] = hc_byte_perm (w3[2], w3[3], selector);
      c3[1] = hc_byte_perm (w3[1], w3[2], selector);
      c3[0] = hc_byte_perm (w3[0], w3[1], selector);
      c2[3] = hc_byte_perm (w2[3], w3[0], selector);
      c2[2] = hc_byte_perm (w2[2], w2[3], selector);
      c2[1] = hc_byte_perm (w2[1], w2[2], selector);
      c2[0] = hc_byte_perm (w2[0], w2[1], selector);
      c1[3] = hc_byte_perm (w1[3], w2[0], selector);
      c1[2] = hc_byte_perm (w1[2], w1[3], selector);
      c1[1] = hc_byte_perm (w1[1], w1[2], selector);
      c1[0] = hc_byte_perm (w1[0], w1[1], selector);
      c0[3] = hc_byte_perm (w0[3], w1[0], selector);
      c0[2] = hc_byte_perm (w0[2], w0[3], selector);
      c0[1] = hc_byte_perm (w0[1], w0[2], selector);
      c0[0] = hc_byte_perm (w0[0], w0[1], selector);
      w7[3] = hc_byte_perm (    0, w0[0], selector);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      w7[3] = hc_bytealign_be (w7[2], w7[3], offset);
      w7[2] = hc_bytealign_be (w7[1], w7[2], offset);
      w7[1] = hc_bytealign_be (w7[0], w7[1], offset);
      w7[0] = hc_bytealign_be (w6[3], w7[0], offset);
      w6[3] = hc_bytealign_be (w6[2], w6[3], offset);
      w6[2] = hc_bytealign_be (w6[1], w6[2], offset);
      w6[1] = hc_bytealign_be (w6[0], w6[1], offset);
      w6[0] = hc_bytealign_be (w5[3], w6[0], offset);
      w5[3] = hc_bytealign_be (w5[2], w5[3], offset);
      w5[2] = hc_bytealign_be (w5[1], w5[2], offset);
      w5[1] = hc_bytealign_be (w5[0], w5[1], offset);
      w5[0] = hc_bytealign_be (w4[3], w5[0], offset);
      w4[3] = hc_bytealign_be (w4[2], w4[3], offset);
      w4[2] = hc_bytealign_be (w4[1], w4[2], offset);
      w4[1] = hc_bytealign_be (w4[0], w4[1], offset);
      w4[0] = hc_bytealign_be (w3[3], w4[0], offset);
      w3[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w3[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w3[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w2[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w2[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w1[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w1[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w0[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w0[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[0] = hc_bytealign_be (    0, w0[0], offset);

      break;

    case  1:
      w7[3] = hc_bytealign_be (w7[1], w7[2], offset);
      w7[2] = hc_bytealign_be (w7[0], w7[1], offset);
      w7[1] = hc_bytealign_be (w6[3], w7[0], offset);
      w7[0] = hc_bytealign_be (w6[2], w6[3], offset);
      w6[3] = hc_bytealign_be (w6[1], w6[2], offset);
      w6[2] = hc_bytealign_be (w6[0], w6[1], offset);
      w6[1] = hc_bytealign_be (w5[3], w6[0], offset);
      w6[0] = hc_bytealign_be (w5[2], w5[3], offset);
      w5[3] = hc_bytealign_be (w5[1], w5[2], offset);
      w5[2] = hc_bytealign_be (w5[0], w5[1], offset);
      w5[1] = hc_bytealign_be (w4[3], w5[0], offset);
      w5[0] = hc_bytealign_be (w4[2], w4[3], offset);
      w4[3] = hc_bytealign_be (w4[1], w4[2], offset);
      w4[2] = hc_bytealign_be (w4[0], w4[1], offset);
      w4[1] = hc_bytealign_be (w3[3], w4[0], offset);
      w4[0] = hc_bytealign_be (w3[2], w3[3], offset);
      w3[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w3[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w2[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w1[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w0[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[1] = hc_bytealign_be (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      w7[3] = hc_bytealign_be (w7[0], w7[1], offset);
      w7[2] = hc_bytealign_be (w6[3], w7[0], offset);
      w7[1] = hc_bytealign_be (w6[2], w6[3], offset);
      w7[0] = hc_bytealign_be (w6[1], w6[2], offset);
      w6[3] = hc_bytealign_be (w6[0], w6[1], offset);
      w6[2] = hc_bytealign_be (w5[3], w6[0], offset);
      w6[1] = hc_bytealign_be (w5[2], w5[3], offset);
      w6[0] = hc_bytealign_be (w5[1], w5[2], offset);
      w5[3] = hc_bytealign_be (w5[0], w5[1], offset);
      w5[2] = hc_bytealign_be (w4[3], w5[0], offset);
      w5[1] = hc_bytealign_be (w4[2], w4[3], offset);
      w5[0] = hc_bytealign_be (w4[1], w4[2], offset);
      w4[3] = hc_bytealign_be (w4[0], w4[1], offset);
      w4[2] = hc_bytealign_be (w3[3], w4[0], offset);
      w4[1] = hc_bytealign_be (w3[2], w3[3], offset);
      w4[0] = hc_bytealign_be (w3[1], w3[2], offset);
      w3[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[2] = hc_bytealign_be (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w7[3] = hc_bytealign_be (w6[3], w7[0], offset);
      w7[2] = hc_bytealign_be (w6[2], w6[3], offset);
      w7[1] = hc_bytealign_be (w6[1], w6[2], offset);
      w7[0] = hc_bytealign_be (w6[0], w6[1], offset);
      w6[3] = hc_bytealign_be (w5[3], w6[0], offset);
      w6[2] = hc_bytealign_be (w5[2], w5[3], offset);
      w6[1] = hc_bytealign_be (w5[1], w5[2], offset);
      w6[0] = hc_bytealign_be (w5[0], w5[1], offset);
      w5[3] = hc_bytealign_be (w4[3], w5[0], offset);
      w5[2] = hc_bytealign_be (w4[2], w4[3], offset);
      w5[1] = hc_bytealign_be (w4[1], w4[2], offset);
      w5[0] = hc_bytealign_be (w4[0], w4[1], offset);
      w4[3] = hc_bytealign_be (w3[3], w4[0], offset);
      w4[2] = hc_bytealign_be (w3[2], w3[3], offset);
      w4[1] = hc_bytealign_be (w3[1], w3[2], offset);
      w4[0] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[3] = hc_bytealign_be (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w7[3] = hc_bytealign_be (w6[2], w6[3], offset);
      w7[2] = hc_bytealign_be (w6[1], w6[2], offset);
      w7[1] = hc_bytealign_be (w6[0], w6[1], offset);
      w7[0] = hc_bytealign_be (w5[3], w6[0], offset);
      w6[3] = hc_bytealign_be (w5[2], w5[3], offset);
      w6[2] = hc_bytealign_be (w5[1], w5[2], offset);
      w6[1] = hc_bytealign_be (w5[0], w5[1], offset);
      w6[0] = hc_bytealign_be (w4[3], w5[0], offset);
      w5[3] = hc_bytealign_be (w4[2], w4[3], offset);
      w5[2] = hc_bytealign_be (w4[1], w4[2], offset);
      w5[1] = hc_bytealign_be (w4[0], w4[1], offset);
      w5[0] = hc_bytealign_be (w3[3], w4[0], offset);
      w4[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w4[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w4[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w4[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[0] = hc_bytealign_be (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w7[3] = hc_bytealign_be (w6[1], w6[2], offset);
      w7[2] = hc_bytealign_be (w6[0], w6[1], offset);
      w7[1] = hc_bytealign_be (w5[3], w6[0], offset);
      w7[0] = hc_bytealign_be (w5[2], w5[3], offset);
      w6[3] = hc_bytealign_be (w5[1], w5[2], offset);
      w6[2] = hc_bytealign_be (w5[0], w5[1], offset);
      w6[1] = hc_bytealign_be (w4[3], w5[0], offset);
      w6[0] = hc_bytealign_be (w4[2], w4[3], offset);
      w5[3] = hc_bytealign_be (w4[1], w4[2], offset);
      w5[2] = hc_bytealign_be (w4[0], w4[1], offset);
      w5[1] = hc_bytealign_be (w3[3], w4[0], offset);
      w5[0] = hc_bytealign_be (w3[2], w3[3], offset);
      w4[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w4[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w4[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w4[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[1] = hc_bytealign_be (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w7[3] = hc_bytealign_be (w6[0], w6[1], offset);
      w7[2] = hc_bytealign_be (w5[3], w6[0], offset);
      w7[1] = hc_bytealign_be (w5[2], w5[3], offset);
      w7[0] = hc_bytealign_be (w5[1], w5[2], offset);
      w6[3] = hc_bytealign_be (w5[0], w5[1], offset);
      w6[2] = hc_bytealign_be (w4[3], w5[0], offset);
      w6[1] = hc_bytealign_be (w4[2], w4[3], offset);
      w6[0] = hc_bytealign_be (w4[1], w4[2], offset);
      w5[3] = hc_bytealign_be (w4[0], w4[1], offset);
      w5[2] = hc_bytealign_be (w3[3], w4[0], offset);
      w5[1] = hc_bytealign_be (w3[2], w3[3], offset);
      w5[0] = hc_bytealign_be (w3[1], w3[2], offset);
      w4[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w4[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w4[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w4[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[2] = hc_bytealign_be (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w7[3] = hc_bytealign_be (w5[3], w6[0], offset);
      w7[2] = hc_bytealign_be (w5[2], w5[3], offset);
      w7[1] = hc_bytealign_be (w5[1], w5[2], offset);
      w7[0] = hc_bytealign_be (w5[0], w5[1], offset);
      w6[3] = hc_bytealign_be (w4[3], w5[0], offset);
      w6[2] = hc_bytealign_be (w4[2], w4[3], offset);
      w6[1] = hc_bytealign_be (w4[1], w4[2], offset);
      w6[0] = hc_bytealign_be (w4[0], w4[1], offset);
      w5[3] = hc_bytealign_be (w3[3], w4[0], offset);
      w5[2] = hc_bytealign_be (w3[2], w3[3], offset);
      w5[1] = hc_bytealign_be (w3[1], w3[2], offset);
      w5[0] = hc_bytealign_be (w3[0], w3[1], offset);
      w4[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w4[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w4[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w4[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[3] = hc_bytealign_be (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w7[3] = hc_bytealign_be (w5[2], w5[3], offset);
      w7[2] = hc_bytealign_be (w5[1], w5[2], offset);
      w7[1] = hc_bytealign_be (w5[0], w5[1], offset);
      w7[0] = hc_bytealign_be (w4[3], w5[0], offset);
      w6[3] = hc_bytealign_be (w4[2], w4[3], offset);
      w6[2] = hc_bytealign_be (w4[1], w4[2], offset);
      w6[1] = hc_bytealign_be (w4[0], w4[1], offset);
      w6[0] = hc_bytealign_be (w3[3], w4[0], offset);
      w5[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w5[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w5[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w5[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w4[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w4[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w4[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w4[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[0] = hc_bytealign_be (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w7[3] = hc_bytealign_be (w5[1], w5[2], offset);
      w7[2] = hc_bytealign_be (w5[0], w5[1], offset);
      w7[1] = hc_bytealign_be (w4[3], w5[0], offset);
      w7[0] = hc_bytealign_be (w4[2], w4[3], offset);
      w6[3] = hc_bytealign_be (w4[1], w4[2], offset);
      w6[2] = hc_bytealign_be (w4[0], w4[1], offset);
      w6[1] = hc_bytealign_be (w3[3], w4[0], offset);
      w6[0] = hc_bytealign_be (w3[2], w3[3], offset);
      w5[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w5[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w5[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w5[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w4[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w4[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w4[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w4[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[1] = hc_bytealign_be (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w7[3] = hc_bytealign_be (w5[0], w5[1], offset);
      w7[2] = hc_bytealign_be (w4[3], w5[0], offset);
      w7[1] = hc_bytealign_be (w4[2], w4[3], offset);
      w7[0] = hc_bytealign_be (w4[1], w4[2], offset);
      w6[3] = hc_bytealign_be (w4[0], w4[1], offset);
      w6[2] = hc_bytealign_be (w3[3], w4[0], offset);
      w6[1] = hc_bytealign_be (w3[2], w3[3], offset);
      w6[0] = hc_bytealign_be (w3[1], w3[2], offset);
      w5[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w5[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w5[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w5[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w4[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w4[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w4[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w4[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[2] = hc_bytealign_be (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w7[3] = hc_bytealign_be (w4[3], w5[0], offset);
      w7[2] = hc_bytealign_be (w4[2], w4[3], offset);
      w7[1] = hc_bytealign_be (w4[1], w4[2], offset);
      w7[0] = hc_bytealign_be (w4[0], w4[1], offset);
      w6[3] = hc_bytealign_be (w3[3], w4[0], offset);
      w6[2] = hc_bytealign_be (w3[2], w3[3], offset);
      w6[1] = hc_bytealign_be (w3[1], w3[2], offset);
      w6[0] = hc_bytealign_be (w3[0], w3[1], offset);
      w5[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w5[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w5[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w5[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w4[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w4[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w4[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w4[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[3] = hc_bytealign_be (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w7[3] = hc_bytealign_be (w4[2], w4[3], offset);
      w7[2] = hc_bytealign_be (w4[1], w4[2], offset);
      w7[1] = hc_bytealign_be (w4[0], w4[1], offset);
      w7[0] = hc_bytealign_be (w3[3], w4[0], offset);
      w6[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w6[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w6[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w6[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w5[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w5[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w5[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w5[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w4[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w4[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w4[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w4[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[0] = hc_bytealign_be (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w7[3] = hc_bytealign_be (w4[1], w4[2], offset);
      w7[2] = hc_bytealign_be (w4[0], w4[1], offset);
      w7[1] = hc_bytealign_be (w3[3], w4[0], offset);
      w7[0] = hc_bytealign_be (w3[2], w3[3], offset);
      w6[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w6[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w6[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w6[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w5[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w5[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w5[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w5[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w4[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w4[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w4[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w4[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[1] = hc_bytealign_be (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w7[3] = hc_bytealign_be (w4[0], w4[1], offset);
      w7[2] = hc_bytealign_be (w3[3], w4[0], offset);
      w7[1] = hc_bytealign_be (w3[2], w3[3], offset);
      w7[0] = hc_bytealign_be (w3[1], w3[2], offset);
      w6[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w6[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w6[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w6[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w5[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w5[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w5[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w5[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w4[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w4[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w4[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w4[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[2] = hc_bytealign_be (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w7[3] = hc_bytealign_be (w3[3], w4[0], offset);
      w7[2] = hc_bytealign_be (w3[2], w3[3], offset);
      w7[1] = hc_bytealign_be (w3[1], w3[2], offset);
      w7[0] = hc_bytealign_be (w3[0], w3[1], offset);
      w6[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w6[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w6[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w6[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w5[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w5[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w5[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w5[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w4[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w4[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w4[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w4[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[3] = hc_bytealign_be (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      w7[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w7[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w7[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w7[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w6[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w6[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w6[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w6[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w5[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w5[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w5[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w5[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w4[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w4[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w4[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w4[0] = hc_bytealign_be (    0, w0[0], offset);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      w7[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w7[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w7[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w7[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w6[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w6[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w6[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w6[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w5[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w5[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w5[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w5[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w4[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w4[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w4[1] = hc_bytealign_be (    0, w0[0], offset);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      w7[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w7[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w7[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w7[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w6[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w6[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w6[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w6[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w5[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w5[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w5[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w5[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w4[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w4[2] = hc_bytealign_be (    0, w0[0], offset);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      w7[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w7[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w7[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w7[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w6[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w6[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w6[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w6[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w5[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w5[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w5[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w5[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w4[3] = hc_bytealign_be (    0, w0[0], offset);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      w7[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w7[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w7[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w7[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w6[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w6[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w6[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w6[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w5[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w5[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w5[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w5[0] = hc_bytealign_be (    0, w0[0], offset);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      w7[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w7[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w7[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w7[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w6[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w6[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w6[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w6[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w5[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w5[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w5[1] = hc_bytealign_be (    0, w0[0], offset);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      w7[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w7[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w7[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w7[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w6[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w6[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w6[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w6[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w5[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w5[2] = hc_bytealign_be (    0, w0[0], offset);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      w7[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w7[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w7[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w7[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w6[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w6[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w6[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w6[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w5[3] = hc_bytealign_be (    0, w0[0], offset);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      w7[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w7[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w7[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w7[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w6[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w6[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w6[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w6[0] = hc_bytealign_be (    0, w0[0], offset);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      w7[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w7[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w7[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w7[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w6[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w6[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w6[1] = hc_bytealign_be (    0, w0[0], offset);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      w7[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w7[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w7[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w7[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w6[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w6[2] = hc_bytealign_be (    0, w0[0], offset);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      w7[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w7[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w7[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w7[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w6[3] = hc_bytealign_be (    0, w0[0], offset);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      w7[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w7[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w7[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w7[0] = hc_bytealign_be (    0, w0[0], offset);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      w7[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w7[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w7[1] = hc_bytealign_be (    0, w0[0], offset);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      w7[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w7[2] = hc_bytealign_be (    0, w0[0], offset);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      w7[3] = hc_bytealign_be (    0, w0[0], offset);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if (defined IS_AMD || defined IS_HIP)
  const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
  #endif

  switch (offset_switch)
  {
    case  0:
      w7[3] = hc_byte_perm (w7[3], w7[2], selector);
      w7[2] = hc_byte_perm (w7[2], w7[1], selector);
      w7[1] = hc_byte_perm (w7[1], w7[0], selector);
      w7[0] = hc_byte_perm (w7[0], w6[3], selector);
      w6[3] = hc_byte_perm (w6[3], w6[2], selector);
      w6[2] = hc_byte_perm (w6[2], w6[1], selector);
      w6[1] = hc_byte_perm (w6[1], w6[0], selector);
      w6[0] = hc_byte_perm (w6[0], w5[3], selector);
      w5[3] = hc_byte_perm (w5[3], w5[2], selector);
      w5[2] = hc_byte_perm (w5[2], w5[1], selector);
      w5[1] = hc_byte_perm (w5[1], w5[0], selector);
      w5[0] = hc_byte_perm (w5[0], w4[3], selector);
      w4[3] = hc_byte_perm (w4[3], w4[2], selector);
      w4[2] = hc_byte_perm (w4[2], w4[1], selector);
      w4[1] = hc_byte_perm (w4[1], w4[0], selector);
      w4[0] = hc_byte_perm (w4[0], w3[3], selector);
      w3[3] = hc_byte_perm (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm (w0[0],     0, selector);

      break;

    case  1:
      w7[3] = hc_byte_perm (w7[2], w7[1], selector);
      w7[2] = hc_byte_perm (w7[1], w7[0], selector);
      w7[1] = hc_byte_perm (w7[0], w6[3], selector);
      w7[0] = hc_byte_perm (w6[3], w6[2], selector);
      w6[3] = hc_byte_perm (w6[2], w6[1], selector);
      w6[2] = hc_byte_perm (w6[1], w6[0], selector);
      w6[1] = hc_byte_perm (w6[0], w5[3], selector);
      w6[0] = hc_byte_perm (w5[3], w5[2], selector);
      w5[3] = hc_byte_perm (w5[2], w5[1], selector);
      w5[2] = hc_byte_perm (w5[1], w5[0], selector);
      w5[1] = hc_byte_perm (w5[0], w4[3], selector);
      w5[0] = hc_byte_perm (w4[3], w4[2], selector);
      w4[3] = hc_byte_perm (w4[2], w4[1], selector);
      w4[2] = hc_byte_perm (w4[1], w4[0], selector);
      w4[1] = hc_byte_perm (w4[0], w3[3], selector);
      w4[0] = hc_byte_perm (w3[3], w3[2], selector);
      w3[3] = hc_byte_perm (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      w7[3] = hc_byte_perm (w7[1], w7[0], selector);
      w7[2] = hc_byte_perm (w7[0], w6[3], selector);
      w7[1] = hc_byte_perm (w6[3], w6[2], selector);
      w7[0] = hc_byte_perm (w6[2], w6[1], selector);
      w6[3] = hc_byte_perm (w6[1], w6[0], selector);
      w6[2] = hc_byte_perm (w6[0], w5[3], selector);
      w6[1] = hc_byte_perm (w5[3], w5[2], selector);
      w6[0] = hc_byte_perm (w5[2], w5[1], selector);
      w5[3] = hc_byte_perm (w5[1], w5[0], selector);
      w5[2] = hc_byte_perm (w5[0], w4[3], selector);
      w5[1] = hc_byte_perm (w4[3], w4[2], selector);
      w5[0] = hc_byte_perm (w4[2], w4[1], selector);
      w4[3] = hc_byte_perm (w4[1], w4[0], selector);
      w4[2] = hc_byte_perm (w4[0], w3[3], selector);
      w4[1] = hc_byte_perm (w3[3], w3[2], selector);
      w4[0] = hc_byte_perm (w3[2], w3[1], selector);
      w3[3] = hc_byte_perm (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      w7[3] = hc_byte_perm (w7[0], w6[3], selector);
      w7[2] = hc_byte_perm (w6[3], w6[2], selector);
      w7[1] = hc_byte_perm (w6[2], w6[1], selector);
      w7[0] = hc_byte_perm (w6[1], w6[0], selector);
      w6[3] = hc_byte_perm (w6[0], w5[3], selector);
      w6[2] = hc_byte_perm (w5[3], w5[2], selector);
      w6[1] = hc_byte_perm (w5[2], w5[1], selector);
      w6[0] = hc_byte_perm (w5[1], w5[0], selector);
      w5[3] = hc_byte_perm (w5[0], w4[3], selector);
      w5[2] = hc_byte_perm (w4[3], w4[2], selector);
      w5[1] = hc_byte_perm (w4[2], w4[1], selector);
      w5[0] = hc_byte_perm (w4[1], w4[0], selector);
      w4[3] = hc_byte_perm (w4[0], w3[3], selector);
      w4[2] = hc_byte_perm (w3[3], w3[2], selector);
      w4[1] = hc_byte_perm (w3[2], w3[1], selector);
      w4[0] = hc_byte_perm (w3[1], w3[0], selector);
      w3[3] = hc_byte_perm (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      w7[3] = hc_byte_perm (w6[3], w6[2], selector);
      w7[2] = hc_byte_perm (w6[2], w6[1], selector);
      w7[1] = hc_byte_perm (w6[1], w6[0], selector);
      w7[0] = hc_byte_perm (w6[0], w5[3], selector);
      w6[3] = hc_byte_perm (w5[3], w5[2], selector);
      w6[2] = hc_byte_perm (w5[2], w5[1], selector);
      w6[1] = hc_byte_perm (w5[1], w5[0], selector);
      w6[0] = hc_byte_perm (w5[0], w4[3], selector);
      w5[3] = hc_byte_perm (w4[3], w4[2], selector);
      w5[2] = hc_byte_perm (w4[2], w4[1], selector);
      w5[1] = hc_byte_perm (w4[1], w4[0], selector);
      w5[0] = hc_byte_perm (w4[0], w3[3], selector);
      w4[3] = hc_byte_perm (w3[3], w3[2], selector);
      w4[2] = hc_byte_perm (w3[2], w3[1], selector);
      w4[1] = hc_byte_perm (w3[1], w3[0], selector);
      w4[0] = hc_byte_perm (w3[0], w2[3], selector);
      w3[3] = hc_byte_perm (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      w7[3] = hc_byte_perm (w6[2], w6[1], selector);
      w7[2] = hc_byte_perm (w6[1], w6[0], selector);
      w7[1] = hc_byte_perm (w6[0], w5[3], selector);
      w7[0] = hc_byte_perm (w5[3], w5[2], selector);
      w6[3] = hc_byte_perm (w5[2], w5[1], selector);
      w6[2] = hc_byte_perm (w5[1], w5[0], selector);
      w6[1] = hc_byte_perm (w5[0], w4[3], selector);
      w6[0] = hc_byte_perm (w4[3], w4[2], selector);
      w5[3] = hc_byte_perm (w4[2], w4[1], selector);
      w5[2] = hc_byte_perm (w4[1], w4[0], selector);
      w5[1] = hc_byte_perm (w4[0], w3[3], selector);
      w5[0] = hc_byte_perm (w3[3], w3[2], selector);
      w4[3] = hc_byte_perm (w3[2], w3[1], selector);
      w4[2] = hc_byte_perm (w3[1], w3[0], selector);
      w4[1] = hc_byte_perm (w3[0], w2[3], selector);
      w4[0] = hc_byte_perm (w2[3], w2[2], selector);
      w3[3] = hc_byte_perm (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      w7[3] = hc_byte_perm (w6[1], w6[0], selector);
      w7[2] = hc_byte_perm (w6[0], w5[3], selector);
      w7[1] = hc_byte_perm (w5[3], w5[2], selector);
      w7[0] = hc_byte_perm (w5[2], w5[1], selector);
      w6[3] = hc_byte_perm (w5[1], w5[0], selector);
      w6[2] = hc_byte_perm (w5[0], w4[3], selector);
      w6[1] = hc_byte_perm (w4[3], w4[2], selector);
      w6[0] = hc_byte_perm (w4[2], w4[1], selector);
      w5[3] = hc_byte_perm (w4[1], w4[0], selector);
      w5[2] = hc_byte_perm (w4[0], w3[3], selector);
      w5[1] = hc_byte_perm (w3[3], w3[2], selector);
      w5[0] = hc_byte_perm (w3[2], w3[1], selector);
      w4[3] = hc_byte_perm (w3[1], w3[0], selector);
      w4[2] = hc_byte_perm (w3[0], w2[3], selector);
      w4[1] = hc_byte_perm (w2[3], w2[2], selector);
      w4[0] = hc_byte_perm (w2[2], w2[1], selector);
      w3[3] = hc_byte_perm (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      w7[3] = hc_byte_perm (w6[0], w5[3], selector);
      w7[2] = hc_byte_perm (w5[3], w5[2], selector);
      w7[1] = hc_byte_perm (w5[2], w5[1], selector);
      w7[0] = hc_byte_perm (w5[1], w5[0], selector);
      w6[3] = hc_byte_perm (w5[0], w4[3], selector);
      w6[2] = hc_byte_perm (w4[3], w4[2], selector);
      w6[1] = hc_byte_perm (w4[2], w4[1], selector);
      w6[0] = hc_byte_perm (w4[1], w4[0], selector);
      w5[3] = hc_byte_perm (w4[0], w3[3], selector);
      w5[2] = hc_byte_perm (w3[3], w3[2], selector);
      w5[1] = hc_byte_perm (w3[2], w3[1], selector);
      w5[0] = hc_byte_perm (w3[1], w3[0], selector);
      w4[3] = hc_byte_perm (w3[0], w2[3], selector);
      w4[2] = hc_byte_perm (w2[3], w2[2], selector);
      w4[1] = hc_byte_perm (w2[2], w2[1], selector);
      w4[0] = hc_byte_perm (w2[1], w2[0], selector);
      w3[3] = hc_byte_perm (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      w7[3] = hc_byte_perm (w5[3], w5[2], selector);
      w7[2] = hc_byte_perm (w5[2], w5[1], selector);
      w7[1] = hc_byte_perm (w5[1], w5[0], selector);
      w7[0] = hc_byte_perm (w5[0], w4[3], selector);
      w6[3] = hc_byte_perm (w4[3], w4[2], selector);
      w6[2] = hc_byte_perm (w4[2], w4[1], selector);
      w6[1] = hc_byte_perm (w4[1], w4[0], selector);
      w6[0] = hc_byte_perm (w4[0], w3[3], selector);
      w5[3] = hc_byte_perm (w3[3], w3[2], selector);
      w5[2] = hc_byte_perm (w3[2], w3[1], selector);
      w5[1] = hc_byte_perm (w3[1], w3[0], selector);
      w5[0] = hc_byte_perm (w3[0], w2[3], selector);
      w4[3] = hc_byte_perm (w2[3], w2[2], selector);
      w4[2] = hc_byte_perm (w2[2], w2[1], selector);
      w4[1] = hc_byte_perm (w2[1], w2[0], selector);
      w4[0] = hc_byte_perm (w2[0], w1[3], selector);
      w3[3] = hc_byte_perm (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      w7[3] = hc_byte_perm (w5[2], w5[1], selector);
      w7[2] = hc_byte_perm (w5[1], w5[0], selector);
      w7[1] = hc_byte_perm (w5[0], w4[3], selector);
      w7[0] = hc_byte_perm (w4[3], w4[2], selector);
      w6[3] = hc_byte_perm (w4[2], w4[1], selector);
      w6[2] = hc_byte_perm (w4[1], w4[0], selector);
      w6[1] = hc_byte_perm (w4[0], w3[3], selector);
      w6[0] = hc_byte_perm (w3[3], w3[2], selector);
      w5[3] = hc_byte_perm (w3[2], w3[1], selector);
      w5[2] = hc_byte_perm (w3[1], w3[0], selector);
      w5[1] = hc_byte_perm (w3[0], w2[3], selector);
      w5[0] = hc_byte_perm (w2[3], w2[2], selector);
      w4[3] = hc_byte_perm (w2[2], w2[1], selector);
      w4[2] = hc_byte_perm (w2[1], w2[0], selector);
      w4[1] = hc_byte_perm (w2[0], w1[3], selector);
      w4[0] = hc_byte_perm (w1[3], w1[2], selector);
      w3[3] = hc_byte_perm (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w7[3] = hc_byte_perm (w5[1], w5[0], selector);
      w7[2] = hc_byte_perm (w5[0], w4[3], selector);
      w7[1] = hc_byte_perm (w4[3], w4[2], selector);
      w7[0] = hc_byte_perm (w4[2], w4[1], selector);
      w6[3] = hc_byte_perm (w4[1], w4[0], selector);
      w6[2] = hc_byte_perm (w4[0], w3[3], selector);
      w6[1] = hc_byte_perm (w3[3], w3[2], selector);
      w6[0] = hc_byte_perm (w3[2], w3[1], selector);
      w5[3] = hc_byte_perm (w3[1], w3[0], selector);
      w5[2] = hc_byte_perm (w3[0], w2[3], selector);
      w5[1] = hc_byte_perm (w2[3], w2[2], selector);
      w5[0] = hc_byte_perm (w2[2], w2[1], selector);
      w4[3] = hc_byte_perm (w2[1], w2[0], selector);
      w4[2] = hc_byte_perm (w2[0], w1[3], selector);
      w4[1] = hc_byte_perm (w1[3], w1[2], selector);
      w4[0] = hc_byte_perm (w1[2], w1[1], selector);
      w3[3] = hc_byte_perm (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w7[3] = hc_byte_perm (w5[0], w4[3], selector);
      w7[2] = hc_byte_perm (w4[3], w4[2], selector);
      w7[1] = hc_byte_perm (w4[2], w4[1], selector);
      w7[0] = hc_byte_perm (w4[1], w4[0], selector);
      w6[3] = hc_byte_perm (w4[0], w3[3], selector);
      w6[2] = hc_byte_perm (w3[3], w3[2], selector);
      w6[1] = hc_byte_perm (w3[2], w3[1], selector);
      w6[0] = hc_byte_perm (w3[1], w3[0], selector);
      w5[3] = hc_byte_perm (w3[0], w2[3], selector);
      w5[2] = hc_byte_perm (w2[3], w2[2], selector);
      w5[1] = hc_byte_perm (w2[2], w2[1], selector);
      w5[0] = hc_byte_perm (w2[1], w2[0], selector);
      w4[3] = hc_byte_perm (w2[0], w1[3], selector);
      w4[2] = hc_byte_perm (w1[3], w1[2], selector);
      w4[1] = hc_byte_perm (w1[2], w1[1], selector);
      w4[0] = hc_byte_perm (w1[1], w1[0], selector);
      w3[3] = hc_byte_perm (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w7[3] = hc_byte_perm (w4[3], w4[2], selector);
      w7[2] = hc_byte_perm (w4[2], w4[1], selector);
      w7[1] = hc_byte_perm (w4[1], w4[0], selector);
      w7[0] = hc_byte_perm (w4[0], w3[3], selector);
      w6[3] = hc_byte_perm (w3[3], w3[2], selector);
      w6[2] = hc_byte_perm (w3[2], w3[1], selector);
      w6[1] = hc_byte_perm (w3[1], w3[0], selector);
      w6[0] = hc_byte_perm (w3[0], w2[3], selector);
      w5[3] = hc_byte_perm (w2[3], w2[2], selector);
      w5[2] = hc_byte_perm (w2[2], w2[1], selector);
      w5[1] = hc_byte_perm (w2[1], w2[0], selector);
      w5[0] = hc_byte_perm (w2[0], w1[3], selector);
      w4[3] = hc_byte_perm (w1[3], w1[2], selector);
      w4[2] = hc_byte_perm (w1[2], w1[1], selector);
      w4[1] = hc_byte_perm (w1[1], w1[0], selector);
      w4[0] = hc_byte_perm (w1[0], w0[3], selector);
      w3[3] = hc_byte_perm (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w7[3] = hc_byte_perm (w4[2], w4[1], selector);
      w7[2] = hc_byte_perm (w4[1], w4[0], selector);
      w7[1] = hc_byte_perm (w4[0], w3[3], selector);
      w7[0] = hc_byte_perm (w3[3], w3[2], selector);
      w6[3] = hc_byte_perm (w3[2], w3[1], selector);
      w6[2] = hc_byte_perm (w3[1], w3[0], selector);
      w6[1] = hc_byte_perm (w3[0], w2[3], selector);
      w6[0] = hc_byte_perm (w2[3], w2[2], selector);
      w5[3] = hc_byte_perm (w2[2], w2[1], selector);
      w5[2] = hc_byte_perm (w2[1], w2[0], selector);
      w5[1] = hc_byte_perm (w2[0], w1[3], selector);
      w5[0] = hc_byte_perm (w1[3], w1[2], selector);
      w4[3] = hc_byte_perm (w1[2], w1[1], selector);
      w4[2] = hc_byte_perm (w1[1], w1[0], selector);
      w4[1] = hc_byte_perm (w1[0], w0[3], selector);
      w4[0] = hc_byte_perm (w0[3], w0[2], selector);
      w3[3] = hc_byte_perm (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w7[3] = hc_byte_perm (w4[1], w4[0], selector);
      w7[2] = hc_byte_perm (w4[0], w3[3], selector);
      w7[1] = hc_byte_perm (w3[3], w3[2], selector);
      w7[0] = hc_byte_perm (w3[2], w3[1], selector);
      w6[3] = hc_byte_perm (w3[1], w3[0], selector);
      w6[2] = hc_byte_perm (w3[0], w2[3], selector);
      w6[1] = hc_byte_perm (w2[3], w2[2], selector);
      w6[0] = hc_byte_perm (w2[2], w2[1], selector);
      w5[3] = hc_byte_perm (w2[1], w2[0], selector);
      w5[2] = hc_byte_perm (w2[0], w1[3], selector);
      w5[1] = hc_byte_perm (w1[3], w1[2], selector);
      w5[0] = hc_byte_perm (w1[2], w1[1], selector);
      w4[3] = hc_byte_perm (w1[1], w1[0], selector);
      w4[2] = hc_byte_perm (w1[0], w0[3], selector);
      w4[1] = hc_byte_perm (w0[3], w0[2], selector);
      w4[0] = hc_byte_perm (w0[2], w0[1], selector);
      w3[3] = hc_byte_perm (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w7[3] = hc_byte_perm (w4[0], w3[3], selector);
      w7[2] = hc_byte_perm (w3[3], w3[2], selector);
      w7[1] = hc_byte_perm (w3[2], w3[1], selector);
      w7[0] = hc_byte_perm (w3[1], w3[0], selector);
      w6[3] = hc_byte_perm (w3[0], w2[3], selector);
      w6[2] = hc_byte_perm (w2[3], w2[2], selector);
      w6[1] = hc_byte_perm (w2[2], w2[1], selector);
      w6[0] = hc_byte_perm (w2[1], w2[0], selector);
      w5[3] = hc_byte_perm (w2[0], w1[3], selector);
      w5[2] = hc_byte_perm (w1[3], w1[2], selector);
      w5[1] = hc_byte_perm (w1[2], w1[1], selector);
      w5[0] = hc_byte_perm (w1[1], w1[0], selector);
      w4[3] = hc_byte_perm (w1[0], w0[3], selector);
      w4[2] = hc_byte_perm (w0[3], w0[2], selector);
      w4[1] = hc_byte_perm (w0[2], w0[1], selector);
      w4[0] = hc_byte_perm (w0[1], w0[0], selector);
      w3[3] = hc_byte_perm (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      w7[3] = hc_byte_perm (w3[3], w3[2], selector);
      w7[2] = hc_byte_perm (w3[2], w3[1], selector);
      w7[1] = hc_byte_perm (w3[1], w3[0], selector);
      w7[0] = hc_byte_perm (w3[0], w2[3], selector);
      w6[3] = hc_byte_perm (w2[3], w2[2], selector);
      w6[2] = hc_byte_perm (w2[2], w2[1], selector);
      w6[1] = hc_byte_perm (w2[1], w2[0], selector);
      w6[0] = hc_byte_perm (w2[0], w1[3], selector);
      w5[3] = hc_byte_perm (w1[3], w1[2], selector);
      w5[2] = hc_byte_perm (w1[2], w1[1], selector);
      w5[1] = hc_byte_perm (w1[1], w1[0], selector);
      w5[0] = hc_byte_perm (w1[0], w0[3], selector);
      w4[3] = hc_byte_perm (w0[3], w0[2], selector);
      w4[2] = hc_byte_perm (w0[2], w0[1], selector);
      w4[1] = hc_byte_perm (w0[1], w0[0], selector);
      w4[0] = hc_byte_perm (w0[0],     0, selector);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      w7[3] = hc_byte_perm (w3[2], w3[1], selector);
      w7[2] = hc_byte_perm (w3[1], w3[0], selector);
      w7[1] = hc_byte_perm (w3[0], w2[3], selector);
      w7[0] = hc_byte_perm (w2[3], w2[2], selector);
      w6[3] = hc_byte_perm (w2[2], w2[1], selector);
      w6[2] = hc_byte_perm (w2[1], w2[0], selector);
      w6[1] = hc_byte_perm (w2[0], w1[3], selector);
      w6[0] = hc_byte_perm (w1[3], w1[2], selector);
      w5[3] = hc_byte_perm (w1[2], w1[1], selector);
      w5[2] = hc_byte_perm (w1[1], w1[0], selector);
      w5[1] = hc_byte_perm (w1[0], w0[3], selector);
      w5[0] = hc_byte_perm (w0[3], w0[2], selector);
      w4[3] = hc_byte_perm (w0[2], w0[1], selector);
      w4[2] = hc_byte_perm (w0[1], w0[0], selector);
      w4[1] = hc_byte_perm (w0[0],     0, selector);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      w7[3] = hc_byte_perm (w3[1], w3[0], selector);
      w7[2] = hc_byte_perm (w3[0], w2[3], selector);
      w7[1] = hc_byte_perm (w2[3], w2[2], selector);
      w7[0] = hc_byte_perm (w2[2], w2[1], selector);
      w6[3] = hc_byte_perm (w2[1], w2[0], selector);
      w6[2] = hc_byte_perm (w2[0], w1[3], selector);
      w6[1] = hc_byte_perm (w1[3], w1[2], selector);
      w6[0] = hc_byte_perm (w1[2], w1[1], selector);
      w5[3] = hc_byte_perm (w1[1], w1[0], selector);
      w5[2] = hc_byte_perm (w1[0], w0[3], selector);
      w5[1] = hc_byte_perm (w0[3], w0[2], selector);
      w5[0] = hc_byte_perm (w0[2], w0[1], selector);
      w4[3] = hc_byte_perm (w0[1], w0[0], selector);
      w4[2] = hc_byte_perm (w0[0],     0, selector);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      w7[3] = hc_byte_perm (w3[0], w2[3], selector);
      w7[2] = hc_byte_perm (w2[3], w2[2], selector);
      w7[1] = hc_byte_perm (w2[2], w2[1], selector);
      w7[0] = hc_byte_perm (w2[1], w2[0], selector);
      w6[3] = hc_byte_perm (w2[0], w1[3], selector);
      w6[2] = hc_byte_perm (w1[3], w1[2], selector);
      w6[1] = hc_byte_perm (w1[2], w1[1], selector);
      w6[0] = hc_byte_perm (w1[1], w1[0], selector);
      w5[3] = hc_byte_perm (w1[0], w0[3], selector);
      w5[2] = hc_byte_perm (w0[3], w0[2], selector);
      w5[1] = hc_byte_perm (w0[2], w0[1], selector);
      w5[0] = hc_byte_perm (w0[1], w0[0], selector);
      w4[3] = hc_byte_perm (w0[0],     0, selector);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      w7[3] = hc_byte_perm (w2[3], w2[2], selector);
      w7[2] = hc_byte_perm (w2[2], w2[1], selector);
      w7[1] = hc_byte_perm (w2[1], w2[0], selector);
      w7[0] = hc_byte_perm (w2[0], w1[3], selector);
      w6[3] = hc_byte_perm (w1[3], w1[2], selector);
      w6[2] = hc_byte_perm (w1[2], w1[1], selector);
      w6[1] = hc_byte_perm (w1[1], w1[0], selector);
      w6[0] = hc_byte_perm (w1[0], w0[3], selector);
      w5[3] = hc_byte_perm (w0[3], w0[2], selector);
      w5[2] = hc_byte_perm (w0[2], w0[1], selector);
      w5[1] = hc_byte_perm (w0[1], w0[0], selector);
      w5[0] = hc_byte_perm (w0[0],     0, selector);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      w7[3] = hc_byte_perm (w2[2], w2[1], selector);
      w7[2] = hc_byte_perm (w2[1], w2[0], selector);
      w7[1] = hc_byte_perm (w2[0], w1[3], selector);
      w7[0] = hc_byte_perm (w1[3], w1[2], selector);
      w6[3] = hc_byte_perm (w1[2], w1[1], selector);
      w6[2] = hc_byte_perm (w1[1], w1[0], selector);
      w6[1] = hc_byte_perm (w1[0], w0[3], selector);
      w6[0] = hc_byte_perm (w0[3], w0[2], selector);
      w5[3] = hc_byte_perm (w0[2], w0[1], selector);
      w5[2] = hc_byte_perm (w0[1], w0[0], selector);
      w5[1] = hc_byte_perm (w0[0],     0, selector);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      w7[3] = hc_byte_perm (w2[1], w2[0], selector);
      w7[2] = hc_byte_perm (w2[0], w1[3], selector);
      w7[1] = hc_byte_perm (w1[3], w1[2], selector);
      w7[0] = hc_byte_perm (w1[2], w1[1], selector);
      w6[3] = hc_byte_perm (w1[1], w1[0], selector);
      w6[2] = hc_byte_perm (w1[0], w0[3], selector);
      w6[1] = hc_byte_perm (w0[3], w0[2], selector);
      w6[0] = hc_byte_perm (w0[2], w0[1], selector);
      w5[3] = hc_byte_perm (w0[1], w0[0], selector);
      w5[2] = hc_byte_perm (w0[0],     0, selector);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      w7[3] = hc_byte_perm (w2[0], w1[3], selector);
      w7[2] = hc_byte_perm (w1[3], w1[2], selector);
      w7[1] = hc_byte_perm (w1[2], w1[1], selector);
      w7[0] = hc_byte_perm (w1[1], w1[0], selector);
      w6[3] = hc_byte_perm (w1[0], w0[3], selector);
      w6[2] = hc_byte_perm (w0[3], w0[2], selector);
      w6[1] = hc_byte_perm (w0[2], w0[1], selector);
      w6[0] = hc_byte_perm (w0[1], w0[0], selector);
      w5[3] = hc_byte_perm (w0[0],     0, selector);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      w7[3] = hc_byte_perm (w1[3], w1[2], selector);
      w7[2] = hc_byte_perm (w1[2], w1[1], selector);
      w7[1] = hc_byte_perm (w1[1], w1[0], selector);
      w7[0] = hc_byte_perm (w1[0], w0[3], selector);
      w6[3] = hc_byte_perm (w0[3], w0[2], selector);
      w6[2] = hc_byte_perm (w0[2], w0[1], selector);
      w6[1] = hc_byte_perm (w0[1], w0[0], selector);
      w6[0] = hc_byte_perm (w0[0],     0, selector);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      w7[3] = hc_byte_perm (w1[2], w1[1], selector);
      w7[2] = hc_byte_perm (w1[1], w1[0], selector);
      w7[1] = hc_byte_perm (w1[0], w0[3], selector);
      w7[0] = hc_byte_perm (w0[3], w0[2], selector);
      w6[3] = hc_byte_perm (w0[2], w0[1], selector);
      w6[2] = hc_byte_perm (w0[1], w0[0], selector);
      w6[1] = hc_byte_perm (w0[0],     0, selector);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      w7[3] = hc_byte_perm (w1[1], w1[0], selector);
      w7[2] = hc_byte_perm (w1[0], w0[3], selector);
      w7[1] = hc_byte_perm (w0[3], w0[2], selector);
      w7[0] = hc_byte_perm (w0[2], w0[1], selector);
      w6[3] = hc_byte_perm (w0[1], w0[0], selector);
      w6[2] = hc_byte_perm (w0[0],     0, selector);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      w7[3] = hc_byte_perm (w1[0], w0[3], selector);
      w7[2] = hc_byte_perm (w0[3], w0[2], selector);
      w7[1] = hc_byte_perm (w0[2], w0[1], selector);
      w7[0] = hc_byte_perm (w0[1], w0[0], selector);
      w6[3] = hc_byte_perm (w0[0],     0, selector);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      w7[3] = hc_byte_perm (w0[3], w0[2], selector);
      w7[2] = hc_byte_perm (w0[2], w0[1], selector);
      w7[1] = hc_byte_perm (w0[1], w0[0], selector);
      w7[0] = hc_byte_perm (w0[0],     0, selector);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      w7[3] = hc_byte_perm (w0[2], w0[1], selector);
      w7[2] = hc_byte_perm (w0[1], w0[0], selector);
      w7[1] = hc_byte_perm (w0[0],     0, selector);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      w7[3] = hc_byte_perm (w0[1], w0[0], selector);
      w7[2] = hc_byte_perm (w0[0],     0, selector);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      w7[3] = hc_byte_perm (w0[0],     0, selector);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, u32x *c0, u32x *c1, u32x *c2, u32x *c3, u32x *c4, u32x *c5, u32x *c6, u32x *c7, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_bytealign_be (w7[3],     0, offset);
      w7[3] = hc_bytealign_be (w7[2], w7[3], offset);
      w7[2] = hc_bytealign_be (w7[1], w7[2], offset);
      w7[1] = hc_bytealign_be (w7[0], w7[1], offset);
      w7[0] = hc_bytealign_be (w6[3], w7[0], offset);
      w6[3] = hc_bytealign_be (w6[2], w6[3], offset);
      w6[2] = hc_bytealign_be (w6[1], w6[2], offset);
      w6[1] = hc_bytealign_be (w6[0], w6[1], offset);
      w6[0] = hc_bytealign_be (w5[3], w6[0], offset);
      w5[3] = hc_bytealign_be (w5[2], w5[3], offset);
      w5[2] = hc_bytealign_be (w5[1], w5[2], offset);
      w5[1] = hc_bytealign_be (w5[0], w5[1], offset);
      w5[0] = hc_bytealign_be (w4[3], w5[0], offset);
      w4[3] = hc_bytealign_be (w4[2], w4[3], offset);
      w4[2] = hc_bytealign_be (w4[1], w4[2], offset);
      w4[1] = hc_bytealign_be (w4[0], w4[1], offset);
      w4[0] = hc_bytealign_be (w3[3], w4[0], offset);
      w3[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w3[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w3[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w2[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w2[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w1[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w1[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w0[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w0[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[0] = hc_bytealign_be (    0, w0[0], offset);

      break;

    case  1:
      c0[1] = hc_bytealign_be (w7[3],     0, offset);
      c0[0] = hc_bytealign_be (w7[2], w7[3], offset);
      w7[3] = hc_bytealign_be (w7[1], w7[2], offset);
      w7[2] = hc_bytealign_be (w7[0], w7[1], offset);
      w7[1] = hc_bytealign_be (w6[3], w7[0], offset);
      w7[0] = hc_bytealign_be (w6[2], w6[3], offset);
      w6[3] = hc_bytealign_be (w6[1], w6[2], offset);
      w6[2] = hc_bytealign_be (w6[0], w6[1], offset);
      w6[1] = hc_bytealign_be (w5[3], w6[0], offset);
      w6[0] = hc_bytealign_be (w5[2], w5[3], offset);
      w5[3] = hc_bytealign_be (w5[1], w5[2], offset);
      w5[2] = hc_bytealign_be (w5[0], w5[1], offset);
      w5[1] = hc_bytealign_be (w4[3], w5[0], offset);
      w5[0] = hc_bytealign_be (w4[2], w4[3], offset);
      w4[3] = hc_bytealign_be (w4[1], w4[2], offset);
      w4[2] = hc_bytealign_be (w4[0], w4[1], offset);
      w4[1] = hc_bytealign_be (w3[3], w4[0], offset);
      w4[0] = hc_bytealign_be (w3[2], w3[3], offset);
      w3[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w3[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w2[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w1[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w0[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[1] = hc_bytealign_be (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_bytealign_be (w7[3],     0, offset);
      c0[1] = hc_bytealign_be (w7[2], w7[3], offset);
      c0[0] = hc_bytealign_be (w7[1], w7[2], offset);
      w7[3] = hc_bytealign_be (w7[0], w7[1], offset);
      w7[2] = hc_bytealign_be (w6[3], w7[0], offset);
      w7[1] = hc_bytealign_be (w6[2], w6[3], offset);
      w7[0] = hc_bytealign_be (w6[1], w6[2], offset);
      w6[3] = hc_bytealign_be (w6[0], w6[1], offset);
      w6[2] = hc_bytealign_be (w5[3], w6[0], offset);
      w6[1] = hc_bytealign_be (w5[2], w5[3], offset);
      w6[0] = hc_bytealign_be (w5[1], w5[2], offset);
      w5[3] = hc_bytealign_be (w5[0], w5[1], offset);
      w5[2] = hc_bytealign_be (w4[3], w5[0], offset);
      w5[1] = hc_bytealign_be (w4[2], w4[3], offset);
      w5[0] = hc_bytealign_be (w4[1], w4[2], offset);
      w4[3] = hc_bytealign_be (w4[0], w4[1], offset);
      w4[2] = hc_bytealign_be (w3[3], w4[0], offset);
      w4[1] = hc_bytealign_be (w3[2], w3[3], offset);
      w4[0] = hc_bytealign_be (w3[1], w3[2], offset);
      w3[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w2[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w1[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w0[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[2] = hc_bytealign_be (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_bytealign_be (w7[3],     0, offset);
      c0[2] = hc_bytealign_be (w7[2], w7[3], offset);
      c0[1] = hc_bytealign_be (w7[1], w7[2], offset);
      c0[0] = hc_bytealign_be (w7[0], w7[1], offset);
      w7[3] = hc_bytealign_be (w6[3], w7[0], offset);
      w7[2] = hc_bytealign_be (w6[2], w6[3], offset);
      w7[1] = hc_bytealign_be (w6[1], w6[2], offset);
      w7[0] = hc_bytealign_be (w6[0], w6[1], offset);
      w6[3] = hc_bytealign_be (w5[3], w6[0], offset);
      w6[2] = hc_bytealign_be (w5[2], w5[3], offset);
      w6[1] = hc_bytealign_be (w5[1], w5[2], offset);
      w6[0] = hc_bytealign_be (w5[0], w5[1], offset);
      w5[3] = hc_bytealign_be (w4[3], w5[0], offset);
      w5[2] = hc_bytealign_be (w4[2], w4[3], offset);
      w5[1] = hc_bytealign_be (w4[1], w4[2], offset);
      w5[0] = hc_bytealign_be (w4[0], w4[1], offset);
      w4[3] = hc_bytealign_be (w3[3], w4[0], offset);
      w4[2] = hc_bytealign_be (w3[2], w3[3], offset);
      w4[1] = hc_bytealign_be (w3[1], w3[2], offset);
      w4[0] = hc_bytealign_be (w3[0], w3[1], offset);
      w3[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w2[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w1[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w0[3] = hc_bytealign_be (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_bytealign_be (w7[3],     0, offset);
      c0[3] = hc_bytealign_be (w7[2], w7[3], offset);
      c0[2] = hc_bytealign_be (w7[1], w7[2], offset);
      c0[1] = hc_bytealign_be (w7[0], w7[1], offset);
      c0[0] = hc_bytealign_be (w6[3], w7[0], offset);
      w7[3] = hc_bytealign_be (w6[2], w6[3], offset);
      w7[2] = hc_bytealign_be (w6[1], w6[2], offset);
      w7[1] = hc_bytealign_be (w6[0], w6[1], offset);
      w7[0] = hc_bytealign_be (w5[3], w6[0], offset);
      w6[3] = hc_bytealign_be (w5[2], w5[3], offset);
      w6[2] = hc_bytealign_be (w5[1], w5[2], offset);
      w6[1] = hc_bytealign_be (w5[0], w5[1], offset);
      w6[0] = hc_bytealign_be (w4[3], w5[0], offset);
      w5[3] = hc_bytealign_be (w4[2], w4[3], offset);
      w5[2] = hc_bytealign_be (w4[1], w4[2], offset);
      w5[1] = hc_bytealign_be (w4[0], w4[1], offset);
      w5[0] = hc_bytealign_be (w3[3], w4[0], offset);
      w4[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w4[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w4[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w4[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w3[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w2[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w1[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[0] = hc_bytealign_be (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_bytealign_be (w7[3],     0, offset);
      c1[0] = hc_bytealign_be (w7[2], w7[3], offset);
      c0[3] = hc_bytealign_be (w7[1], w7[2], offset);
      c0[2] = hc_bytealign_be (w7[0], w7[1], offset);
      c0[1] = hc_bytealign_be (w6[3], w7[0], offset);
      c0[0] = hc_bytealign_be (w6[2], w6[3], offset);
      w7[3] = hc_bytealign_be (w6[1], w6[2], offset);
      w7[2] = hc_bytealign_be (w6[0], w6[1], offset);
      w7[1] = hc_bytealign_be (w5[3], w6[0], offset);
      w7[0] = hc_bytealign_be (w5[2], w5[3], offset);
      w6[3] = hc_bytealign_be (w5[1], w5[2], offset);
      w6[2] = hc_bytealign_be (w5[0], w5[1], offset);
      w6[1] = hc_bytealign_be (w4[3], w5[0], offset);
      w6[0] = hc_bytealign_be (w4[2], w4[3], offset);
      w5[3] = hc_bytealign_be (w4[1], w4[2], offset);
      w5[2] = hc_bytealign_be (w4[0], w4[1], offset);
      w5[1] = hc_bytealign_be (w3[3], w4[0], offset);
      w5[0] = hc_bytealign_be (w3[2], w3[3], offset);
      w4[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w4[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w4[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w4[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w3[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w2[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w1[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[1] = hc_bytealign_be (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_bytealign_be (w7[3],     0, offset);
      c1[1] = hc_bytealign_be (w7[2], w7[3], offset);
      c1[0] = hc_bytealign_be (w7[1], w7[2], offset);
      c0[3] = hc_bytealign_be (w7[0], w7[1], offset);
      c0[2] = hc_bytealign_be (w6[3], w7[0], offset);
      c0[1] = hc_bytealign_be (w6[2], w6[3], offset);
      c0[0] = hc_bytealign_be (w6[1], w6[2], offset);
      w7[3] = hc_bytealign_be (w6[0], w6[1], offset);
      w7[2] = hc_bytealign_be (w5[3], w6[0], offset);
      w7[1] = hc_bytealign_be (w5[2], w5[3], offset);
      w7[0] = hc_bytealign_be (w5[1], w5[2], offset);
      w6[3] = hc_bytealign_be (w5[0], w5[1], offset);
      w6[2] = hc_bytealign_be (w4[3], w5[0], offset);
      w6[1] = hc_bytealign_be (w4[2], w4[3], offset);
      w6[0] = hc_bytealign_be (w4[1], w4[2], offset);
      w5[3] = hc_bytealign_be (w4[0], w4[1], offset);
      w5[2] = hc_bytealign_be (w3[3], w4[0], offset);
      w5[1] = hc_bytealign_be (w3[2], w3[3], offset);
      w5[0] = hc_bytealign_be (w3[1], w3[2], offset);
      w4[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w4[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w4[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w4[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w3[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w2[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w1[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[2] = hc_bytealign_be (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_bytealign_be (w7[3],     0, offset);
      c1[2] = hc_bytealign_be (w7[2], w7[3], offset);
      c1[1] = hc_bytealign_be (w7[1], w7[2], offset);
      c1[0] = hc_bytealign_be (w7[0], w7[1], offset);
      c0[3] = hc_bytealign_be (w6[3], w7[0], offset);
      c0[2] = hc_bytealign_be (w6[2], w6[3], offset);
      c0[1] = hc_bytealign_be (w6[1], w6[2], offset);
      c0[0] = hc_bytealign_be (w6[0], w6[1], offset);
      w7[3] = hc_bytealign_be (w5[3], w6[0], offset);
      w7[2] = hc_bytealign_be (w5[2], w5[3], offset);
      w7[1] = hc_bytealign_be (w5[1], w5[2], offset);
      w7[0] = hc_bytealign_be (w5[0], w5[1], offset);
      w6[3] = hc_bytealign_be (w4[3], w5[0], offset);
      w6[2] = hc_bytealign_be (w4[2], w4[3], offset);
      w6[1] = hc_bytealign_be (w4[1], w4[2], offset);
      w6[0] = hc_bytealign_be (w4[0], w4[1], offset);
      w5[3] = hc_bytealign_be (w3[3], w4[0], offset);
      w5[2] = hc_bytealign_be (w3[2], w3[3], offset);
      w5[1] = hc_bytealign_be (w3[1], w3[2], offset);
      w5[0] = hc_bytealign_be (w3[0], w3[1], offset);
      w4[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w4[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w4[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w4[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w3[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w2[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w1[3] = hc_bytealign_be (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_bytealign_be (w7[3],     0, offset);
      c1[3] = hc_bytealign_be (w7[2], w7[3], offset);
      c1[2] = hc_bytealign_be (w7[1], w7[2], offset);
      c1[1] = hc_bytealign_be (w7[0], w7[1], offset);
      c1[0] = hc_bytealign_be (w6[3], w7[0], offset);
      c0[3] = hc_bytealign_be (w6[2], w6[3], offset);
      c0[2] = hc_bytealign_be (w6[1], w6[2], offset);
      c0[1] = hc_bytealign_be (w6[0], w6[1], offset);
      c0[0] = hc_bytealign_be (w5[3], w6[0], offset);
      w7[3] = hc_bytealign_be (w5[2], w5[3], offset);
      w7[2] = hc_bytealign_be (w5[1], w5[2], offset);
      w7[1] = hc_bytealign_be (w5[0], w5[1], offset);
      w7[0] = hc_bytealign_be (w4[3], w5[0], offset);
      w6[3] = hc_bytealign_be (w4[2], w4[3], offset);
      w6[2] = hc_bytealign_be (w4[1], w4[2], offset);
      w6[1] = hc_bytealign_be (w4[0], w4[1], offset);
      w6[0] = hc_bytealign_be (w3[3], w4[0], offset);
      w5[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w5[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w5[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w5[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w4[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w4[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w4[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w4[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w3[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w2[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[0] = hc_bytealign_be (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_bytealign_be (w7[3],     0, offset);
      c2[0] = hc_bytealign_be (w7[2], w7[3], offset);
      c1[3] = hc_bytealign_be (w7[1], w7[2], offset);
      c1[2] = hc_bytealign_be (w7[0], w7[1], offset);
      c1[1] = hc_bytealign_be (w6[3], w7[0], offset);
      c1[0] = hc_bytealign_be (w6[2], w6[3], offset);
      c0[3] = hc_bytealign_be (w6[1], w6[2], offset);
      c0[2] = hc_bytealign_be (w6[0], w6[1], offset);
      c0[1] = hc_bytealign_be (w5[3], w6[0], offset);
      c0[0] = hc_bytealign_be (w5[2], w5[3], offset);
      w7[3] = hc_bytealign_be (w5[1], w5[2], offset);
      w7[2] = hc_bytealign_be (w5[0], w5[1], offset);
      w7[1] = hc_bytealign_be (w4[3], w5[0], offset);
      w7[0] = hc_bytealign_be (w4[2], w4[3], offset);
      w6[3] = hc_bytealign_be (w4[1], w4[2], offset);
      w6[2] = hc_bytealign_be (w4[0], w4[1], offset);
      w6[1] = hc_bytealign_be (w3[3], w4[0], offset);
      w6[0] = hc_bytealign_be (w3[2], w3[3], offset);
      w5[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w5[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w5[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w5[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w4[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w4[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w4[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w4[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w3[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w2[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[1] = hc_bytealign_be (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_bytealign_be (w7[3],     0, offset);
      c2[1] = hc_bytealign_be (w7[2], w7[3], offset);
      c2[0] = hc_bytealign_be (w7[1], w7[2], offset);
      c1[3] = hc_bytealign_be (w7[0], w7[1], offset);
      c1[2] = hc_bytealign_be (w6[3], w7[0], offset);
      c1[1] = hc_bytealign_be (w6[2], w6[3], offset);
      c1[0] = hc_bytealign_be (w6[1], w6[2], offset);
      c0[3] = hc_bytealign_be (w6[0], w6[1], offset);
      c0[2] = hc_bytealign_be (w5[3], w6[0], offset);
      c0[1] = hc_bytealign_be (w5[2], w5[3], offset);
      c0[0] = hc_bytealign_be (w5[1], w5[2], offset);
      w7[3] = hc_bytealign_be (w5[0], w5[1], offset);
      w7[2] = hc_bytealign_be (w4[3], w5[0], offset);
      w7[1] = hc_bytealign_be (w4[2], w4[3], offset);
      w7[0] = hc_bytealign_be (w4[1], w4[2], offset);
      w6[3] = hc_bytealign_be (w4[0], w4[1], offset);
      w6[2] = hc_bytealign_be (w3[3], w4[0], offset);
      w6[1] = hc_bytealign_be (w3[2], w3[3], offset);
      w6[0] = hc_bytealign_be (w3[1], w3[2], offset);
      w5[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w5[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w5[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w5[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w4[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w4[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w4[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w4[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w3[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w2[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[2] = hc_bytealign_be (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_bytealign_be (w7[3],     0, offset);
      c2[2] = hc_bytealign_be (w7[2], w7[3], offset);
      c2[1] = hc_bytealign_be (w7[1], w7[2], offset);
      c2[0] = hc_bytealign_be (w7[0], w7[1], offset);
      c1[3] = hc_bytealign_be (w6[3], w7[0], offset);
      c1[2] = hc_bytealign_be (w6[2], w6[3], offset);
      c1[1] = hc_bytealign_be (w6[1], w6[2], offset);
      c1[0] = hc_bytealign_be (w6[0], w6[1], offset);
      c0[3] = hc_bytealign_be (w5[3], w6[0], offset);
      c0[2] = hc_bytealign_be (w5[2], w5[3], offset);
      c0[1] = hc_bytealign_be (w5[1], w5[2], offset);
      c0[0] = hc_bytealign_be (w5[0], w5[1], offset);
      w7[3] = hc_bytealign_be (w4[3], w5[0], offset);
      w7[2] = hc_bytealign_be (w4[2], w4[3], offset);
      w7[1] = hc_bytealign_be (w4[1], w4[2], offset);
      w7[0] = hc_bytealign_be (w4[0], w4[1], offset);
      w6[3] = hc_bytealign_be (w3[3], w4[0], offset);
      w6[2] = hc_bytealign_be (w3[2], w3[3], offset);
      w6[1] = hc_bytealign_be (w3[1], w3[2], offset);
      w6[0] = hc_bytealign_be (w3[0], w3[1], offset);
      w5[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w5[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w5[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w5[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w4[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w4[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w4[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w4[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w3[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w2[3] = hc_bytealign_be (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_bytealign_be (w7[3],     0, offset);
      c2[3] = hc_bytealign_be (w7[2], w7[3], offset);
      c2[2] = hc_bytealign_be (w7[1], w7[2], offset);
      c2[1] = hc_bytealign_be (w7[0], w7[1], offset);
      c2[0] = hc_bytealign_be (w6[3], w7[0], offset);
      c1[3] = hc_bytealign_be (w6[2], w6[3], offset);
      c1[2] = hc_bytealign_be (w6[1], w6[2], offset);
      c1[1] = hc_bytealign_be (w6[0], w6[1], offset);
      c1[0] = hc_bytealign_be (w5[3], w6[0], offset);
      c0[3] = hc_bytealign_be (w5[2], w5[3], offset);
      c0[2] = hc_bytealign_be (w5[1], w5[2], offset);
      c0[1] = hc_bytealign_be (w5[0], w5[1], offset);
      c0[0] = hc_bytealign_be (w4[3], w5[0], offset);
      w7[3] = hc_bytealign_be (w4[2], w4[3], offset);
      w7[2] = hc_bytealign_be (w4[1], w4[2], offset);
      w7[1] = hc_bytealign_be (w4[0], w4[1], offset);
      w7[0] = hc_bytealign_be (w3[3], w4[0], offset);
      w6[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w6[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w6[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w6[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w5[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w5[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w5[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w5[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w4[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w4[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w4[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w4[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w3[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[0] = hc_bytealign_be (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_bytealign_be (w7[3],     0, offset);
      c3[0] = hc_bytealign_be (w7[2], w7[3], offset);
      c2[3] = hc_bytealign_be (w7[1], w7[2], offset);
      c2[2] = hc_bytealign_be (w7[0], w7[1], offset);
      c2[1] = hc_bytealign_be (w6[3], w7[0], offset);
      c2[0] = hc_bytealign_be (w6[2], w6[3], offset);
      c1[3] = hc_bytealign_be (w6[1], w6[2], offset);
      c1[2] = hc_bytealign_be (w6[0], w6[1], offset);
      c1[1] = hc_bytealign_be (w5[3], w6[0], offset);
      c1[0] = hc_bytealign_be (w5[2], w5[3], offset);
      c0[3] = hc_bytealign_be (w5[1], w5[2], offset);
      c0[2] = hc_bytealign_be (w5[0], w5[1], offset);
      c0[1] = hc_bytealign_be (w4[3], w5[0], offset);
      c0[0] = hc_bytealign_be (w4[2], w4[3], offset);
      w7[3] = hc_bytealign_be (w4[1], w4[2], offset);
      w7[2] = hc_bytealign_be (w4[0], w4[1], offset);
      w7[1] = hc_bytealign_be (w3[3], w4[0], offset);
      w7[0] = hc_bytealign_be (w3[2], w3[3], offset);
      w6[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w6[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w6[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w6[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w5[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w5[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w5[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w5[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w4[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w4[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w4[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w4[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w3[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[1] = hc_bytealign_be (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_bytealign_be (w7[3],     0, offset);
      c3[1] = hc_bytealign_be (w7[2], w7[3], offset);
      c3[0] = hc_bytealign_be (w7[1], w7[2], offset);
      c2[3] = hc_bytealign_be (w7[0], w7[1], offset);
      c2[2] = hc_bytealign_be (w6[3], w7[0], offset);
      c2[1] = hc_bytealign_be (w6[2], w6[3], offset);
      c2[0] = hc_bytealign_be (w6[1], w6[2], offset);
      c1[3] = hc_bytealign_be (w6[0], w6[1], offset);
      c1[2] = hc_bytealign_be (w5[3], w6[0], offset);
      c1[1] = hc_bytealign_be (w5[2], w5[3], offset);
      c1[0] = hc_bytealign_be (w5[1], w5[2], offset);
      c0[3] = hc_bytealign_be (w5[0], w5[1], offset);
      c0[2] = hc_bytealign_be (w4[3], w5[0], offset);
      c0[1] = hc_bytealign_be (w4[2], w4[3], offset);
      c0[0] = hc_bytealign_be (w4[1], w4[2], offset);
      w7[3] = hc_bytealign_be (w4[0], w4[1], offset);
      w7[2] = hc_bytealign_be (w3[3], w4[0], offset);
      w7[1] = hc_bytealign_be (w3[2], w3[3], offset);
      w7[0] = hc_bytealign_be (w3[1], w3[2], offset);
      w6[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w6[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w6[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w6[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w5[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w5[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w5[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w5[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w4[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w4[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w4[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w4[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w3[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[2] = hc_bytealign_be (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_bytealign_be (w7[3],     0, offset);
      c3[2] = hc_bytealign_be (w7[2], w7[3], offset);
      c3[1] = hc_bytealign_be (w7[1], w7[2], offset);
      c3[0] = hc_bytealign_be (w7[0], w7[1], offset);
      c2[3] = hc_bytealign_be (w6[3], w7[0], offset);
      c2[2] = hc_bytealign_be (w6[2], w6[3], offset);
      c2[1] = hc_bytealign_be (w6[1], w6[2], offset);
      c2[0] = hc_bytealign_be (w6[0], w6[1], offset);
      c1[3] = hc_bytealign_be (w5[3], w6[0], offset);
      c1[2] = hc_bytealign_be (w5[2], w5[3], offset);
      c1[1] = hc_bytealign_be (w5[1], w5[2], offset);
      c1[0] = hc_bytealign_be (w5[0], w5[1], offset);
      c0[3] = hc_bytealign_be (w4[3], w5[0], offset);
      c0[2] = hc_bytealign_be (w4[2], w4[3], offset);
      c0[1] = hc_bytealign_be (w4[1], w4[2], offset);
      c0[0] = hc_bytealign_be (w4[0], w4[1], offset);
      w7[3] = hc_bytealign_be (w3[3], w4[0], offset);
      w7[2] = hc_bytealign_be (w3[2], w3[3], offset);
      w7[1] = hc_bytealign_be (w3[1], w3[2], offset);
      w7[0] = hc_bytealign_be (w3[0], w3[1], offset);
      w6[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w6[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w6[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w6[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w5[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w5[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w5[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w5[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w4[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w4[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w4[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w4[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w3[3] = hc_bytealign_be (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      c4[0] = hc_bytealign_be (w7[3],     0, offset);
      c3[3] = hc_bytealign_be (w7[2], w7[3], offset);
      c3[2] = hc_bytealign_be (w7[1], w7[2], offset);
      c3[1] = hc_bytealign_be (w7[0], w7[1], offset);
      c3[0] = hc_bytealign_be (w6[3], w7[0], offset);
      c2[3] = hc_bytealign_be (w6[2], w6[3], offset);
      c2[2] = hc_bytealign_be (w6[1], w6[2], offset);
      c2[1] = hc_bytealign_be (w6[0], w6[1], offset);
      c2[0] = hc_bytealign_be (w5[3], w6[0], offset);
      c1[3] = hc_bytealign_be (w5[2], w5[3], offset);
      c1[2] = hc_bytealign_be (w5[1], w5[2], offset);
      c1[1] = hc_bytealign_be (w5[0], w5[1], offset);
      c1[0] = hc_bytealign_be (w4[3], w5[0], offset);
      c0[3] = hc_bytealign_be (w4[2], w4[3], offset);
      c0[2] = hc_bytealign_be (w4[1], w4[2], offset);
      c0[1] = hc_bytealign_be (w4[0], w4[1], offset);
      c0[0] = hc_bytealign_be (w3[3], w4[0], offset);
      w7[3] = hc_bytealign_be (w3[2], w3[3], offset);
      w7[2] = hc_bytealign_be (w3[1], w3[2], offset);
      w7[1] = hc_bytealign_be (w3[0], w3[1], offset);
      w7[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w6[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w6[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w6[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w6[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w5[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w5[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w5[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w5[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w4[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w4[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w4[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w4[0] = hc_bytealign_be (    0, w0[0], offset);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      c4[1] = hc_bytealign_be (w7[3],     0, offset);
      c4[0] = hc_bytealign_be (w7[2], w7[3], offset);
      c3[3] = hc_bytealign_be (w7[1], w7[2], offset);
      c3[2] = hc_bytealign_be (w7[0], w7[1], offset);
      c3[1] = hc_bytealign_be (w6[3], w7[0], offset);
      c3[0] = hc_bytealign_be (w6[2], w6[3], offset);
      c2[3] = hc_bytealign_be (w6[1], w6[2], offset);
      c2[2] = hc_bytealign_be (w6[0], w6[1], offset);
      c2[1] = hc_bytealign_be (w5[3], w6[0], offset);
      c2[0] = hc_bytealign_be (w5[2], w5[3], offset);
      c1[3] = hc_bytealign_be (w5[1], w5[2], offset);
      c1[2] = hc_bytealign_be (w5[0], w5[1], offset);
      c1[1] = hc_bytealign_be (w4[3], w5[0], offset);
      c1[0] = hc_bytealign_be (w4[2], w4[3], offset);
      c0[3] = hc_bytealign_be (w4[1], w4[2], offset);
      c0[2] = hc_bytealign_be (w4[0], w4[1], offset);
      c0[1] = hc_bytealign_be (w3[3], w4[0], offset);
      c0[0] = hc_bytealign_be (w3[2], w3[3], offset);
      w7[3] = hc_bytealign_be (w3[1], w3[2], offset);
      w7[2] = hc_bytealign_be (w3[0], w3[1], offset);
      w7[1] = hc_bytealign_be (w2[3], w3[0], offset);
      w7[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w6[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w6[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w6[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w6[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w5[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w5[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w5[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w5[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w4[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w4[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w4[1] = hc_bytealign_be (    0, w0[0], offset);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      c4[2] = hc_bytealign_be (w7[3],     0, offset);
      c4[1] = hc_bytealign_be (w7[2], w7[3], offset);
      c4[0] = hc_bytealign_be (w7[1], w7[2], offset);
      c3[3] = hc_bytealign_be (w7[0], w7[1], offset);
      c3[2] = hc_bytealign_be (w6[3], w7[0], offset);
      c3[1] = hc_bytealign_be (w6[2], w6[3], offset);
      c3[0] = hc_bytealign_be (w6[1], w6[2], offset);
      c2[3] = hc_bytealign_be (w6[0], w6[1], offset);
      c2[2] = hc_bytealign_be (w5[3], w6[0], offset);
      c2[1] = hc_bytealign_be (w5[2], w5[3], offset);
      c2[0] = hc_bytealign_be (w5[1], w5[2], offset);
      c1[3] = hc_bytealign_be (w5[0], w5[1], offset);
      c1[2] = hc_bytealign_be (w4[3], w5[0], offset);
      c1[1] = hc_bytealign_be (w4[2], w4[3], offset);
      c1[0] = hc_bytealign_be (w4[1], w4[2], offset);
      c0[3] = hc_bytealign_be (w4[0], w4[1], offset);
      c0[2] = hc_bytealign_be (w3[3], w4[0], offset);
      c0[1] = hc_bytealign_be (w3[2], w3[3], offset);
      c0[0] = hc_bytealign_be (w3[1], w3[2], offset);
      w7[3] = hc_bytealign_be (w3[0], w3[1], offset);
      w7[2] = hc_bytealign_be (w2[3], w3[0], offset);
      w7[1] = hc_bytealign_be (w2[2], w2[3], offset);
      w7[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w6[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w6[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w6[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w6[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w5[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w5[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w5[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w5[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w4[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w4[2] = hc_bytealign_be (    0, w0[0], offset);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      c4[3] = hc_bytealign_be (w7[3],     0, offset);
      c4[2] = hc_bytealign_be (w7[2], w7[3], offset);
      c4[1] = hc_bytealign_be (w7[1], w7[2], offset);
      c4[0] = hc_bytealign_be (w7[0], w7[1], offset);
      c3[3] = hc_bytealign_be (w6[3], w7[0], offset);
      c3[2] = hc_bytealign_be (w6[2], w6[3], offset);
      c3[1] = hc_bytealign_be (w6[1], w6[2], offset);
      c3[0] = hc_bytealign_be (w6[0], w6[1], offset);
      c2[3] = hc_bytealign_be (w5[3], w6[0], offset);
      c2[2] = hc_bytealign_be (w5[2], w5[3], offset);
      c2[1] = hc_bytealign_be (w5[1], w5[2], offset);
      c2[0] = hc_bytealign_be (w5[0], w5[1], offset);
      c1[3] = hc_bytealign_be (w4[3], w5[0], offset);
      c1[2] = hc_bytealign_be (w4[2], w4[3], offset);
      c1[1] = hc_bytealign_be (w4[1], w4[2], offset);
      c1[0] = hc_bytealign_be (w4[0], w4[1], offset);
      c0[3] = hc_bytealign_be (w3[3], w4[0], offset);
      c0[2] = hc_bytealign_be (w3[2], w3[3], offset);
      c0[1] = hc_bytealign_be (w3[1], w3[2], offset);
      c0[0] = hc_bytealign_be (w3[0], w3[1], offset);
      w7[3] = hc_bytealign_be (w2[3], w3[0], offset);
      w7[2] = hc_bytealign_be (w2[2], w2[3], offset);
      w7[1] = hc_bytealign_be (w2[1], w2[2], offset);
      w7[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w6[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w6[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w6[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w6[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w5[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w5[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w5[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w5[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w4[3] = hc_bytealign_be (    0, w0[0], offset);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      c5[0] = hc_bytealign_be (w7[3],     0, offset);
      c4[3] = hc_bytealign_be (w7[2], w7[3], offset);
      c4[2] = hc_bytealign_be (w7[1], w7[2], offset);
      c4[1] = hc_bytealign_be (w7[0], w7[1], offset);
      c4[0] = hc_bytealign_be (w6[3], w7[0], offset);
      c3[3] = hc_bytealign_be (w6[2], w6[3], offset);
      c3[2] = hc_bytealign_be (w6[1], w6[2], offset);
      c3[1] = hc_bytealign_be (w6[0], w6[1], offset);
      c3[0] = hc_bytealign_be (w5[3], w6[0], offset);
      c2[3] = hc_bytealign_be (w5[2], w5[3], offset);
      c2[2] = hc_bytealign_be (w5[1], w5[2], offset);
      c2[1] = hc_bytealign_be (w5[0], w5[1], offset);
      c2[0] = hc_bytealign_be (w4[3], w5[0], offset);
      c1[3] = hc_bytealign_be (w4[2], w4[3], offset);
      c1[2] = hc_bytealign_be (w4[1], w4[2], offset);
      c1[1] = hc_bytealign_be (w4[0], w4[1], offset);
      c1[0] = hc_bytealign_be (w3[3], w4[0], offset);
      c0[3] = hc_bytealign_be (w3[2], w3[3], offset);
      c0[2] = hc_bytealign_be (w3[1], w3[2], offset);
      c0[1] = hc_bytealign_be (w3[0], w3[1], offset);
      c0[0] = hc_bytealign_be (w2[3], w3[0], offset);
      w7[3] = hc_bytealign_be (w2[2], w2[3], offset);
      w7[2] = hc_bytealign_be (w2[1], w2[2], offset);
      w7[1] = hc_bytealign_be (w2[0], w2[1], offset);
      w7[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w6[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w6[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w6[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w6[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w5[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w5[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w5[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w5[0] = hc_bytealign_be (    0, w0[0], offset);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      c5[1] = hc_bytealign_be (w7[3],     0, offset);
      c5[0] = hc_bytealign_be (w7[2], w7[3], offset);
      c4[3] = hc_bytealign_be (w7[1], w7[2], offset);
      c4[2] = hc_bytealign_be (w7[0], w7[1], offset);
      c4[1] = hc_bytealign_be (w6[3], w7[0], offset);
      c4[0] = hc_bytealign_be (w6[2], w6[3], offset);
      c3[3] = hc_bytealign_be (w6[1], w6[2], offset);
      c3[2] = hc_bytealign_be (w6[0], w6[1], offset);
      c3[1] = hc_bytealign_be (w5[3], w6[0], offset);
      c3[0] = hc_bytealign_be (w5[2], w5[3], offset);
      c2[3] = hc_bytealign_be (w5[1], w5[2], offset);
      c2[2] = hc_bytealign_be (w5[0], w5[1], offset);
      c2[1] = hc_bytealign_be (w4[3], w5[0], offset);
      c2[0] = hc_bytealign_be (w4[2], w4[3], offset);
      c1[3] = hc_bytealign_be (w4[1], w4[2], offset);
      c1[2] = hc_bytealign_be (w4[0], w4[1], offset);
      c1[1] = hc_bytealign_be (w3[3], w4[0], offset);
      c1[0] = hc_bytealign_be (w3[2], w3[3], offset);
      c0[3] = hc_bytealign_be (w3[1], w3[2], offset);
      c0[2] = hc_bytealign_be (w3[0], w3[1], offset);
      c0[1] = hc_bytealign_be (w2[3], w3[0], offset);
      c0[0] = hc_bytealign_be (w2[2], w2[3], offset);
      w7[3] = hc_bytealign_be (w2[1], w2[2], offset);
      w7[2] = hc_bytealign_be (w2[0], w2[1], offset);
      w7[1] = hc_bytealign_be (w1[3], w2[0], offset);
      w7[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w6[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w6[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w6[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w6[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w5[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w5[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w5[1] = hc_bytealign_be (    0, w0[0], offset);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      c5[2] = hc_bytealign_be (w7[3],     0, offset);
      c5[1] = hc_bytealign_be (w7[2], w7[3], offset);
      c5[0] = hc_bytealign_be (w7[1], w7[2], offset);
      c4[3] = hc_bytealign_be (w7[0], w7[1], offset);
      c4[2] = hc_bytealign_be (w6[3], w7[0], offset);
      c4[1] = hc_bytealign_be (w6[2], w6[3], offset);
      c4[0] = hc_bytealign_be (w6[1], w6[2], offset);
      c3[3] = hc_bytealign_be (w6[0], w6[1], offset);
      c3[2] = hc_bytealign_be (w5[3], w6[0], offset);
      c3[1] = hc_bytealign_be (w5[2], w5[3], offset);
      c3[0] = hc_bytealign_be (w5[1], w5[2], offset);
      c2[3] = hc_bytealign_be (w5[0], w5[1], offset);
      c2[2] = hc_bytealign_be (w4[3], w5[0], offset);
      c2[1] = hc_bytealign_be (w4[2], w4[3], offset);
      c2[0] = hc_bytealign_be (w4[1], w4[2], offset);
      c1[3] = hc_bytealign_be (w4[0], w4[1], offset);
      c1[2] = hc_bytealign_be (w3[3], w4[0], offset);
      c1[1] = hc_bytealign_be (w3[2], w3[3], offset);
      c1[0] = hc_bytealign_be (w3[1], w3[2], offset);
      c0[3] = hc_bytealign_be (w3[0], w3[1], offset);
      c0[2] = hc_bytealign_be (w2[3], w3[0], offset);
      c0[1] = hc_bytealign_be (w2[2], w2[3], offset);
      c0[0] = hc_bytealign_be (w2[1], w2[2], offset);
      w7[3] = hc_bytealign_be (w2[0], w2[1], offset);
      w7[2] = hc_bytealign_be (w1[3], w2[0], offset);
      w7[1] = hc_bytealign_be (w1[2], w1[3], offset);
      w7[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w6[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w6[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w6[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w6[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w5[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w5[2] = hc_bytealign_be (    0, w0[0], offset);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      c5[3] = hc_bytealign_be (w7[3],     0, offset);
      c5[2] = hc_bytealign_be (w7[2], w7[3], offset);
      c5[1] = hc_bytealign_be (w7[1], w7[2], offset);
      c5[0] = hc_bytealign_be (w7[0], w7[1], offset);
      c4[3] = hc_bytealign_be (w6[3], w7[0], offset);
      c4[2] = hc_bytealign_be (w6[2], w6[3], offset);
      c4[1] = hc_bytealign_be (w6[1], w6[2], offset);
      c4[0] = hc_bytealign_be (w6[0], w6[1], offset);
      c3[3] = hc_bytealign_be (w5[3], w6[0], offset);
      c3[2] = hc_bytealign_be (w5[2], w5[3], offset);
      c3[1] = hc_bytealign_be (w5[1], w5[2], offset);
      c3[0] = hc_bytealign_be (w5[0], w5[1], offset);
      c2[3] = hc_bytealign_be (w4[3], w5[0], offset);
      c2[2] = hc_bytealign_be (w4[2], w4[3], offset);
      c2[1] = hc_bytealign_be (w4[1], w4[2], offset);
      c2[0] = hc_bytealign_be (w4[0], w4[1], offset);
      c1[3] = hc_bytealign_be (w3[3], w4[0], offset);
      c1[2] = hc_bytealign_be (w3[2], w3[3], offset);
      c1[1] = hc_bytealign_be (w3[1], w3[2], offset);
      c1[0] = hc_bytealign_be (w3[0], w3[1], offset);
      c0[3] = hc_bytealign_be (w2[3], w3[0], offset);
      c0[2] = hc_bytealign_be (w2[2], w2[3], offset);
      c0[1] = hc_bytealign_be (w2[1], w2[2], offset);
      c0[0] = hc_bytealign_be (w2[0], w2[1], offset);
      w7[3] = hc_bytealign_be (w1[3], w2[0], offset);
      w7[2] = hc_bytealign_be (w1[2], w1[3], offset);
      w7[1] = hc_bytealign_be (w1[1], w1[2], offset);
      w7[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w6[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w6[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w6[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w6[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w5[3] = hc_bytealign_be (    0, w0[0], offset);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      c6[0] = hc_bytealign_be (w7[3],     0, offset);
      c5[3] = hc_bytealign_be (w7[2], w7[3], offset);
      c5[2] = hc_bytealign_be (w7[1], w7[2], offset);
      c5[1] = hc_bytealign_be (w7[0], w7[1], offset);
      c5[0] = hc_bytealign_be (w6[3], w7[0], offset);
      c4[3] = hc_bytealign_be (w6[2], w6[3], offset);
      c4[2] = hc_bytealign_be (w6[1], w6[2], offset);
      c4[1] = hc_bytealign_be (w6[0], w6[1], offset);
      c4[0] = hc_bytealign_be (w5[3], w6[0], offset);
      c3[3] = hc_bytealign_be (w5[2], w5[3], offset);
      c3[2] = hc_bytealign_be (w5[1], w5[2], offset);
      c3[1] = hc_bytealign_be (w5[0], w5[1], offset);
      c3[0] = hc_bytealign_be (w4[3], w5[0], offset);
      c2[3] = hc_bytealign_be (w4[2], w4[3], offset);
      c2[2] = hc_bytealign_be (w4[1], w4[2], offset);
      c2[1] = hc_bytealign_be (w4[0], w4[1], offset);
      c2[0] = hc_bytealign_be (w3[3], w4[0], offset);
      c1[3] = hc_bytealign_be (w3[2], w3[3], offset);
      c1[2] = hc_bytealign_be (w3[1], w3[2], offset);
      c1[1] = hc_bytealign_be (w3[0], w3[1], offset);
      c1[0] = hc_bytealign_be (w2[3], w3[0], offset);
      c0[3] = hc_bytealign_be (w2[2], w2[3], offset);
      c0[2] = hc_bytealign_be (w2[1], w2[2], offset);
      c0[1] = hc_bytealign_be (w2[0], w2[1], offset);
      c0[0] = hc_bytealign_be (w1[3], w2[0], offset);
      w7[3] = hc_bytealign_be (w1[2], w1[3], offset);
      w7[2] = hc_bytealign_be (w1[1], w1[2], offset);
      w7[1] = hc_bytealign_be (w1[0], w1[1], offset);
      w7[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w6[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w6[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w6[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w6[0] = hc_bytealign_be (    0, w0[0], offset);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      c6[1] = hc_bytealign_be (w7[3],     0, offset);
      c6[0] = hc_bytealign_be (w7[2], w7[3], offset);
      c5[3] = hc_bytealign_be (w7[1], w7[2], offset);
      c5[2] = hc_bytealign_be (w7[0], w7[1], offset);
      c5[1] = hc_bytealign_be (w6[3], w7[0], offset);
      c5[0] = hc_bytealign_be (w6[2], w6[3], offset);
      c4[3] = hc_bytealign_be (w6[1], w6[2], offset);
      c4[2] = hc_bytealign_be (w6[0], w6[1], offset);
      c4[1] = hc_bytealign_be (w5[3], w6[0], offset);
      c4[0] = hc_bytealign_be (w5[2], w5[3], offset);
      c3[3] = hc_bytealign_be (w5[1], w5[2], offset);
      c3[2] = hc_bytealign_be (w5[0], w5[1], offset);
      c3[1] = hc_bytealign_be (w4[3], w5[0], offset);
      c3[0] = hc_bytealign_be (w4[2], w4[3], offset);
      c2[3] = hc_bytealign_be (w4[1], w4[2], offset);
      c2[2] = hc_bytealign_be (w4[0], w4[1], offset);
      c2[1] = hc_bytealign_be (w3[3], w4[0], offset);
      c2[0] = hc_bytealign_be (w3[2], w3[3], offset);
      c1[3] = hc_bytealign_be (w3[1], w3[2], offset);
      c1[2] = hc_bytealign_be (w3[0], w3[1], offset);
      c1[1] = hc_bytealign_be (w2[3], w3[0], offset);
      c1[0] = hc_bytealign_be (w2[2], w2[3], offset);
      c0[3] = hc_bytealign_be (w2[1], w2[2], offset);
      c0[2] = hc_bytealign_be (w2[0], w2[1], offset);
      c0[1] = hc_bytealign_be (w1[3], w2[0], offset);
      c0[0] = hc_bytealign_be (w1[2], w1[3], offset);
      w7[3] = hc_bytealign_be (w1[1], w1[2], offset);
      w7[2] = hc_bytealign_be (w1[0], w1[1], offset);
      w7[1] = hc_bytealign_be (w0[3], w1[0], offset);
      w7[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w6[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w6[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w6[1] = hc_bytealign_be (    0, w0[0], offset);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      c6[2] = hc_bytealign_be (w7[3],     0, offset);
      c6[1] = hc_bytealign_be (w7[2], w7[3], offset);
      c6[0] = hc_bytealign_be (w7[1], w7[2], offset);
      c5[3] = hc_bytealign_be (w7[0], w7[1], offset);
      c5[2] = hc_bytealign_be (w6[3], w7[0], offset);
      c5[1] = hc_bytealign_be (w6[2], w6[3], offset);
      c5[0] = hc_bytealign_be (w6[1], w6[2], offset);
      c4[3] = hc_bytealign_be (w6[0], w6[1], offset);
      c4[2] = hc_bytealign_be (w5[3], w6[0], offset);
      c4[1] = hc_bytealign_be (w5[2], w5[3], offset);
      c4[0] = hc_bytealign_be (w5[1], w5[2], offset);
      c3[3] = hc_bytealign_be (w5[0], w5[1], offset);
      c3[2] = hc_bytealign_be (w4[3], w5[0], offset);
      c3[1] = hc_bytealign_be (w4[2], w4[3], offset);
      c3[0] = hc_bytealign_be (w4[1], w4[2], offset);
      c2[3] = hc_bytealign_be (w4[0], w4[1], offset);
      c2[2] = hc_bytealign_be (w3[3], w4[0], offset);
      c2[1] = hc_bytealign_be (w3[2], w3[3], offset);
      c2[0] = hc_bytealign_be (w3[1], w3[2], offset);
      c1[3] = hc_bytealign_be (w3[0], w3[1], offset);
      c1[2] = hc_bytealign_be (w2[3], w3[0], offset);
      c1[1] = hc_bytealign_be (w2[2], w2[3], offset);
      c1[0] = hc_bytealign_be (w2[1], w2[2], offset);
      c0[3] = hc_bytealign_be (w2[0], w2[1], offset);
      c0[2] = hc_bytealign_be (w1[3], w2[0], offset);
      c0[1] = hc_bytealign_be (w1[2], w1[3], offset);
      c0[0] = hc_bytealign_be (w1[1], w1[2], offset);
      w7[3] = hc_bytealign_be (w1[0], w1[1], offset);
      w7[2] = hc_bytealign_be (w0[3], w1[0], offset);
      w7[1] = hc_bytealign_be (w0[2], w0[3], offset);
      w7[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w6[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w6[2] = hc_bytealign_be (    0, w0[0], offset);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      c6[3] = hc_bytealign_be (w7[3],     0, offset);
      c6[2] = hc_bytealign_be (w7[2], w7[3], offset);
      c6[1] = hc_bytealign_be (w7[1], w7[2], offset);
      c6[0] = hc_bytealign_be (w7[0], w7[1], offset);
      c5[3] = hc_bytealign_be (w6[3], w7[0], offset);
      c5[2] = hc_bytealign_be (w6[2], w6[3], offset);
      c5[1] = hc_bytealign_be (w6[1], w6[2], offset);
      c5[0] = hc_bytealign_be (w6[0], w6[1], offset);
      c4[3] = hc_bytealign_be (w5[3], w6[0], offset);
      c4[2] = hc_bytealign_be (w5[2], w5[3], offset);
      c4[1] = hc_bytealign_be (w5[1], w5[2], offset);
      c4[0] = hc_bytealign_be (w5[0], w5[1], offset);
      c3[3] = hc_bytealign_be (w4[3], w5[0], offset);
      c3[2] = hc_bytealign_be (w4[2], w4[3], offset);
      c3[1] = hc_bytealign_be (w4[1], w4[2], offset);
      c3[0] = hc_bytealign_be (w4[0], w4[1], offset);
      c2[3] = hc_bytealign_be (w3[3], w4[0], offset);
      c2[2] = hc_bytealign_be (w3[2], w3[3], offset);
      c2[1] = hc_bytealign_be (w3[1], w3[2], offset);
      c2[0] = hc_bytealign_be (w3[0], w3[1], offset);
      c1[3] = hc_bytealign_be (w2[3], w3[0], offset);
      c1[2] = hc_bytealign_be (w2[2], w2[3], offset);
      c1[1] = hc_bytealign_be (w2[1], w2[2], offset);
      c1[0] = hc_bytealign_be (w2[0], w2[1], offset);
      c0[3] = hc_bytealign_be (w1[3], w2[0], offset);
      c0[2] = hc_bytealign_be (w1[2], w1[3], offset);
      c0[1] = hc_bytealign_be (w1[1], w1[2], offset);
      c0[0] = hc_bytealign_be (w1[0], w1[1], offset);
      w7[3] = hc_bytealign_be (w0[3], w1[0], offset);
      w7[2] = hc_bytealign_be (w0[2], w0[3], offset);
      w7[1] = hc_bytealign_be (w0[1], w0[2], offset);
      w7[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w6[3] = hc_bytealign_be (    0, w0[0], offset);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      c7[0] = hc_bytealign_be (w7[3],     0, offset);
      c6[3] = hc_bytealign_be (w7[2], w7[3], offset);
      c6[2] = hc_bytealign_be (w7[1], w7[2], offset);
      c6[1] = hc_bytealign_be (w7[0], w7[1], offset);
      c6[0] = hc_bytealign_be (w6[3], w7[0], offset);
      c5[3] = hc_bytealign_be (w6[2], w6[3], offset);
      c5[2] = hc_bytealign_be (w6[1], w6[2], offset);
      c5[1] = hc_bytealign_be (w6[0], w6[1], offset);
      c5[0] = hc_bytealign_be (w5[3], w6[0], offset);
      c4[3] = hc_bytealign_be (w5[2], w5[3], offset);
      c4[2] = hc_bytealign_be (w5[1], w5[2], offset);
      c4[1] = hc_bytealign_be (w5[0], w5[1], offset);
      c4[0] = hc_bytealign_be (w4[3], w5[0], offset);
      c3[3] = hc_bytealign_be (w4[2], w4[3], offset);
      c3[2] = hc_bytealign_be (w4[1], w4[2], offset);
      c3[1] = hc_bytealign_be (w4[0], w4[1], offset);
      c3[0] = hc_bytealign_be (w3[3], w4[0], offset);
      c2[3] = hc_bytealign_be (w3[2], w3[3], offset);
      c2[2] = hc_bytealign_be (w3[1], w3[2], offset);
      c2[1] = hc_bytealign_be (w3[0], w3[1], offset);
      c2[0] = hc_bytealign_be (w2[3], w3[0], offset);
      c1[3] = hc_bytealign_be (w2[2], w2[3], offset);
      c1[2] = hc_bytealign_be (w2[1], w2[2], offset);
      c1[1] = hc_bytealign_be (w2[0], w2[1], offset);
      c1[0] = hc_bytealign_be (w1[3], w2[0], offset);
      c0[3] = hc_bytealign_be (w1[2], w1[3], offset);
      c0[2] = hc_bytealign_be (w1[1], w1[2], offset);
      c0[1] = hc_bytealign_be (w1[0], w1[1], offset);
      c0[0] = hc_bytealign_be (w0[3], w1[0], offset);
      w7[3] = hc_bytealign_be (w0[2], w0[3], offset);
      w7[2] = hc_bytealign_be (w0[1], w0[2], offset);
      w7[1] = hc_bytealign_be (w0[0], w0[1], offset);
      w7[0] = hc_bytealign_be (    0, w0[0], offset);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      c7[1] = hc_bytealign_be (w7[3],     0, offset);
      c7[0] = hc_bytealign_be (w7[2], w7[3], offset);
      c6[3] = hc_bytealign_be (w7[1], w7[2], offset);
      c6[2] = hc_bytealign_be (w7[0], w7[1], offset);
      c6[1] = hc_bytealign_be (w6[3], w7[0], offset);
      c6[0] = hc_bytealign_be (w6[2], w6[3], offset);
      c5[3] = hc_bytealign_be (w6[1], w6[2], offset);
      c5[2] = hc_bytealign_be (w6[0], w6[1], offset);
      c5[1] = hc_bytealign_be (w5[3], w6[0], offset);
      c5[0] = hc_bytealign_be (w5[2], w5[3], offset);
      c4[3] = hc_bytealign_be (w5[1], w5[2], offset);
      c4[2] = hc_bytealign_be (w5[0], w5[1], offset);
      c4[1] = hc_bytealign_be (w4[3], w5[0], offset);
      c4[0] = hc_bytealign_be (w4[2], w4[3], offset);
      c3[3] = hc_bytealign_be (w4[1], w4[2], offset);
      c3[2] = hc_bytealign_be (w4[0], w4[1], offset);
      c3[1] = hc_bytealign_be (w3[3], w4[0], offset);
      c3[0] = hc_bytealign_be (w3[2], w3[3], offset);
      c2[3] = hc_bytealign_be (w3[1], w3[2], offset);
      c2[2] = hc_bytealign_be (w3[0], w3[1], offset);
      c2[1] = hc_bytealign_be (w2[3], w3[0], offset);
      c2[0] = hc_bytealign_be (w2[2], w2[3], offset);
      c1[3] = hc_bytealign_be (w2[1], w2[2], offset);
      c1[2] = hc_bytealign_be (w2[0], w2[1], offset);
      c1[1] = hc_bytealign_be (w1[3], w2[0], offset);
      c1[0] = hc_bytealign_be (w1[2], w1[3], offset);
      c0[3] = hc_bytealign_be (w1[1], w1[2], offset);
      c0[2] = hc_bytealign_be (w1[0], w1[1], offset);
      c0[1] = hc_bytealign_be (w0[3], w1[0], offset);
      c0[0] = hc_bytealign_be (w0[2], w0[3], offset);
      w7[3] = hc_bytealign_be (w0[1], w0[2], offset);
      w7[2] = hc_bytealign_be (w0[0], w0[1], offset);
      w7[1] = hc_bytealign_be (    0, w0[0], offset);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      c7[2] = hc_bytealign_be (w7[3],     0, offset);
      c7[1] = hc_bytealign_be (w7[2], w7[3], offset);
      c7[0] = hc_bytealign_be (w7[1], w7[2], offset);
      c6[3] = hc_bytealign_be (w7[0], w7[1], offset);
      c6[2] = hc_bytealign_be (w6[3], w7[0], offset);
      c6[1] = hc_bytealign_be (w6[2], w6[3], offset);
      c6[0] = hc_bytealign_be (w6[1], w6[2], offset);
      c5[3] = hc_bytealign_be (w6[0], w6[1], offset);
      c5[2] = hc_bytealign_be (w5[3], w6[0], offset);
      c5[1] = hc_bytealign_be (w5[2], w5[3], offset);
      c5[0] = hc_bytealign_be (w5[1], w5[2], offset);
      c4[3] = hc_bytealign_be (w5[0], w5[1], offset);
      c4[2] = hc_bytealign_be (w4[3], w5[0], offset);
      c4[1] = hc_bytealign_be (w4[2], w4[3], offset);
      c4[0] = hc_bytealign_be (w4[1], w4[2], offset);
      c3[3] = hc_bytealign_be (w4[0], w4[1], offset);
      c3[2] = hc_bytealign_be (w3[3], w4[0], offset);
      c3[1] = hc_bytealign_be (w3[2], w3[3], offset);
      c3[0] = hc_bytealign_be (w3[1], w3[2], offset);
      c2[3] = hc_bytealign_be (w3[0], w3[1], offset);
      c2[2] = hc_bytealign_be (w2[3], w3[0], offset);
      c2[1] = hc_bytealign_be (w2[2], w2[3], offset);
      c2[0] = hc_bytealign_be (w2[1], w2[2], offset);
      c1[3] = hc_bytealign_be (w2[0], w2[1], offset);
      c1[2] = hc_bytealign_be (w1[3], w2[0], offset);
      c1[1] = hc_bytealign_be (w1[2], w1[3], offset);
      c1[0] = hc_bytealign_be (w1[1], w1[2], offset);
      c0[3] = hc_bytealign_be (w1[0], w1[1], offset);
      c0[2] = hc_bytealign_be (w0[3], w1[0], offset);
      c0[1] = hc_bytealign_be (w0[2], w0[3], offset);
      c0[0] = hc_bytealign_be (w0[1], w0[2], offset);
      w7[3] = hc_bytealign_be (w0[0], w0[1], offset);
      w7[2] = hc_bytealign_be (    0, w0[0], offset);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      c7[3] = hc_bytealign_be (w7[3],     0, offset);
      c7[2] = hc_bytealign_be (w7[2], w7[3], offset);
      c7[1] = hc_bytealign_be (w7[1], w7[2], offset);
      c7[0] = hc_bytealign_be (w7[0], w7[1], offset);
      c6[3] = hc_bytealign_be (w6[3], w7[0], offset);
      c6[2] = hc_bytealign_be (w6[2], w6[3], offset);
      c6[1] = hc_bytealign_be (w6[1], w6[2], offset);
      c6[0] = hc_bytealign_be (w6[0], w6[1], offset);
      c5[3] = hc_bytealign_be (w5[3], w6[0], offset);
      c5[2] = hc_bytealign_be (w5[2], w5[3], offset);
      c5[1] = hc_bytealign_be (w5[1], w5[2], offset);
      c5[0] = hc_bytealign_be (w5[0], w5[1], offset);
      c4[3] = hc_bytealign_be (w4[3], w5[0], offset);
      c4[2] = hc_bytealign_be (w4[2], w4[3], offset);
      c4[1] = hc_bytealign_be (w4[1], w4[2], offset);
      c4[0] = hc_bytealign_be (w4[0], w4[1], offset);
      c3[3] = hc_bytealign_be (w3[3], w4[0], offset);
      c3[2] = hc_bytealign_be (w3[2], w3[3], offset);
      c3[1] = hc_bytealign_be (w3[1], w3[2], offset);
      c3[0] = hc_bytealign_be (w3[0], w3[1], offset);
      c2[3] = hc_bytealign_be (w2[3], w3[0], offset);
      c2[2] = hc_bytealign_be (w2[2], w2[3], offset);
      c2[1] = hc_bytealign_be (w2[1], w2[2], offset);
      c2[0] = hc_bytealign_be (w2[0], w2[1], offset);
      c1[3] = hc_bytealign_be (w1[3], w2[0], offset);
      c1[2] = hc_bytealign_be (w1[2], w1[3], offset);
      c1[1] = hc_bytealign_be (w1[1], w1[2], offset);
      c1[0] = hc_bytealign_be (w1[0], w1[1], offset);
      c0[3] = hc_bytealign_be (w0[3], w1[0], offset);
      c0[2] = hc_bytealign_be (w0[2], w0[3], offset);
      c0[1] = hc_bytealign_be (w0[1], w0[2], offset);
      c0[0] = hc_bytealign_be (w0[0], w0[1], offset);
      w7[3] = hc_bytealign_be (    0, w0[0], offset);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV

  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  #if (defined IS_AMD || defined IS_HIP)
  const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8));
  #endif

  switch (offset_switch)
  {
    case  0:
      c0[0] = hc_byte_perm (    0, w7[3], selector);
      w7[3] = hc_byte_perm (w7[3], w7[2], selector);
      w7[2] = hc_byte_perm (w7[2], w7[1], selector);
      w7[1] = hc_byte_perm (w7[1], w7[0], selector);
      w7[0] = hc_byte_perm (w7[0], w6[3], selector);
      w6[3] = hc_byte_perm (w6[3], w6[2], selector);
      w6[2] = hc_byte_perm (w6[2], w6[1], selector);
      w6[1] = hc_byte_perm (w6[1], w6[0], selector);
      w6[0] = hc_byte_perm (w6[0], w5[3], selector);
      w5[3] = hc_byte_perm (w5[3], w5[2], selector);
      w5[2] = hc_byte_perm (w5[2], w5[1], selector);
      w5[1] = hc_byte_perm (w5[1], w5[0], selector);
      w5[0] = hc_byte_perm (w5[0], w4[3], selector);
      w4[3] = hc_byte_perm (w4[3], w4[2], selector);
      w4[2] = hc_byte_perm (w4[2], w4[1], selector);
      w4[1] = hc_byte_perm (w4[1], w4[0], selector);
      w4[0] = hc_byte_perm (w4[0], w3[3], selector);
      w3[3] = hc_byte_perm (w3[3], w3[2], selector);
      w3[2] = hc_byte_perm (w3[2], w3[1], selector);
      w3[1] = hc_byte_perm (w3[1], w3[0], selector);
      w3[0] = hc_byte_perm (w3[0], w2[3], selector);
      w2[3] = hc_byte_perm (w2[3], w2[2], selector);
      w2[2] = hc_byte_perm (w2[2], w2[1], selector);
      w2[1] = hc_byte_perm (w2[1], w2[0], selector);
      w2[0] = hc_byte_perm (w2[0], w1[3], selector);
      w1[3] = hc_byte_perm (w1[3], w1[2], selector);
      w1[2] = hc_byte_perm (w1[2], w1[1], selector);
      w1[1] = hc_byte_perm (w1[1], w1[0], selector);
      w1[0] = hc_byte_perm (w1[0], w0[3], selector);
      w0[3] = hc_byte_perm (w0[3], w0[2], selector);
      w0[2] = hc_byte_perm (w0[2], w0[1], selector);
      w0[1] = hc_byte_perm (w0[1], w0[0], selector);
      w0[0] = hc_byte_perm (w0[0],     0, selector);

      break;

    case  1:
      c0[1] = hc_byte_perm (    0, w7[3], selector);
      c0[0] = hc_byte_perm (w7[3], w7[2], selector);
      w7[3] = hc_byte_perm (w7[2], w7[1], selector);
      w7[2] = hc_byte_perm (w7[1], w7[0], selector);
      w7[1] = hc_byte_perm (w7[0], w6[3], selector);
      w7[0] = hc_byte_perm (w6[3], w6[2], selector);
      w6[3] = hc_byte_perm (w6[2], w6[1], selector);
      w6[2] = hc_byte_perm (w6[1], w6[0], selector);
      w6[1] = hc_byte_perm (w6[0], w5[3], selector);
      w6[0] = hc_byte_perm (w5[3], w5[2], selector);
      w5[3] = hc_byte_perm (w5[2], w5[1], selector);
      w5[2] = hc_byte_perm (w5[1], w5[0], selector);
      w5[1] = hc_byte_perm (w5[0], w4[3], selector);
      w5[0] = hc_byte_perm (w4[3], w4[2], selector);
      w4[3] = hc_byte_perm (w4[2], w4[1], selector);
      w4[2] = hc_byte_perm (w4[1], w4[0], selector);
      w4[1] = hc_byte_perm (w4[0], w3[3], selector);
      w4[0] = hc_byte_perm (w3[3], w3[2], selector);
      w3[3] = hc_byte_perm (w3[2], w3[1], selector);
      w3[2] = hc_byte_perm (w3[1], w3[0], selector);
      w3[1] = hc_byte_perm (w3[0], w2[3], selector);
      w3[0] = hc_byte_perm (w2[3], w2[2], selector);
      w2[3] = hc_byte_perm (w2[2], w2[1], selector);
      w2[2] = hc_byte_perm (w2[1], w2[0], selector);
      w2[1] = hc_byte_perm (w2[0], w1[3], selector);
      w2[0] = hc_byte_perm (w1[3], w1[2], selector);
      w1[3] = hc_byte_perm (w1[2], w1[1], selector);
      w1[2] = hc_byte_perm (w1[1], w1[0], selector);
      w1[1] = hc_byte_perm (w1[0], w0[3], selector);
      w1[0] = hc_byte_perm (w0[3], w0[2], selector);
      w0[3] = hc_byte_perm (w0[2], w0[1], selector);
      w0[2] = hc_byte_perm (w0[1], w0[0], selector);
      w0[1] = hc_byte_perm (w0[0],     0, selector);
      w0[0] = 0;

      break;

    case  2:
      c0[2] = hc_byte_perm (    0, w7[3], selector);
      c0[1] = hc_byte_perm (w7[3], w7[2], selector);
      c0[0] = hc_byte_perm (w7[2], w7[1], selector);
      w7[3] = hc_byte_perm (w7[1], w7[0], selector);
      w7[2] = hc_byte_perm (w7[0], w6[3], selector);
      w7[1] = hc_byte_perm (w6[3], w6[2], selector);
      w7[0] = hc_byte_perm (w6[2], w6[1], selector);
      w6[3] = hc_byte_perm (w6[1], w6[0], selector);
      w6[2] = hc_byte_perm (w6[0], w5[3], selector);
      w6[1] = hc_byte_perm (w5[3], w5[2], selector);
      w6[0] = hc_byte_perm (w5[2], w5[1], selector);
      w5[3] = hc_byte_perm (w5[1], w5[0], selector);
      w5[2] = hc_byte_perm (w5[0], w4[3], selector);
      w5[1] = hc_byte_perm (w4[3], w4[2], selector);
      w5[0] = hc_byte_perm (w4[2], w4[1], selector);
      w4[3] = hc_byte_perm (w4[1], w4[0], selector);
      w4[2] = hc_byte_perm (w4[0], w3[3], selector);
      w4[1] = hc_byte_perm (w3[3], w3[2], selector);
      w4[0] = hc_byte_perm (w3[2], w3[1], selector);
      w3[3] = hc_byte_perm (w3[1], w3[0], selector);
      w3[2] = hc_byte_perm (w3[0], w2[3], selector);
      w3[1] = hc_byte_perm (w2[3], w2[2], selector);
      w3[0] = hc_byte_perm (w2[2], w2[1], selector);
      w2[3] = hc_byte_perm (w2[1], w2[0], selector);
      w2[2] = hc_byte_perm (w2[0], w1[3], selector);
      w2[1] = hc_byte_perm (w1[3], w1[2], selector);
      w2[0] = hc_byte_perm (w1[2], w1[1], selector);
      w1[3] = hc_byte_perm (w1[1], w1[0], selector);
      w1[2] = hc_byte_perm (w1[0], w0[3], selector);
      w1[1] = hc_byte_perm (w0[3], w0[2], selector);
      w1[0] = hc_byte_perm (w0[2], w0[1], selector);
      w0[3] = hc_byte_perm (w0[1], w0[0], selector);
      w0[2] = hc_byte_perm (w0[0],     0, selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  3:
      c0[3] = hc_byte_perm (    0, w7[3], selector);
      c0[2] = hc_byte_perm (w7[3], w7[2], selector);
      c0[1] = hc_byte_perm (w7[2], w7[1], selector);
      c0[0] = hc_byte_perm (w7[1], w7[0], selector);
      w7[3] = hc_byte_perm (w7[0], w6[3], selector);
      w7[2] = hc_byte_perm (w6[3], w6[2], selector);
      w7[1] = hc_byte_perm (w6[2], w6[1], selector);
      w7[0] = hc_byte_perm (w6[1], w6[0], selector);
      w6[3] = hc_byte_perm (w6[0], w5[3], selector);
      w6[2] = hc_byte_perm (w5[3], w5[2], selector);
      w6[1] = hc_byte_perm (w5[2], w5[1], selector);
      w6[0] = hc_byte_perm (w5[1], w5[0], selector);
      w5[3] = hc_byte_perm (w5[0], w4[3], selector);
      w5[2] = hc_byte_perm (w4[3], w4[2], selector);
      w5[1] = hc_byte_perm (w4[2], w4[1], selector);
      w5[0] = hc_byte_perm (w4[1], w4[0], selector);
      w4[3] = hc_byte_perm (w4[0], w3[3], selector);
      w4[2] = hc_byte_perm (w3[3], w3[2], selector);
      w4[1] = hc_byte_perm (w3[2], w3[1], selector);
      w4[0] = hc_byte_perm (w3[1], w3[0], selector);
      w3[3] = hc_byte_perm (w3[0], w2[3], selector);
      w3[2] = hc_byte_perm (w2[3], w2[2], selector);
      w3[1] = hc_byte_perm (w2[2], w2[1], selector);
      w3[0] = hc_byte_perm (w2[1], w2[0], selector);
      w2[3] = hc_byte_perm (w2[0], w1[3], selector);
      w2[2] = hc_byte_perm (w1[3], w1[2], selector);
      w2[1] = hc_byte_perm (w1[2], w1[1], selector);
      w2[0] = hc_byte_perm (w1[1], w1[0], selector);
      w1[3] = hc_byte_perm (w1[0], w0[3], selector);
      w1[2] = hc_byte_perm (w0[3], w0[2], selector);
      w1[1] = hc_byte_perm (w0[2], w0[1], selector);
      w1[0] = hc_byte_perm (w0[1], w0[0], selector);
      w0[3] = hc_byte_perm (w0[0],     0, selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  4:
      c1[0] = hc_byte_perm (    0, w7[3], selector);
      c0[3] = hc_byte_perm (w7[3], w7[2], selector);
      c0[2] = hc_byte_perm (w7[2], w7[1], selector);
      c0[1] = hc_byte_perm (w7[1], w7[0], selector);
      c0[0] = hc_byte_perm (w7[0], w6[3], selector);
      w7[3] = hc_byte_perm (w6[3], w6[2], selector);
      w7[2] = hc_byte_perm (w6[2], w6[1], selector);
      w7[1] = hc_byte_perm (w6[1], w6[0], selector);
      w7[0] = hc_byte_perm (w6[0], w5[3], selector);
      w6[3] = hc_byte_perm (w5[3], w5[2], selector);
      w6[2] = hc_byte_perm (w5[2], w5[1], selector);
      w6[1] = hc_byte_perm (w5[1], w5[0], selector);
      w6[0] = hc_byte_perm (w5[0], w4[3], selector);
      w5[3] = hc_byte_perm (w4[3], w4[2], selector);
      w5[2] = hc_byte_perm (w4[2], w4[1], selector);
      w5[1] = hc_byte_perm (w4[1], w4[0], selector);
      w5[0] = hc_byte_perm (w4[0], w3[3], selector);
      w4[3] = hc_byte_perm (w3[3], w3[2], selector);
      w4[2] = hc_byte_perm (w3[2], w3[1], selector);
      w4[1] = hc_byte_perm (w3[1], w3[0], selector);
      w4[0] = hc_byte_perm (w3[0], w2[3], selector);
      w3[3] = hc_byte_perm (w2[3], w2[2], selector);
      w3[2] = hc_byte_perm (w2[2], w2[1], selector);
      w3[1] = hc_byte_perm (w2[1], w2[0], selector);
      w3[0] = hc_byte_perm (w2[0], w1[3], selector);
      w2[3] = hc_byte_perm (w1[3], w1[2], selector);
      w2[2] = hc_byte_perm (w1[2], w1[1], selector);
      w2[1] = hc_byte_perm (w1[1], w1[0], selector);
      w2[0] = hc_byte_perm (w1[0], w0[3], selector);
      w1[3] = hc_byte_perm (w0[3], w0[2], selector);
      w1[2] = hc_byte_perm (w0[2], w0[1], selector);
      w1[1] = hc_byte_perm (w0[1], w0[0], selector);
      w1[0] = hc_byte_perm (w0[0],     0, selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  5:
      c1[1] = hc_byte_perm (    0, w7[3], selector);
      c1[0] = hc_byte_perm (w7[3], w7[2], selector);
      c0[3] = hc_byte_perm (w7[2], w7[1], selector);
      c0[2] = hc_byte_perm (w7[1], w7[0], selector);
      c0[1] = hc_byte_perm (w7[0], w6[3], selector);
      c0[0] = hc_byte_perm (w6[3], w6[2], selector);
      w7[3] = hc_byte_perm (w6[2], w6[1], selector);
      w7[2] = hc_byte_perm (w6[1], w6[0], selector);
      w7[1] = hc_byte_perm (w6[0], w5[3], selector);
      w7[0] = hc_byte_perm (w5[3], w5[2], selector);
      w6[3] = hc_byte_perm (w5[2], w5[1], selector);
      w6[2] = hc_byte_perm (w5[1], w5[0], selector);
      w6[1] = hc_byte_perm (w5[0], w4[3], selector);
      w6[0] = hc_byte_perm (w4[3], w4[2], selector);
      w5[3] = hc_byte_perm (w4[2], w4[1], selector);
      w5[2] = hc_byte_perm (w4[1], w4[0], selector);
      w5[1] = hc_byte_perm (w4[0], w3[3], selector);
      w5[0] = hc_byte_perm (w3[3], w3[2], selector);
      w4[3] = hc_byte_perm (w3[2], w3[1], selector);
      w4[2] = hc_byte_perm (w3[1], w3[0], selector);
      w4[1] = hc_byte_perm (w3[0], w2[3], selector);
      w4[0] = hc_byte_perm (w2[3], w2[2], selector);
      w3[3] = hc_byte_perm (w2[2], w2[1], selector);
      w3[2] = hc_byte_perm (w2[1], w2[0], selector);
      w3[1] = hc_byte_perm (w2[0], w1[3], selector);
      w3[0] = hc_byte_perm (w1[3], w1[2], selector);
      w2[3] = hc_byte_perm (w1[2], w1[1], selector);
      w2[2] = hc_byte_perm (w1[1], w1[0], selector);
      w2[1] = hc_byte_perm (w1[0], w0[3], selector);
      w2[0] = hc_byte_perm (w0[3], w0[2], selector);
      w1[3] = hc_byte_perm (w0[2], w0[1], selector);
      w1[2] = hc_byte_perm (w0[1], w0[0], selector);
      w1[1] = hc_byte_perm (w0[0],     0, selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  6:
      c1[2] = hc_byte_perm (    0, w7[3], selector);
      c1[1] = hc_byte_perm (w7[3], w7[2], selector);
      c1[0] = hc_byte_perm (w7[2], w7[1], selector);
      c0[3] = hc_byte_perm (w7[1], w7[0], selector);
      c0[2] = hc_byte_perm (w7[0], w6[3], selector);
      c0[1] = hc_byte_perm (w6[3], w6[2], selector);
      c0[0] = hc_byte_perm (w6[2], w6[1], selector);
      w7[3] = hc_byte_perm (w6[1], w6[0], selector);
      w7[2] = hc_byte_perm (w6[0], w5[3], selector);
      w7[1] = hc_byte_perm (w5[3], w5[2], selector);
      w7[0] = hc_byte_perm (w5[2], w5[1], selector);
      w6[3] = hc_byte_perm (w5[1], w5[0], selector);
      w6[2] = hc_byte_perm (w5[0], w4[3], selector);
      w6[1] = hc_byte_perm (w4[3], w4[2], selector);
      w6[0] = hc_byte_perm (w4[2], w4[1], selector);
      w5[3] = hc_byte_perm (w4[1], w4[0], selector);
      w5[2] = hc_byte_perm (w4[0], w3[3], selector);
      w5[1] = hc_byte_perm (w3[3], w3[2], selector);
      w5[0] = hc_byte_perm (w3[2], w3[1], selector);
      w4[3] = hc_byte_perm (w3[1], w3[0], selector);
      w4[2] = hc_byte_perm (w3[0], w2[3], selector);
      w4[1] = hc_byte_perm (w2[3], w2[2], selector);
      w4[0] = hc_byte_perm (w2[2], w2[1], selector);
      w3[3] = hc_byte_perm (w2[1], w2[0], selector);
      w3[2] = hc_byte_perm (w2[0], w1[3], selector);
      w3[1] = hc_byte_perm (w1[3], w1[2], selector);
      w3[0] = hc_byte_perm (w1[2], w1[1], selector);
      w2[3] = hc_byte_perm (w1[1], w1[0], selector);
      w2[2] = hc_byte_perm (w1[0], w0[3], selector);
      w2[1] = hc_byte_perm (w0[3], w0[2], selector);
      w2[0] = hc_byte_perm (w0[2], w0[1], selector);
      w1[3] = hc_byte_perm (w0[1], w0[0], selector);
      w1[2] = hc_byte_perm (w0[0],     0, selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  7:
      c1[3] = hc_byte_perm (    0, w7[3], selector);
      c1[2] = hc_byte_perm (w7[3], w7[2], selector);
      c1[1] = hc_byte_perm (w7[2], w7[1], selector);
      c1[0] = hc_byte_perm (w7[1], w7[0], selector);
      c0[3] = hc_byte_perm (w7[0], w6[3], selector);
      c0[2] = hc_byte_perm (w6[3], w6[2], selector);
      c0[1] = hc_byte_perm (w6[2], w6[1], selector);
      c0[0] = hc_byte_perm (w6[1], w6[0], selector);
      w7[3] = hc_byte_perm (w6[0], w5[3], selector);
      w7[2] = hc_byte_perm (w5[3], w5[2], selector);
      w7[1] = hc_byte_perm (w5[2], w5[1], selector);
      w7[0] = hc_byte_perm (w5[1], w5[0], selector);
      w6[3] = hc_byte_perm (w5[0], w4[3], selector);
      w6[2] = hc_byte_perm (w4[3], w4[2], selector);
      w6[1] = hc_byte_perm (w4[2], w4[1], selector);
      w6[0] = hc_byte_perm (w4[1], w4[0], selector);
      w5[3] = hc_byte_perm (w4[0], w3[3], selector);
      w5[2] = hc_byte_perm (w3[3], w3[2], selector);
      w5[1] = hc_byte_perm (w3[2], w3[1], selector);
      w5[0] = hc_byte_perm (w3[1], w3[0], selector);
      w4[3] = hc_byte_perm (w3[0], w2[3], selector);
      w4[2] = hc_byte_perm (w2[3], w2[2], selector);
      w4[1] = hc_byte_perm (w2[2], w2[1], selector);
      w4[0] = hc_byte_perm (w2[1], w2[0], selector);
      w3[3] = hc_byte_perm (w2[0], w1[3], selector);
      w3[2] = hc_byte_perm (w1[3], w1[2], selector);
      w3[1] = hc_byte_perm (w1[2], w1[1], selector);
      w3[0] = hc_byte_perm (w1[1], w1[0], selector);
      w2[3] = hc_byte_perm (w1[0], w0[3], selector);
      w2[2] = hc_byte_perm (w0[3], w0[2], selector);
      w2[1] = hc_byte_perm (w0[2], w0[1], selector);
      w2[0] = hc_byte_perm (w0[1], w0[0], selector);
      w1[3] = hc_byte_perm (w0[0],     0, selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  8:
      c2[0] = hc_byte_perm (    0, w7[3], selector);
      c1[3] = hc_byte_perm (w7[3], w7[2], selector);
      c1[2] = hc_byte_perm (w7[2], w7[1], selector);
      c1[1] = hc_byte_perm (w7[1], w7[0], selector);
      c1[0] = hc_byte_perm (w7[0], w6[3], selector);
      c0[3] = hc_byte_perm (w6[3], w6[2], selector);
      c0[2] = hc_byte_perm (w6[2], w6[1], selector);
      c0[1] = hc_byte_perm (w6[1], w6[0], selector);
      c0[0] = hc_byte_perm (w6[0], w5[3], selector);
      w7[3] = hc_byte_perm (w5[3], w5[2], selector);
      w7[2] = hc_byte_perm (w5[2], w5[1], selector);
      w7[1] = hc_byte_perm (w5[1], w5[0], selector);
      w7[0] = hc_byte_perm (w5[0], w4[3], selector);
      w6[3] = hc_byte_perm (w4[3], w4[2], selector);
      w6[2] = hc_byte_perm (w4[2], w4[1], selector);
      w6[1] = hc_byte_perm (w4[1], w4[0], selector);
      w6[0] = hc_byte_perm (w4[0], w3[3], selector);
      w5[3] = hc_byte_perm (w3[3], w3[2], selector);
      w5[2] = hc_byte_perm (w3[2], w3[1], selector);
      w5[1] = hc_byte_perm (w3[1], w3[0], selector);
      w5[0] = hc_byte_perm (w3[0], w2[3], selector);
      w4[3] = hc_byte_perm (w2[3], w2[2], selector);
      w4[2] = hc_byte_perm (w2[2], w2[1], selector);
      w4[1] = hc_byte_perm (w2[1], w2[0], selector);
      w4[0] = hc_byte_perm (w2[0], w1[3], selector);
      w3[3] = hc_byte_perm (w1[3], w1[2], selector);
      w3[2] = hc_byte_perm (w1[2], w1[1], selector);
      w3[1] = hc_byte_perm (w1[1], w1[0], selector);
      w3[0] = hc_byte_perm (w1[0], w0[3], selector);
      w2[3] = hc_byte_perm (w0[3], w0[2], selector);
      w2[2] = hc_byte_perm (w0[2], w0[1], selector);
      w2[1] = hc_byte_perm (w0[1], w0[0], selector);
      w2[0] = hc_byte_perm (w0[0],     0, selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case  9:
      c2[1] = hc_byte_perm (    0, w7[3], selector);
      c2[0] = hc_byte_perm (w7[3], w7[2], selector);
      c1[3] = hc_byte_perm (w7[2], w7[1], selector);
      c1[2] = hc_byte_perm (w7[1], w7[0], selector);
      c1[1] = hc_byte_perm (w7[0], w6[3], selector);
      c1[0] = hc_byte_perm (w6[3], w6[2], selector);
      c0[3] = hc_byte_perm (w6[2], w6[1], selector);
      c0[2] = hc_byte_perm (w6[1], w6[0], selector);
      c0[1] = hc_byte_perm (w6[0], w5[3], selector);
      c0[0] = hc_byte_perm (w5[3], w5[2], selector);
      w7[3] = hc_byte_perm (w5[2], w5[1], selector);
      w7[2] = hc_byte_perm (w5[1], w5[0], selector);
      w7[1] = hc_byte_perm (w5[0], w4[3], selector);
      w7[0] = hc_byte_perm (w4[3], w4[2], selector);
      w6[3] = hc_byte_perm (w4[2], w4[1], selector);
      w6[2] = hc_byte_perm (w4[1], w4[0], selector);
      w6[1] = hc_byte_perm (w4[0], w3[3], selector);
      w6[0] = hc_byte_perm (w3[3], w3[2], selector);
      w5[3] = hc_byte_perm (w3[2], w3[1], selector);
      w5[2] = hc_byte_perm (w3[1], w3[0], selector);
      w5[1] = hc_byte_perm (w3[0], w2[3], selector);
      w5[0] = hc_byte_perm (w2[3], w2[2], selector);
      w4[3] = hc_byte_perm (w2[2], w2[1], selector);
      w4[2] = hc_byte_perm (w2[1], w2[0], selector);
      w4[1] = hc_byte_perm (w2[0], w1[3], selector);
      w4[0] = hc_byte_perm (w1[3], w1[2], selector);
      w3[3] = hc_byte_perm (w1[2], w1[1], selector);
      w3[2] = hc_byte_perm (w1[1], w1[0], selector);
      w3[1] = hc_byte_perm (w1[0], w0[3], selector);
      w3[0] = hc_byte_perm (w0[3], w0[2], selector);
      w2[3] = hc_byte_perm (w0[2], w0[1], selector);
      w2[2] = hc_byte_perm (w0[1], w0[0], selector);
      w2[1] = hc_byte_perm (w0[0],     0, selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      c2[2] = hc_byte_perm (    0, w7[3], selector);
      c2[1] = hc_byte_perm (w7[3], w7[2], selector);
      c2[0] = hc_byte_perm (w7[2], w7[1], selector);
      c1[3] = hc_byte_perm (w7[1], w7[0], selector);
      c1[2] = hc_byte_perm (w7[0], w6[3], selector);
      c1[1] = hc_byte_perm (w6[3], w6[2], selector);
      c1[0] = hc_byte_perm (w6[2], w6[1], selector);
      c0[3] = hc_byte_perm (w6[1], w6[0], selector);
      c0[2] = hc_byte_perm (w6[0], w5[3], selector);
      c0[1] = hc_byte_perm (w5[3], w5[2], selector);
      c0[0] = hc_byte_perm (w5[2], w5[1], selector);
      w7[3] = hc_byte_perm (w5[1], w5[0], selector);
      w7[2] = hc_byte_perm (w5[0], w4[3], selector);
      w7[1] = hc_byte_perm (w4[3], w4[2], selector);
      w7[0] = hc_byte_perm (w4[2], w4[1], selector);
      w6[3] = hc_byte_perm (w4[1], w4[0], selector);
      w6[2] = hc_byte_perm (w4[0], w3[3], selector);
      w6[1] = hc_byte_perm (w3[3], w3[2], selector);
      w6[0] = hc_byte_perm (w3[2], w3[1], selector);
      w5[3] = hc_byte_perm (w3[1], w3[0], selector);
      w5[2] = hc_byte_perm (w3[0], w2[3], selector);
      w5[1] = hc_byte_perm (w2[3], w2[2], selector);
      w5[0] = hc_byte_perm (w2[2], w2[1], selector);
      w4[3] = hc_byte_perm (w2[1], w2[0], selector);
      w4[2] = hc_byte_perm (w2[0], w1[3], selector);
      w4[1] = hc_byte_perm (w1[3], w1[2], selector);
      w4[0] = hc_byte_perm (w1[2], w1[1], selector);
      w3[3] = hc_byte_perm (w1[1], w1[0], selector);
      w3[2] = hc_byte_perm (w1[0], w0[3], selector);
      w3[1] = hc_byte_perm (w0[3], w0[2], selector);
      w3[0] = hc_byte_perm (w0[2], w0[1], selector);
      w2[3] = hc_byte_perm (w0[1], w0[0], selector);
      w2[2] = hc_byte_perm (w0[0],     0, selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      c2[3] = hc_byte_perm (    0, w7[3], selector);
      c2[2] = hc_byte_perm (w7[3], w7[2], selector);
      c2[1] = hc_byte_perm (w7[2], w7[1], selector);
      c2[0] = hc_byte_perm (w7[1], w7[0], selector);
      c1[3] = hc_byte_perm (w7[0], w6[3], selector);
      c1[2] = hc_byte_perm (w6[3], w6[2], selector);
      c1[1] = hc_byte_perm (w6[2], w6[1], selector);
      c1[0] = hc_byte_perm (w6[1], w6[0], selector);
      c0[3] = hc_byte_perm (w6[0], w5[3], selector);
      c0[2] = hc_byte_perm (w5[3], w5[2], selector);
      c0[1] = hc_byte_perm (w5[2], w5[1], selector);
      c0[0] = hc_byte_perm (w5[1], w5[0], selector);
      w7[3] = hc_byte_perm (w5[0], w4[3], selector);
      w7[2] = hc_byte_perm (w4[3], w4[2], selector);
      w7[1] = hc_byte_perm (w4[2], w4[1], selector);
      w7[0] = hc_byte_perm (w4[1], w4[0], selector);
      w6[3] = hc_byte_perm (w4[0], w3[3], selector);
      w6[2] = hc_byte_perm (w3[3], w3[2], selector);
      w6[1] = hc_byte_perm (w3[2], w3[1], selector);
      w6[0] = hc_byte_perm (w3[1], w3[0], selector);
      w5[3] = hc_byte_perm (w3[0], w2[3], selector);
      w5[2] = hc_byte_perm (w2[3], w2[2], selector);
      w5[1] = hc_byte_perm (w2[2], w2[1], selector);
      w5[0] = hc_byte_perm (w2[1], w2[0], selector);
      w4[3] = hc_byte_perm (w2[0], w1[3], selector);
      w4[2] = hc_byte_perm (w1[3], w1[2], selector);
      w4[1] = hc_byte_perm (w1[2], w1[1], selector);
      w4[0] = hc_byte_perm (w1[1], w1[0], selector);
      w3[3] = hc_byte_perm (w1[0], w0[3], selector);
      w3[2] = hc_byte_perm (w0[3], w0[2], selector);
      w3[1] = hc_byte_perm (w0[2], w0[1], selector);
      w3[0] = hc_byte_perm (w0[1], w0[0], selector);
      w2[3] = hc_byte_perm (w0[0],     0, selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      c3[0] = hc_byte_perm (    0, w7[3], selector);
      c2[3] = hc_byte_perm (w7[3], w7[2], selector);
      c2[2] = hc_byte_perm (w7[2], w7[1], selector);
      c2[1] = hc_byte_perm (w7[1], w7[0], selector);
      c2[0] = hc_byte_perm (w7[0], w6[3], selector);
      c1[3] = hc_byte_perm (w6[3], w6[2], selector);
      c1[2] = hc_byte_perm (w6[2], w6[1], selector);
      c1[1] = hc_byte_perm (w6[1], w6[0], selector);
      c1[0] = hc_byte_perm (w6[0], w5[3], selector);
      c0[3] = hc_byte_perm (w5[3], w5[2], selector);
      c0[2] = hc_byte_perm (w5[2], w5[1], selector);
      c0[1] = hc_byte_perm (w5[1], w5[0], selector);
      c0[0] = hc_byte_perm (w5[0], w4[3], selector);
      w7[3] = hc_byte_perm (w4[3], w4[2], selector);
      w7[2] = hc_byte_perm (w4[2], w4[1], selector);
      w7[1] = hc_byte_perm (w4[1], w4[0], selector);
      w7[0] = hc_byte_perm (w4[0], w3[3], selector);
      w6[3] = hc_byte_perm (w3[3], w3[2], selector);
      w6[2] = hc_byte_perm (w3[2], w3[1], selector);
      w6[1] = hc_byte_perm (w3[1], w3[0], selector);
      w6[0] = hc_byte_perm (w3[0], w2[3], selector);
      w5[3] = hc_byte_perm (w2[3], w2[2], selector);
      w5[2] = hc_byte_perm (w2[2], w2[1], selector);
      w5[1] = hc_byte_perm (w2[1], w2[0], selector);
      w5[0] = hc_byte_perm (w2[0], w1[3], selector);
      w4[3] = hc_byte_perm (w1[3], w1[2], selector);
      w4[2] = hc_byte_perm (w1[2], w1[1], selector);
      w4[1] = hc_byte_perm (w1[1], w1[0], selector);
      w4[0] = hc_byte_perm (w1[0], w0[3], selector);
      w3[3] = hc_byte_perm (w0[3], w0[2], selector);
      w3[2] = hc_byte_perm (w0[2], w0[1], selector);
      w3[1] = hc_byte_perm (w0[1], w0[0], selector);
      w3[0] = hc_byte_perm (w0[0],     0, selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      c3[1] = hc_byte_perm (    0, w7[3], selector);
      c3[0] = hc_byte_perm (w7[3], w7[2], selector);
      c2[3] = hc_byte_perm (w7[2], w7[1], selector);
      c2[2] = hc_byte_perm (w7[1], w7[0], selector);
      c2[1] = hc_byte_perm (w7[0], w6[3], selector);
      c2[0] = hc_byte_perm (w6[3], w6[2], selector);
      c1[3] = hc_byte_perm (w6[2], w6[1], selector);
      c1[2] = hc_byte_perm (w6[1], w6[0], selector);
      c1[1] = hc_byte_perm (w6[0], w5[3], selector);
      c1[0] = hc_byte_perm (w5[3], w5[2], selector);
      c0[3] = hc_byte_perm (w5[2], w5[1], selector);
      c0[2] = hc_byte_perm (w5[1], w5[0], selector);
      c0[1] = hc_byte_perm (w5[0], w4[3], selector);
      c0[0] = hc_byte_perm (w4[3], w4[2], selector);
      w7[3] = hc_byte_perm (w4[2], w4[1], selector);
      w7[2] = hc_byte_perm (w4[1], w4[0], selector);
      w7[1] = hc_byte_perm (w4[0], w3[3], selector);
      w7[0] = hc_byte_perm (w3[3], w3[2], selector);
      w6[3] = hc_byte_perm (w3[2], w3[1], selector);
      w6[2] = hc_byte_perm (w3[1], w3[0], selector);
      w6[1] = hc_byte_perm (w3[0], w2[3], selector);
      w6[0] = hc_byte_perm (w2[3], w2[2], selector);
      w5[3] = hc_byte_perm (w2[2], w2[1], selector);
      w5[2] = hc_byte_perm (w2[1], w2[0], selector);
      w5[1] = hc_byte_perm (w2[0], w1[3], selector);
      w5[0] = hc_byte_perm (w1[3], w1[2], selector);
      w4[3] = hc_byte_perm (w1[2], w1[1], selector);
      w4[2] = hc_byte_perm (w1[1], w1[0], selector);
      w4[1] = hc_byte_perm (w1[0], w0[3], selector);
      w4[0] = hc_byte_perm (w0[3], w0[2], selector);
      w3[3] = hc_byte_perm (w0[2], w0[1], selector);
      w3[2] = hc_byte_perm (w0[1], w0[0], selector);
      w3[1] = hc_byte_perm (w0[0],     0, selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      c3[2] = hc_byte_perm (    0, w7[3], selector);
      c3[1] = hc_byte_perm (w7[3], w7[2], selector);
      c3[0] = hc_byte_perm (w7[2], w7[1], selector);
      c2[3] = hc_byte_perm (w7[1], w7[0], selector);
      c2[2] = hc_byte_perm (w7[0], w6[3], selector);
      c2[1] = hc_byte_perm (w6[3], w6[2], selector);
      c2[0] = hc_byte_perm (w6[2], w6[1], selector);
      c1[3] = hc_byte_perm (w6[1], w6[0], selector);
      c1[2] = hc_byte_perm (w6[0], w5[3], selector);
      c1[1] = hc_byte_perm (w5[3], w5[2], selector);
      c1[0] = hc_byte_perm (w5[2], w5[1], selector);
      c0[3] = hc_byte_perm (w5[1], w5[0], selector);
      c0[2] = hc_byte_perm (w5[0], w4[3], selector);
      c0[1] = hc_byte_perm (w4[3], w4[2], selector);
      c0[0] = hc_byte_perm (w4[2], w4[1], selector);
      w7[3] = hc_byte_perm (w4[1], w4[0], selector);
      w7[2] = hc_byte_perm (w4[0], w3[3], selector);
      w7[1] = hc_byte_perm (w3[3], w3[2], selector);
      w7[0] = hc_byte_perm (w3[2], w3[1], selector);
      w6[3] = hc_byte_perm (w3[1], w3[0], selector);
      w6[2] = hc_byte_perm (w3[0], w2[3], selector);
      w6[1] = hc_byte_perm (w2[3], w2[2], selector);
      w6[0] = hc_byte_perm (w2[2], w2[1], selector);
      w5[3] = hc_byte_perm (w2[1], w2[0], selector);
      w5[2] = hc_byte_perm (w2[0], w1[3], selector);
      w5[1] = hc_byte_perm (w1[3], w1[2], selector);
      w5[0] = hc_byte_perm (w1[2], w1[1], selector);
      w4[3] = hc_byte_perm (w1[1], w1[0], selector);
      w4[2] = hc_byte_perm (w1[0], w0[3], selector);
      w4[1] = hc_byte_perm (w0[3], w0[2], selector);
      w4[0] = hc_byte_perm (w0[2], w0[1], selector);
      w3[3] = hc_byte_perm (w0[1], w0[0], selector);
      w3[2] = hc_byte_perm (w0[0],     0, selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      c3[3] = hc_byte_perm (    0, w7[3], selector);
      c3[2] = hc_byte_perm (w7[3], w7[2], selector);
      c3[1] = hc_byte_perm (w7[2], w7[1], selector);
      c3[0] = hc_byte_perm (w7[1], w7[0], selector);
      c2[3] = hc_byte_perm (w7[0], w6[3], selector);
      c2[2] = hc_byte_perm (w6[3], w6[2], selector);
      c2[1] = hc_byte_perm (w6[2], w6[1], selector);
      c2[0] = hc_byte_perm (w6[1], w6[0], selector);
      c1[3] = hc_byte_perm (w6[0], w5[3], selector);
      c1[2] = hc_byte_perm (w5[3], w5[2], selector);
      c1[1] = hc_byte_perm (w5[2], w5[1], selector);
      c1[0] = hc_byte_perm (w5[1], w5[0], selector);
      c0[3] = hc_byte_perm (w5[0], w4[3], selector);
      c0[2] = hc_byte_perm (w4[3], w4[2], selector);
      c0[1] = hc_byte_perm (w4[2], w4[1], selector);
      c0[0] = hc_byte_perm (w4[1], w4[0], selector);
      w7[3] = hc_byte_perm (w4[0], w3[3], selector);
      w7[2] = hc_byte_perm (w3[3], w3[2], selector);
      w7[1] = hc_byte_perm (w3[2], w3[1], selector);
      w7[0] = hc_byte_perm (w3[1], w3[0], selector);
      w6[3] = hc_byte_perm (w3[0], w2[3], selector);
      w6[2] = hc_byte_perm (w2[3], w2[2], selector);
      w6[1] = hc_byte_perm (w2[2], w2[1], selector);
      w6[0] = hc_byte_perm (w2[1], w2[0], selector);
      w5[3] = hc_byte_perm (w2[0], w1[3], selector);
      w5[2] = hc_byte_perm (w1[3], w1[2], selector);
      w5[1] = hc_byte_perm (w1[2], w1[1], selector);
      w5[0] = hc_byte_perm (w1[1], w1[0], selector);
      w4[3] = hc_byte_perm (w1[0], w0[3], selector);
      w4[2] = hc_byte_perm (w0[3], w0[2], selector);
      w4[1] = hc_byte_perm (w0[2], w0[1], selector);
      w4[0] = hc_byte_perm (w0[1], w0[0], selector);
      w3[3] = hc_byte_perm (w0[0],     0, selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 16:
      c4[0] = hc_byte_perm (    0, w7[3], selector);
      c3[3] = hc_byte_perm (w7[3], w7[2], selector);
      c3[2] = hc_byte_perm (w7[2], w7[1], selector);
      c3[1] = hc_byte_perm (w7[1], w7[0], selector);
      c3[0] = hc_byte_perm (w7[0], w6[3], selector);
      c2[3] = hc_byte_perm (w6[3], w6[2], selector);
      c2[2] = hc_byte_perm (w6[2], w6[1], selector);
      c2[1] = hc_byte_perm (w6[1], w6[0], selector);
      c2[0] = hc_byte_perm (w6[0], w5[3], selector);
      c1[3] = hc_byte_perm (w5[3], w5[2], selector);
      c1[2] = hc_byte_perm (w5[2], w5[1], selector);
      c1[1] = hc_byte_perm (w5[1], w5[0], selector);
      c1[0] = hc_byte_perm (w5[0], w4[3], selector);
      c0[3] = hc_byte_perm (w4[3], w4[2], selector);
      c0[2] = hc_byte_perm (w4[2], w4[1], selector);
      c0[1] = hc_byte_perm (w4[1], w4[0], selector);
      c0[0] = hc_byte_perm (w4[0], w3[3], selector);
      w7[3] = hc_byte_perm (w3[3], w3[2], selector);
      w7[2] = hc_byte_perm (w3[2], w3[1], selector);
      w7[1] = hc_byte_perm (w3[1], w3[0], selector);
      w7[0] = hc_byte_perm (w3[0], w2[3], selector);
      w6[3] = hc_byte_perm (w2[3], w2[2], selector);
      w6[2] = hc_byte_perm (w2[2], w2[1], selector);
      w6[1] = hc_byte_perm (w2[1], w2[0], selector);
      w6[0] = hc_byte_perm (w2[0], w1[3], selector);
      w5[3] = hc_byte_perm (w1[3], w1[2], selector);
      w5[2] = hc_byte_perm (w1[2], w1[1], selector);
      w5[1] = hc_byte_perm (w1[1], w1[0], selector);
      w5[0] = hc_byte_perm (w1[0], w0[3], selector);
      w4[3] = hc_byte_perm (w0[3], w0[2], selector);
      w4[2] = hc_byte_perm (w0[2], w0[1], selector);
      w4[1] = hc_byte_perm (w0[1], w0[0], selector);
      w4[0] = hc_byte_perm (w0[0],     0, selector);
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 17:
      c4[1] = hc_byte_perm (    0, w7[3], selector);
      c4[0] = hc_byte_perm (w7[3], w7[2], selector);
      c3[3] = hc_byte_perm (w7[2], w7[1], selector);
      c3[2] = hc_byte_perm (w7[1], w7[0], selector);
      c3[1] = hc_byte_perm (w7[0], w6[3], selector);
      c3[0] = hc_byte_perm (w6[3], w6[2], selector);
      c2[3] = hc_byte_perm (w6[2], w6[1], selector);
      c2[2] = hc_byte_perm (w6[1], w6[0], selector);
      c2[1] = hc_byte_perm (w6[0], w5[3], selector);
      c2[0] = hc_byte_perm (w5[3], w5[2], selector);
      c1[3] = hc_byte_perm (w5[2], w5[1], selector);
      c1[2] = hc_byte_perm (w5[1], w5[0], selector);
      c1[1] = hc_byte_perm (w5[0], w4[3], selector);
      c1[0] = hc_byte_perm (w4[3], w4[2], selector);
      c0[3] = hc_byte_perm (w4[2], w4[1], selector);
      c0[2] = hc_byte_perm (w4[1], w4[0], selector);
      c0[1] = hc_byte_perm (w4[0], w3[3], selector);
      c0[0] = hc_byte_perm (w3[3], w3[2], selector);
      w7[3] = hc_byte_perm (w3[2], w3[1], selector);
      w7[2] = hc_byte_perm (w3[1], w3[0], selector);
      w7[1] = hc_byte_perm (w3[0], w2[3], selector);
      w7[0] = hc_byte_perm (w2[3], w2[2], selector);
      w6[3] = hc_byte_perm (w2[2], w2[1], selector);
      w6[2] = hc_byte_perm (w2[1], w2[0], selector);
      w6[1] = hc_byte_perm (w2[0], w1[3], selector);
      w6[0] = hc_byte_perm (w1[3], w1[2], selector);
      w5[3] = hc_byte_perm (w1[2], w1[1], selector);
      w5[2] = hc_byte_perm (w1[1], w1[0], selector);
      w5[1] = hc_byte_perm (w1[0], w0[3], selector);
      w5[0] = hc_byte_perm (w0[3], w0[2], selector);
      w4[3] = hc_byte_perm (w0[2], w0[1], selector);
      w4[2] = hc_byte_perm (w0[1], w0[0], selector);
      w4[1] = hc_byte_perm (w0[0],     0, selector);
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 18:
      c4[2] = hc_byte_perm (    0, w7[3], selector);
      c4[1] = hc_byte_perm (w7[3], w7[2], selector);
      c4[0] = hc_byte_perm (w7[2], w7[1], selector);
      c3[3] = hc_byte_perm (w7[1], w7[0], selector);
      c3[2] = hc_byte_perm (w7[0], w6[3], selector);
      c3[1] = hc_byte_perm (w6[3], w6[2], selector);
      c3[0] = hc_byte_perm (w6[2], w6[1], selector);
      c2[3] = hc_byte_perm (w6[1], w6[0], selector);
      c2[2] = hc_byte_perm (w6[0], w5[3], selector);
      c2[1] = hc_byte_perm (w5[3], w5[2], selector);
      c2[0] = hc_byte_perm (w5[2], w5[1], selector);
      c1[3] = hc_byte_perm (w5[1], w5[0], selector);
      c1[2] = hc_byte_perm (w5[0], w4[3], selector);
      c1[1] = hc_byte_perm (w4[3], w4[2], selector);
      c1[0] = hc_byte_perm (w4[2], w4[1], selector);
      c0[3] = hc_byte_perm (w4[1], w4[0], selector);
      c0[2] = hc_byte_perm (w4[0], w3[3], selector);
      c0[1] = hc_byte_perm (w3[3], w3[2], selector);
      c0[0] = hc_byte_perm (w3[2], w3[1], selector);
      w7[3] = hc_byte_perm (w3[1], w3[0], selector);
      w7[2] = hc_byte_perm (w3[0], w2[3], selector);
      w7[1] = hc_byte_perm (w2[3], w2[2], selector);
      w7[0] = hc_byte_perm (w2[2], w2[1], selector);
      w6[3] = hc_byte_perm (w2[1], w2[0], selector);
      w6[2] = hc_byte_perm (w2[0], w1[3], selector);
      w6[1] = hc_byte_perm (w1[3], w1[2], selector);
      w6[0] = hc_byte_perm (w1[2], w1[1], selector);
      w5[3] = hc_byte_perm (w1[1], w1[0], selector);
      w5[2] = hc_byte_perm (w1[0], w0[3], selector);
      w5[1] = hc_byte_perm (w0[3], w0[2], selector);
      w5[0] = hc_byte_perm (w0[2], w0[1], selector);
      w4[3] = hc_byte_perm (w0[1], w0[0], selector);
      w4[2] = hc_byte_perm (w0[0],     0, selector);
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 19:
      c4[3] = hc_byte_perm (    0, w7[3], selector);
      c4[2] = hc_byte_perm (w7[3], w7[2], selector);
      c4[1] = hc_byte_perm (w7[2], w7[1], selector);
      c4[0] = hc_byte_perm (w7[1], w7[0], selector);
      c3[3] = hc_byte_perm (w7[0], w6[3], selector);
      c3[2] = hc_byte_perm (w6[3], w6[2], selector);
      c3[1] = hc_byte_perm (w6[2], w6[1], selector);
      c3[0] = hc_byte_perm (w6[1], w6[0], selector);
      c2[3] = hc_byte_perm (w6[0], w5[3], selector);
      c2[2] = hc_byte_perm (w5[3], w5[2], selector);
      c2[1] = hc_byte_perm (w5[2], w5[1], selector);
      c2[0] = hc_byte_perm (w5[1], w5[0], selector);
      c1[3] = hc_byte_perm (w5[0], w4[3], selector);
      c1[2] = hc_byte_perm (w4[3], w4[2], selector);
      c1[1] = hc_byte_perm (w4[2], w4[1], selector);
      c1[0] = hc_byte_perm (w4[1], w4[0], selector);
      c0[3] = hc_byte_perm (w4[0], w3[3], selector);
      c0[2] = hc_byte_perm (w3[3], w3[2], selector);
      c0[1] = hc_byte_perm (w3[2], w3[1], selector);
      c0[0] = hc_byte_perm (w3[1], w3[0], selector);
      w7[3] = hc_byte_perm (w3[0], w2[3], selector);
      w7[2] = hc_byte_perm (w2[3], w2[2], selector);
      w7[1] = hc_byte_perm (w2[2], w2[1], selector);
      w7[0] = hc_byte_perm (w2[1], w2[0], selector);
      w6[3] = hc_byte_perm (w2[0], w1[3], selector);
      w6[2] = hc_byte_perm (w1[3], w1[2], selector);
      w6[1] = hc_byte_perm (w1[2], w1[1], selector);
      w6[0] = hc_byte_perm (w1[1], w1[0], selector);
      w5[3] = hc_byte_perm (w1[0], w0[3], selector);
      w5[2] = hc_byte_perm (w0[3], w0[2], selector);
      w5[1] = hc_byte_perm (w0[2], w0[1], selector);
      w5[0] = hc_byte_perm (w0[1], w0[0], selector);
      w4[3] = hc_byte_perm (w0[0],     0, selector);
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 20:
      c5[0] = hc_byte_perm (    0, w7[3], selector);
      c4[3] = hc_byte_perm (w7[3], w7[2], selector);
      c4[2] = hc_byte_perm (w7[2], w7[1], selector);
      c4[1] = hc_byte_perm (w7[1], w7[0], selector);
      c4[0] = hc_byte_perm (w7[0], w6[3], selector);
      c3[3] = hc_byte_perm (w6[3], w6[2], selector);
      c3[2] = hc_byte_perm (w6[2], w6[1], selector);
      c3[1] = hc_byte_perm (w6[1], w6[0], selector);
      c3[0] = hc_byte_perm (w6[0], w5[3], selector);
      c2[3] = hc_byte_perm (w5[3], w5[2], selector);
      c2[2] = hc_byte_perm (w5[2], w5[1], selector);
      c2[1] = hc_byte_perm (w5[1], w5[0], selector);
      c2[0] = hc_byte_perm (w5[0], w4[3], selector);
      c1[3] = hc_byte_perm (w4[3], w4[2], selector);
      c1[2] = hc_byte_perm (w4[2], w4[1], selector);
      c1[1] = hc_byte_perm (w4[1], w4[0], selector);
      c1[0] = hc_byte_perm (w4[0], w3[3], selector);
      c0[3] = hc_byte_perm (w3[3], w3[2], selector);
      c0[2] = hc_byte_perm (w3[2], w3[1], selector);
      c0[1] = hc_byte_perm (w3[1], w3[0], selector);
      c0[0] = hc_byte_perm (w3[0], w2[3], selector);
      w7[3] = hc_byte_perm (w2[3], w2[2], selector);
      w7[2] = hc_byte_perm (w2[2], w2[1], selector);
      w7[1] = hc_byte_perm (w2[1], w2[0], selector);
      w7[0] = hc_byte_perm (w2[0], w1[3], selector);
      w6[3] = hc_byte_perm (w1[3], w1[2], selector);
      w6[2] = hc_byte_perm (w1[2], w1[1], selector);
      w6[1] = hc_byte_perm (w1[1], w1[0], selector);
      w6[0] = hc_byte_perm (w1[0], w0[3], selector);
      w5[3] = hc_byte_perm (w0[3], w0[2], selector);
      w5[2] = hc_byte_perm (w0[2], w0[1], selector);
      w5[1] = hc_byte_perm (w0[1], w0[0], selector);
      w5[0] = hc_byte_perm (w0[0],     0, selector);
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 21:
      c5[1] = hc_byte_perm (    0, w7[3], selector);
      c5[0] = hc_byte_perm (w7[3], w7[2], selector);
      c4[3] = hc_byte_perm (w7[2], w7[1], selector);
      c4[2] = hc_byte_perm (w7[1], w7[0], selector);
      c4[1] = hc_byte_perm (w7[0], w6[3], selector);
      c4[0] = hc_byte_perm (w6[3], w6[2], selector);
      c3[3] = hc_byte_perm (w6[2], w6[1], selector);
      c3[2] = hc_byte_perm (w6[1], w6[0], selector);
      c3[1] = hc_byte_perm (w6[0], w5[3], selector);
      c3[0] = hc_byte_perm (w5[3], w5[2], selector);
      c2[3] = hc_byte_perm (w5[2], w5[1], selector);
      c2[2] = hc_byte_perm (w5[1], w5[0], selector);
      c2[1] = hc_byte_perm (w5[0], w4[3], selector);
      c2[0] = hc_byte_perm (w4[3], w4[2], selector);
      c1[3] = hc_byte_perm (w4[2], w4[1], selector);
      c1[2] = hc_byte_perm (w4[1], w4[0], selector);
      c1[1] = hc_byte_perm (w4[0], w3[3], selector);
      c1[0] = hc_byte_perm (w3[3], w3[2], selector);
      c0[3] = hc_byte_perm (w3[2], w3[1], selector);
      c0[2] = hc_byte_perm (w3[1], w3[0], selector);
      c0[1] = hc_byte_perm (w3[0], w2[3], selector);
      c0[0] = hc_byte_perm (w2[3], w2[2], selector);
      w7[3] = hc_byte_perm (w2[2], w2[1], selector);
      w7[2] = hc_byte_perm (w2[1], w2[0], selector);
      w7[1] = hc_byte_perm (w2[0], w1[3], selector);
      w7[0] = hc_byte_perm (w1[3], w1[2], selector);
      w6[3] = hc_byte_perm (w1[2], w1[1], selector);
      w6[2] = hc_byte_perm (w1[1], w1[0], selector);
      w6[1] = hc_byte_perm (w1[0], w0[3], selector);
      w6[0] = hc_byte_perm (w0[3], w0[2], selector);
      w5[3] = hc_byte_perm (w0[2], w0[1], selector);
      w5[2] = hc_byte_perm (w0[1], w0[0], selector);
      w5[1] = hc_byte_perm (w0[0],     0, selector);
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 22:
      c5[2] = hc_byte_perm (    0, w7[3], selector);
      c5[1] = hc_byte_perm (w7[3], w7[2], selector);
      c5[0] = hc_byte_perm (w7[2], w7[1], selector);
      c4[3] = hc_byte_perm (w7[1], w7[0], selector);
      c4[2] = hc_byte_perm (w7[0], w6[3], selector);
      c4[1] = hc_byte_perm (w6[3], w6[2], selector);
      c4[0] = hc_byte_perm (w6[2], w6[1], selector);
      c3[3] = hc_byte_perm (w6[1], w6[0], selector);
      c3[2] = hc_byte_perm (w6[0], w5[3], selector);
      c3[1] = hc_byte_perm (w5[3], w5[2], selector);
      c3[0] = hc_byte_perm (w5[2], w5[1], selector);
      c2[3] = hc_byte_perm (w5[1], w5[0], selector);
      c2[2] = hc_byte_perm (w5[0], w4[3], selector);
      c2[1] = hc_byte_perm (w4[3], w4[2], selector);
      c2[0] = hc_byte_perm (w4[2], w4[1], selector);
      c1[3] = hc_byte_perm (w4[1], w4[0], selector);
      c1[2] = hc_byte_perm (w4[0], w3[3], selector);
      c1[1] = hc_byte_perm (w3[3], w3[2], selector);
      c1[0] = hc_byte_perm (w3[2], w3[1], selector);
      c0[3] = hc_byte_perm (w3[1], w3[0], selector);
      c0[2] = hc_byte_perm (w3[0], w2[3], selector);
      c0[1] = hc_byte_perm (w2[3], w2[2], selector);
      c0[0] = hc_byte_perm (w2[2], w2[1], selector);
      w7[3] = hc_byte_perm (w2[1], w2[0], selector);
      w7[2] = hc_byte_perm (w2[0], w1[3], selector);
      w7[1] = hc_byte_perm (w1[3], w1[2], selector);
      w7[0] = hc_byte_perm (w1[2], w1[1], selector);
      w6[3] = hc_byte_perm (w1[1], w1[0], selector);
      w6[2] = hc_byte_perm (w1[0], w0[3], selector);
      w6[1] = hc_byte_perm (w0[3], w0[2], selector);
      w6[0] = hc_byte_perm (w0[2], w0[1], selector);
      w5[3] = hc_byte_perm (w0[1], w0[0], selector);
      w5[2] = hc_byte_perm (w0[0],     0, selector);
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 23:
      c5[3] = hc_byte_perm (    0, w7[3], selector);
      c5[2] = hc_byte_perm (w7[3], w7[2], selector);
      c5[1] = hc_byte_perm (w7[2], w7[1], selector);
      c5[0] = hc_byte_perm (w7[1], w7[0], selector);
      c4[3] = hc_byte_perm (w7[0], w6[3], selector);
      c4[2] = hc_byte_perm (w6[3], w6[2], selector);
      c4[1] = hc_byte_perm (w6[2], w6[1], selector);
      c4[0] = hc_byte_perm (w6[1], w6[0], selector);
      c3[3] = hc_byte_perm (w6[0], w5[3], selector);
      c3[2] = hc_byte_perm (w5[3], w5[2], selector);
      c3[1] = hc_byte_perm (w5[2], w5[1], selector);
      c3[0] = hc_byte_perm (w5[1], w5[0], selector);
      c2[3] = hc_byte_perm (w5[0], w4[3], selector);
      c2[2] = hc_byte_perm (w4[3], w4[2], selector);
      c2[1] = hc_byte_perm (w4[2], w4[1], selector);
      c2[0] = hc_byte_perm (w4[1], w4[0], selector);
      c1[3] = hc_byte_perm (w4[0], w3[3], selector);
      c1[2] = hc_byte_perm (w3[3], w3[2], selector);
      c1[1] = hc_byte_perm (w3[2], w3[1], selector);
      c1[0] = hc_byte_perm (w3[1], w3[0], selector);
      c0[3] = hc_byte_perm (w3[0], w2[3], selector);
      c0[2] = hc_byte_perm (w2[3], w2[2], selector);
      c0[1] = hc_byte_perm (w2[2], w2[1], selector);
      c0[0] = hc_byte_perm (w2[1], w2[0], selector);
      w7[3] = hc_byte_perm (w2[0], w1[3], selector);
      w7[2] = hc_byte_perm (w1[3], w1[2], selector);
      w7[1] = hc_byte_perm (w1[2], w1[1], selector);
      w7[0] = hc_byte_perm (w1[1], w1[0], selector);
      w6[3] = hc_byte_perm (w1[0], w0[3], selector);
      w6[2] = hc_byte_perm (w0[3], w0[2], selector);
      w6[1] = hc_byte_perm (w0[2], w0[1], selector);
      w6[0] = hc_byte_perm (w0[1], w0[0], selector);
      w5[3] = hc_byte_perm (w0[0],     0, selector);
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 24:
      c6[0] = hc_byte_perm (    0, w7[3], selector);
      c5[3] = hc_byte_perm (w7[3], w7[2], selector);
      c5[2] = hc_byte_perm (w7[2], w7[1], selector);
      c5[1] = hc_byte_perm (w7[1], w7[0], selector);
      c5[0] = hc_byte_perm (w7[0], w6[3], selector);
      c4[3] = hc_byte_perm (w6[3], w6[2], selector);
      c4[2] = hc_byte_perm (w6[2], w6[1], selector);
      c4[1] = hc_byte_perm (w6[1], w6[0], selector);
      c4[0] = hc_byte_perm (w6[0], w5[3], selector);
      c3[3] = hc_byte_perm (w5[3], w5[2], selector);
      c3[2] = hc_byte_perm (w5[2], w5[1], selector);
      c3[1] = hc_byte_perm (w5[1], w5[0], selector);
      c3[0] = hc_byte_perm (w5[0], w4[3], selector);
      c2[3] = hc_byte_perm (w4[3], w4[2], selector);
      c2[2] = hc_byte_perm (w4[2], w4[1], selector);
      c2[1] = hc_byte_perm (w4[1], w4[0], selector);
      c2[0] = hc_byte_perm (w4[0], w3[3], selector);
      c1[3] = hc_byte_perm (w3[3], w3[2], selector);
      c1[2] = hc_byte_perm (w3[2], w3[1], selector);
      c1[1] = hc_byte_perm (w3[1], w3[0], selector);
      c1[0] = hc_byte_perm (w3[0], w2[3], selector);
      c0[3] = hc_byte_perm (w2[3], w2[2], selector);
      c0[2] = hc_byte_perm (w2[2], w2[1], selector);
      c0[1] = hc_byte_perm (w2[1], w2[0], selector);
      c0[0] = hc_byte_perm (w2[0], w1[3], selector);
      w7[3] = hc_byte_perm (w1[3], w1[2], selector);
      w7[2] = hc_byte_perm (w1[2], w1[1], selector);
      w7[1] = hc_byte_perm (w1[1], w1[0], selector);
      w7[0] = hc_byte_perm (w1[0], w0[3], selector);
      w6[3] = hc_byte_perm (w0[3], w0[2], selector);
      w6[2] = hc_byte_perm (w0[2], w0[1], selector);
      w6[1] = hc_byte_perm (w0[1], w0[0], selector);
      w6[0] = hc_byte_perm (w0[0],     0, selector);
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 25:
      c6[1] = hc_byte_perm (    0, w7[3], selector);
      c6[0] = hc_byte_perm (w7[3], w7[2], selector);
      c5[3] = hc_byte_perm (w7[2], w7[1], selector);
      c5[2] = hc_byte_perm (w7[1], w7[0], selector);
      c5[1] = hc_byte_perm (w7[0], w6[3], selector);
      c5[0] = hc_byte_perm (w6[3], w6[2], selector);
      c4[3] = hc_byte_perm (w6[2], w6[1], selector);
      c4[2] = hc_byte_perm (w6[1], w6[0], selector);
      c4[1] = hc_byte_perm (w6[0], w5[3], selector);
      c4[0] = hc_byte_perm (w5[3], w5[2], selector);
      c3[3] = hc_byte_perm (w5[2], w5[1], selector);
      c3[2] = hc_byte_perm (w5[1], w5[0], selector);
      c3[1] = hc_byte_perm (w5[0], w4[3], selector);
      c3[0] = hc_byte_perm (w4[3], w4[2], selector);
      c2[3] = hc_byte_perm (w4[2], w4[1], selector);
      c2[2] = hc_byte_perm (w4[1], w4[0], selector);
      c2[1] = hc_byte_perm (w4[0], w3[3], selector);
      c2[0] = hc_byte_perm (w3[3], w3[2], selector);
      c1[3] = hc_byte_perm (w3[2], w3[1], selector);
      c1[2] = hc_byte_perm (w3[1], w3[0], selector);
      c1[1] = hc_byte_perm (w3[0], w2[3], selector);
      c1[0] = hc_byte_perm (w2[3], w2[2], selector);
      c0[3] = hc_byte_perm (w2[2], w2[1], selector);
      c0[2] = hc_byte_perm (w2[1], w2[0], selector);
      c0[1] = hc_byte_perm (w2[0], w1[3], selector);
      c0[0] = hc_byte_perm (w1[3], w1[2], selector);
      w7[3] = hc_byte_perm (w1[2], w1[1], selector);
      w7[2] = hc_byte_perm (w1[1], w1[0], selector);
      w7[1] = hc_byte_perm (w1[0], w0[3], selector);
      w7[0] = hc_byte_perm (w0[3], w0[2], selector);
      w6[3] = hc_byte_perm (w0[2], w0[1], selector);
      w6[2] = hc_byte_perm (w0[1], w0[0], selector);
      w6[1] = hc_byte_perm (w0[0],     0, selector);
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 26:
      c6[2] = hc_byte_perm (    0, w7[3], selector);
      c6[1] = hc_byte_perm (w7[3], w7[2], selector);
      c6[0] = hc_byte_perm (w7[2], w7[1], selector);
      c5[3] = hc_byte_perm (w7[1], w7[0], selector);
      c5[2] = hc_byte_perm (w7[0], w6[3], selector);
      c5[1] = hc_byte_perm (w6[3], w6[2], selector);
      c5[0] = hc_byte_perm (w6[2], w6[1], selector);
      c4[3] = hc_byte_perm (w6[1], w6[0], selector);
      c4[2] = hc_byte_perm (w6[0], w5[3], selector);
      c4[1] = hc_byte_perm (w5[3], w5[2], selector);
      c4[0] = hc_byte_perm (w5[2], w5[1], selector);
      c3[3] = hc_byte_perm (w5[1], w5[0], selector);
      c3[2] = hc_byte_perm (w5[0], w4[3], selector);
      c3[1] = hc_byte_perm (w4[3], w4[2], selector);
      c3[0] = hc_byte_perm (w4[2], w4[1], selector);
      c2[3] = hc_byte_perm (w4[1], w4[0], selector);
      c2[2] = hc_byte_perm (w4[0], w3[3], selector);
      c2[1] = hc_byte_perm (w3[3], w3[2], selector);
      c2[0] = hc_byte_perm (w3[2], w3[1], selector);
      c1[3] = hc_byte_perm (w3[1], w3[0], selector);
      c1[2] = hc_byte_perm (w3[0], w2[3], selector);
      c1[1] = hc_byte_perm (w2[3], w2[2], selector);
      c1[0] = hc_byte_perm (w2[2], w2[1], selector);
      c0[3] = hc_byte_perm (w2[1], w2[0], selector);
      c0[2] = hc_byte_perm (w2[0], w1[3], selector);
      c0[1] = hc_byte_perm (w1[3], w1[2], selector);
      c0[0] = hc_byte_perm (w1[2], w1[1], selector);
      w7[3] = hc_byte_perm (w1[1], w1[0], selector);
      w7[2] = hc_byte_perm (w1[0], w0[3], selector);
      w7[1] = hc_byte_perm (w0[3], w0[2], selector);
      w7[0] = hc_byte_perm (w0[2], w0[1], selector);
      w6[3] = hc_byte_perm (w0[1], w0[0], selector);
      w6[2] = hc_byte_perm (w0[0],     0, selector);
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 27:
      c6[3] = hc_byte_perm (    0, w7[3], selector);
      c6[2] = hc_byte_perm (w7[3], w7[2], selector);
      c6[1] = hc_byte_perm (w7[2], w7[1], selector);
      c6[0] = hc_byte_perm (w7[1], w7[0], selector);
      c5[3] = hc_byte_perm (w7[0], w6[3], selector);
      c5[2] = hc_byte_perm (w6[3], w6[2], selector);
      c5[1] = hc_byte_perm (w6[2], w6[1], selector);
      c5[0] = hc_byte_perm (w6[1], w6[0], selector);
      c4[3] = hc_byte_perm (w6[0], w5[3], selector);
      c4[2] = hc_byte_perm (w5[3], w5[2], selector);
      c4[1] = hc_byte_perm (w5[2], w5[1], selector);
      c4[0] = hc_byte_perm (w5[1], w5[0], selector);
      c3[3] = hc_byte_perm (w5[0], w4[3], selector);
      c3[2] = hc_byte_perm (w4[3], w4[2], selector);
      c3[1] = hc_byte_perm (w4[2], w4[1], selector);
      c3[0] = hc_byte_perm (w4[1], w4[0], selector);
      c2[3] = hc_byte_perm (w4[0], w3[3], selector);
      c2[2] = hc_byte_perm (w3[3], w3[2], selector);
      c2[1] = hc_byte_perm (w3[2], w3[1], selector);
      c2[0] = hc_byte_perm (w3[1], w3[0], selector);
      c1[3] = hc_byte_perm (w3[0], w2[3], selector);
      c1[2] = hc_byte_perm (w2[3], w2[2], selector);
      c1[1] = hc_byte_perm (w2[2], w2[1], selector);
      c1[0] = hc_byte_perm (w2[1], w2[0], selector);
      c0[3] = hc_byte_perm (w2[0], w1[3], selector);
      c0[2] = hc_byte_perm (w1[3], w1[2], selector);
      c0[1] = hc_byte_perm (w1[2], w1[1], selector);
      c0[0] = hc_byte_perm (w1[1], w1[0], selector);
      w7[3] = hc_byte_perm (w1[0], w0[3], selector);
      w7[2] = hc_byte_perm (w0[3], w0[2], selector);
      w7[1] = hc_byte_perm (w0[2], w0[1], selector);
      w7[0] = hc_byte_perm (w0[1], w0[0], selector);
      w6[3] = hc_byte_perm (w0[0],     0, selector);
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 28:
      c7[0] = hc_byte_perm (    0, w7[3], selector);
      c6[3] = hc_byte_perm (w7[3], w7[2], selector);
      c6[2] = hc_byte_perm (w7[2], w7[1], selector);
      c6[1] = hc_byte_perm (w7[1], w7[0], selector);
      c6[0] = hc_byte_perm (w7[0], w6[3], selector);
      c5[3] = hc_byte_perm (w6[3], w6[2], selector);
      c5[2] = hc_byte_perm (w6[2], w6[1], selector);
      c5[1] = hc_byte_perm (w6[1], w6[0], selector);
      c5[0] = hc_byte_perm (w6[0], w5[3], selector);
      c4[3] = hc_byte_perm (w5[3], w5[2], selector);
      c4[2] = hc_byte_perm (w5[2], w5[1], selector);
      c4[1] = hc_byte_perm (w5[1], w5[0], selector);
      c4[0] = hc_byte_perm (w5[0], w4[3], selector);
      c3[3] = hc_byte_perm (w4[3], w4[2], selector);
      c3[2] = hc_byte_perm (w4[2], w4[1], selector);
      c3[1] = hc_byte_perm (w4[1], w4[0], selector);
      c3[0] = hc_byte_perm (w4[0], w3[3], selector);
      c2[3] = hc_byte_perm (w3[3], w3[2], selector);
      c2[2] = hc_byte_perm (w3[2], w3[1], selector);
      c2[1] = hc_byte_perm (w3[1], w3[0], selector);
      c2[0] = hc_byte_perm (w3[0], w2[3], selector);
      c1[3] = hc_byte_perm (w2[3], w2[2], selector);
      c1[2] = hc_byte_perm (w2[2], w2[1], selector);
      c1[1] = hc_byte_perm (w2[1], w2[0], selector);
      c1[0] = hc_byte_perm (w2[0], w1[3], selector);
      c0[3] = hc_byte_perm (w1[3], w1[2], selector);
      c0[2] = hc_byte_perm (w1[2], w1[1], selector);
      c0[1] = hc_byte_perm (w1[1], w1[0], selector);
      c0[0] = hc_byte_perm (w1[0], w0[3], selector);
      w7[3] = hc_byte_perm (w0[3], w0[2], selector);
      w7[2] = hc_byte_perm (w0[2], w0[1], selector);
      w7[1] = hc_byte_perm (w0[1], w0[0], selector);
      w7[0] = hc_byte_perm (w0[0],     0, selector);
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 29:
      c7[1] = hc_byte_perm (    0, w7[3], selector);
      c7[0] = hc_byte_perm (w7[3], w7[2], selector);
      c6[3] = hc_byte_perm (w7[2], w7[1], selector);
      c6[2] = hc_byte_perm (w7[1], w7[0], selector);
      c6[1] = hc_byte_perm (w7[0], w6[3], selector);
      c6[0] = hc_byte_perm (w6[3], w6[2], selector);
      c5[3] = hc_byte_perm (w6[2], w6[1], selector);
      c5[2] = hc_byte_perm (w6[1], w6[0], selector);
      c5[1] = hc_byte_perm (w6[0], w5[3], selector);
      c5[0] = hc_byte_perm (w5[3], w5[2], selector);
      c4[3] = hc_byte_perm (w5[2], w5[1], selector);
      c4[2] = hc_byte_perm (w5[1], w5[0], selector);
      c4[1] = hc_byte_perm (w5[0], w4[3], selector);
      c4[0] = hc_byte_perm (w4[3], w4[2], selector);
      c3[3] = hc_byte_perm (w4[2], w4[1], selector);
      c3[2] = hc_byte_perm (w4[1], w4[0], selector);
      c3[1] = hc_byte_perm (w4[0], w3[3], selector);
      c3[0] = hc_byte_perm (w3[3], w3[2], selector);
      c2[3] = hc_byte_perm (w3[2], w3[1], selector);
      c2[2] = hc_byte_perm (w3[1], w3[0], selector);
      c2[1] = hc_byte_perm (w3[0], w2[3], selector);
      c2[0] = hc_byte_perm (w2[3], w2[2], selector);
      c1[3] = hc_byte_perm (w2[2], w2[1], selector);
      c1[2] = hc_byte_perm (w2[1], w2[0], selector);
      c1[1] = hc_byte_perm (w2[0], w1[3], selector);
      c1[0] = hc_byte_perm (w1[3], w1[2], selector);
      c0[3] = hc_byte_perm (w1[2], w1[1], selector);
      c0[2] = hc_byte_perm (w1[1], w1[0], selector);
      c0[1] = hc_byte_perm (w1[0], w0[3], selector);
      c0[0] = hc_byte_perm (w0[3], w0[2], selector);
      w7[3] = hc_byte_perm (w0[2], w0[1], selector);
      w7[2] = hc_byte_perm (w0[1], w0[0], selector);
      w7[1] = hc_byte_perm (w0[0],     0, selector);
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 30:
      c7[2] = hc_byte_perm (    0, w7[3], selector);
      c7[1] = hc_byte_perm (w7[3], w7[2], selector);
      c7[0] = hc_byte_perm (w7[2], w7[1], selector);
      c6[3] = hc_byte_perm (w7[1], w7[0], selector);
      c6[2] = hc_byte_perm (w7[0], w6[3], selector);
      c6[1] = hc_byte_perm (w6[3], w6[2], selector);
      c6[0] = hc_byte_perm (w6[2], w6[1], selector);
      c5[3] = hc_byte_perm (w6[1], w6[0], selector);
      c5[2] = hc_byte_perm (w6[0], w5[3], selector);
      c5[1] = hc_byte_perm (w5[3], w5[2], selector);
      c5[0] = hc_byte_perm (w5[2], w5[1], selector);
      c4[3] = hc_byte_perm (w5[1], w5[0], selector);
      c4[2] = hc_byte_perm (w5[0], w4[3], selector);
      c4[1] = hc_byte_perm (w4[3], w4[2], selector);
      c4[0] = hc_byte_perm (w4[2], w4[1], selector);
      c3[3] = hc_byte_perm (w4[1], w4[0], selector);
      c3[2] = hc_byte_perm (w4[0], w3[3], selector);
      c3[1] = hc_byte_perm (w3[3], w3[2], selector);
      c3[0] = hc_byte_perm (w3[2], w3[1], selector);
      c2[3] = hc_byte_perm (w3[1], w3[0], selector);
      c2[2] = hc_byte_perm (w3[0], w2[3], selector);
      c2[1] = hc_byte_perm (w2[3], w2[2], selector);
      c2[0] = hc_byte_perm (w2[2], w2[1], selector);
      c1[3] = hc_byte_perm (w2[1], w2[0], selector);
      c1[2] = hc_byte_perm (w2[0], w1[3], selector);
      c1[1] = hc_byte_perm (w1[3], w1[2], selector);
      c1[0] = hc_byte_perm (w1[2], w1[1], selector);
      c0[3] = hc_byte_perm (w1[1], w1[0], selector);
      c0[2] = hc_byte_perm (w1[0], w0[3], selector);
      c0[1] = hc_byte_perm (w0[3], w0[2], selector);
      c0[0] = hc_byte_perm (w0[2], w0[1], selector);
      w7[3] = hc_byte_perm (w0[1], w0[0], selector);
      w7[2] = hc_byte_perm (w0[0],     0, selector);
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 31:
      c7[3] = hc_byte_perm (    0, w7[3], selector);
      c7[2] = hc_byte_perm (w7[3], w7[2], selector);
      c7[1] = hc_byte_perm (w7[2], w7[1], selector);
      c7[0] = hc_byte_perm (w7[1], w7[0], selector);
      c6[3] = hc_byte_perm (w7[0], w6[3], selector);
      c6[2] = hc_byte_perm (w6[3], w6[2], selector);
      c6[1] = hc_byte_perm (w6[2], w6[1], selector);
      c6[0] = hc_byte_perm (w6[1], w6[0], selector);
      c5[3] = hc_byte_perm (w6[0], w5[3], selector);
      c5[2] = hc_byte_perm (w5[3], w5[2], selector);
      c5[1] = hc_byte_perm (w5[2], w5[1], selector);
      c5[0] = hc_byte_perm (w5[1], w5[0], selector);
      c4[3] = hc_byte_perm (w5[0], w4[3], selector);
      c4[2] = hc_byte_perm (w4[3], w4[2], selector);
      c4[1] = hc_byte_perm (w4[2], w4[1], selector);
      c4[0] = hc_byte_perm (w4[1], w4[0], selector);
      c3[3] = hc_byte_perm (w4[0], w3[3], selector);
      c3[2] = hc_byte_perm (w3[3], w3[2], selector);
      c3[1] = hc_byte_perm (w3[2], w3[1], selector);
      c3[0] = hc_byte_perm (w3[1], w3[0], selector);
      c2[3] = hc_byte_perm (w3[0], w2[3], selector);
      c2[2] = hc_byte_perm (w2[3], w2[2], selector);
      c2[1] = hc_byte_perm (w2[2], w2[1], selector);
      c2[0] = hc_byte_perm (w2[1], w2[0], selector);
      c1[3] = hc_byte_perm (w2[0], w1[3], selector);
      c1[2] = hc_byte_perm (w1[3], w1[2], selector);
      c1[1] = hc_byte_perm (w1[2], w1[1], selector);
      c1[0] = hc_byte_perm (w1[1], w1[0], selector);
      c0[3] = hc_byte_perm (w1[0], w0[3], selector);
      c0[2] = hc_byte_perm (w0[3], w0[2], selector);
      c0[1] = hc_byte_perm (w0[2], w0[1], selector);
      c0[0] = hc_byte_perm (w0[1], w0[0], selector);
      w7[3] = hc_byte_perm (w0[0],     0, selector);
      w7[2] = 0;
      w7[1] = 0;
      w7[0] = 0;
      w6[3] = 0;
      w6[2] = 0;
      w6[1] = 0;
      w6[0] = 0;
      w5[3] = 0;
      w5[2] = 0;
      w5[1] = 0;
      w5[0] = 0;
      w4[3] = 0;
      w4[2] = 0;
      w4[1] = 0;
      w4[0] = 0;
      w3[3] = 0;
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      w[63] = hc_bytealign (w[62], w[63], offset);
      w[62] = hc_bytealign (w[61], w[62], offset);
      w[61] = hc_bytealign (w[60], w[61], offset);
      w[60] = hc_bytealign (w[59], w[60], offset);
      w[59] = hc_bytealign (w[58], w[59], offset);
      w[58] = hc_bytealign (w[57], w[58], offset);
      w[57] = hc_bytealign (w[56], w[57], offset);
      w[56] = hc_bytealign (w[55], w[56], offset);
      w[55] = hc_bytealign (w[54], w[55], offset);
      w[54] = hc_bytealign (w[53], w[54], offset);
      w[53] = hc_bytealign (w[52], w[53], offset);
      w[52] = hc_bytealign (w[51], w[52], offset);
      w[51] = hc_bytealign (w[50], w[51], offset);
      w[50] = hc_bytealign (w[49], w[50], offset);
      w[49] = hc_bytealign (w[48], w[49], offset);
      w[48] = hc_bytealign (w[47], w[48], offset);
      w[47] = hc_bytealign (w[46], w[47], offset);
      w[46] = hc_bytealign (w[45], w[46], offset);
      w[45] = hc_bytealign (w[44], w[45], offset);
      w[44] = hc_bytealign (w[43], w[44], offset);
      w[43] = hc_bytealign (w[42], w[43], offset);
      w[42] = hc_bytealign (w[41], w[42], offset);
      w[41] = hc_bytealign (w[40], w[41], offset);
      w[40] = hc_bytealign (w[39], w[40], offset);
      w[39] = hc_bytealign (w[38], w[39], offset);
      w[38] = hc_bytealign (w[37], w[38], offset);
      w[37] = hc_bytealign (w[36], w[37], offset);
      w[36] = hc_bytealign (w[35], w[36], offset);
      w[35] = hc_bytealign (w[34], w[35], offset);
      w[34] = hc_bytealign (w[33], w[34], offset);
      w[33] = hc_bytealign (w[32], w[33], offset);
      w[32] = hc_bytealign (w[31], w[32], offset);
      w[31] = hc_bytealign (w[30], w[31], offset);
      w[30] = hc_bytealign (w[29], w[30], offset);
      w[29] = hc_bytealign (w[28], w[29], offset);
      w[28] = hc_bytealign (w[27], w[28], offset);
      w[27] = hc_bytealign (w[26], w[27], offset);
      w[26] = hc_bytealign (w[25], w[26], offset);
      w[25] = hc_bytealign (w[24], w[25], offset);
      w[24] = hc_bytealign (w[23], w[24], offset);
      w[23] = hc_bytealign (w[22], w[23], offset);
      w[22] = hc_bytealign (w[21], w[22], offset);
      w[21] = hc_bytealign (w[20], w[21], offset);
      w[20] = hc_bytealign (w[19], w[20], offset);
      w[19] = hc_bytealign (w[18], w[19], offset);
      w[18] = hc_bytealign (w[17], w[18], offset);
      w[17] = hc_bytealign (w[16], w[17], offset);
      w[16] = hc_bytealign (w[15], w[16], offset);
      w[15] = hc_bytealign (w[14], w[15], offset);
      w[14] = hc_bytealign (w[13], w[14], offset);
      w[13] = hc_bytealign (w[12], w[13], offset);
      w[12] = hc_bytealign (w[11], w[12], offset);
      w[11] = hc_bytealign (w[10], w[11], offset);
      w[10] = hc_bytealign (w[ 9], w[10], offset);
      w[ 9] = hc_bytealign (w[ 8], w[ 9], offset);
      w[ 8] = hc_bytealign (w[ 7], w[ 8], offset);
      w[ 7] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 6] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 5] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 4] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 3] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 2] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 1] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 0] = hc_bytealign (    0, w[ 0], offset);

      break;

    case  1:
      w[63] = hc_bytealign (w[61], w[62], offset);
      w[62] = hc_bytealign (w[60], w[61], offset);
      w[61] = hc_bytealign (w[59], w[60], offset);
      w[60] = hc_bytealign (w[58], w[59], offset);
      w[59] = hc_bytealign (w[57], w[58], offset);
      w[58] = hc_bytealign (w[56], w[57], offset);
      w[57] = hc_bytealign (w[55], w[56], offset);
      w[56] = hc_bytealign (w[54], w[55], offset);
      w[55] = hc_bytealign (w[53], w[54], offset);
      w[54] = hc_bytealign (w[52], w[53], offset);
      w[53] = hc_bytealign (w[51], w[52], offset);
      w[52] = hc_bytealign (w[50], w[51], offset);
      w[51] = hc_bytealign (w[49], w[50], offset);
      w[50] = hc_bytealign (w[48], w[49], offset);
      w[49] = hc_bytealign (w[47], w[48], offset);
      w[48] = hc_bytealign (w[46], w[47], offset);
      w[47] = hc_bytealign (w[45], w[46], offset);
      w[46] = hc_bytealign (w[44], w[45], offset);
      w[45] = hc_bytealign (w[43], w[44], offset);
      w[44] = hc_bytealign (w[42], w[43], offset);
      w[43] = hc_bytealign (w[41], w[42], offset);
      w[42] = hc_bytealign (w[40], w[41], offset);
      w[41] = hc_bytealign (w[39], w[40], offset);
      w[40] = hc_bytealign (w[38], w[39], offset);
      w[39] = hc_bytealign (w[37], w[38], offset);
      w[38] = hc_bytealign (w[36], w[37], offset);
      w[37] = hc_bytealign (w[35], w[36], offset);
      w[36] = hc_bytealign (w[34], w[35], offset);
      w[35] = hc_bytealign (w[33], w[34], offset);
      w[34] = hc_bytealign (w[32], w[33], offset);
      w[33] = hc_bytealign (w[31], w[32], offset);
      w[32] = hc_bytealign (w[30], w[31], offset);
      w[31] = hc_bytealign (w[29], w[30], offset);
      w[30] = hc_bytealign (w[28], w[29], offset);
      w[29] = hc_bytealign (w[27], w[28], offset);
      w[28] = hc_bytealign (w[26], w[27], offset);
      w[27] = hc_bytealign (w[25], w[26], offset);
      w[26] = hc_bytealign (w[24], w[25], offset);
      w[25] = hc_bytealign (w[23], w[24], offset);
      w[24] = hc_bytealign (w[22], w[23], offset);
      w[23] = hc_bytealign (w[21], w[22], offset);
      w[22] = hc_bytealign (w[20], w[21], offset);
      w[21] = hc_bytealign (w[19], w[20], offset);
      w[20] = hc_bytealign (w[18], w[19], offset);
      w[19] = hc_bytealign (w[17], w[18], offset);
      w[18] = hc_bytealign (w[16], w[17], offset);
      w[17] = hc_bytealign (w[15], w[16], offset);
      w[16] = hc_bytealign (w[14], w[15], offset);
      w[15] = hc_bytealign (w[13], w[14], offset);
      w[14] = hc_bytealign (w[12], w[13], offset);
      w[13] = hc_bytealign (w[11], w[12], offset);
      w[12] = hc_bytealign (w[10], w[11], offset);
      w[11] = hc_bytealign (w[ 9], w[10], offset);
      w[10] = hc_bytealign (w[ 8], w[ 9], offset);
      w[ 9] = hc_bytealign (w[ 7], w[ 8], offset);
      w[ 8] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 7] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 6] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 5] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 4] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 3] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 2] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 1] = hc_bytealign (    0, w[ 0], offset);
      w[ 0] = 0;

      break;

    case  2:
      w[63] = hc_bytealign (w[60], w[61], offset);
      w[62] = hc_bytealign (w[59], w[60], offset);
      w[61] = hc_bytealign (w[58], w[59], offset);
      w[60] = hc_bytealign (w[57], w[58], offset);
      w[59] = hc_bytealign (w[56], w[57], offset);
      w[58] = hc_bytealign (w[55], w[56], offset);
      w[57] = hc_bytealign (w[54], w[55], offset);
      w[56] = hc_bytealign (w[53], w[54], offset);
      w[55] = hc_bytealign (w[52], w[53], offset);
      w[54] = hc_bytealign (w[51], w[52], offset);
      w[53] = hc_bytealign (w[50], w[51], offset);
      w[52] = hc_bytealign (w[49], w[50], offset);
      w[51] = hc_bytealign (w[48], w[49], offset);
      w[50] = hc_bytealign (w[47], w[48], offset);
      w[49] = hc_bytealign (w[46], w[47], offset);
      w[48] = hc_bytealign (w[45], w[46], offset);
      w[47] = hc_bytealign (w[44], w[45], offset);
      w[46] = hc_bytealign (w[43], w[44], offset);
      w[45] = hc_bytealign (w[42], w[43], offset);
      w[44] = hc_bytealign (w[41], w[42], offset);
      w[43] = hc_bytealign (w[40], w[41], offset);
      w[42] = hc_bytealign (w[39], w[40], offset);
      w[41] = hc_bytealign (w[38], w[39], offset);
      w[40] = hc_bytealign (w[37], w[38], offset);
      w[39] = hc_bytealign (w[36], w[37], offset);
      w[38] = hc_bytealign (w[35], w[36], offset);
      w[37] = hc_bytealign (w[34], w[35], offset);
      w[36] = hc_bytealign (w[33], w[34], offset);
      w[35] = hc_bytealign (w[32], w[33], offset);
      w[34] = hc_bytealign (w[31], w[32], offset);
      w[33] = hc_bytealign (w[30], w[31], offset);
      w[32] = hc_bytealign (w[29], w[30], offset);
      w[31] = hc_bytealign (w[28], w[29], offset);
      w[30] = hc_bytealign (w[27], w[28], offset);
      w[29] = hc_bytealign (w[26], w[27], offset);
      w[28] = hc_bytealign (w[25], w[26], offset);
      w[27] = hc_bytealign (w[24], w[25], offset);
      w[26] = hc_bytealign (w[23], w[24], offset);
      w[25] = hc_bytealign (w[22], w[23], offset);
      w[24] = hc_bytealign (w[21], w[22], offset);
      w[23] = hc_bytealign (w[20], w[21], offset);
      w[22] = hc_bytealign (w[19], w[20], offset);
      w[21] = hc_bytealign (w[18], w[19], offset);
      w[20] = hc_bytealign (w[17], w[18], offset);
      w[19] = hc_bytealign (w[16], w[17], offset);
      w[18] = hc_bytealign (w[15], w[16], offset);
      w[17] = hc_bytealign (w[14], w[15], offset);
      w[16] = hc_bytealign (w[13], w[14], offset);
      w[15] = hc_bytealign (w[12], w[13], offset);
      w[14] = hc_bytealign (w[11], w[12], offset);
      w[13] = hc_bytealign (w[10], w[11], offset);
      w[12] = hc_bytealign (w[ 9], w[10], offset);
      w[11] = hc_bytealign (w[ 8], w[ 9], offset);
      w[10] = hc_bytealign (w[ 7], w[ 8], offset);
      w[ 9] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 8] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 7] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 6] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 5] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 4] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 3] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 2] = hc_bytealign (    0, w[ 0], offset);
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  3:
      w[63] = hc_bytealign (w[59], w[60], offset);
      w[62] = hc_bytealign (w[58], w[59], offset);
      w[61] = hc_bytealign (w[57], w[58], offset);
      w[60] = hc_bytealign (w[56], w[57], offset);
      w[59] = hc_bytealign (w[55], w[56], offset);
      w[58] = hc_bytealign (w[54], w[55], offset);
      w[57] = hc_bytealign (w[53], w[54], offset);
      w[56] = hc_bytealign (w[52], w[53], offset);
      w[55] = hc_bytealign (w[51], w[52], offset);
      w[54] = hc_bytealign (w[50], w[51], offset);
      w[53] = hc_bytealign (w[49], w[50], offset);
      w[52] = hc_bytealign (w[48], w[49], offset);
      w[51] = hc_bytealign (w[47], w[48], offset);
      w[50] = hc_bytealign (w[46], w[47], offset);
      w[49] = hc_bytealign (w[45], w[46], offset);
      w[48] = hc_bytealign (w[44], w[45], offset);
      w[47] = hc_bytealign (w[43], w[44], offset);
      w[46] = hc_bytealign (w[42], w[43], offset);
      w[45] = hc_bytealign (w[41], w[42], offset);
      w[44] = hc_bytealign (w[40], w[41], offset);
      w[43] = hc_bytealign (w[39], w[40], offset);
      w[42] = hc_bytealign (w[38], w[39], offset);
      w[41] = hc_bytealign (w[37], w[38], offset);
      w[40] = hc_bytealign (w[36], w[37], offset);
      w[39] = hc_bytealign (w[35], w[36], offset);
      w[38] = hc_bytealign (w[34], w[35], offset);
      w[37] = hc_bytealign (w[33], w[34], offset);
      w[36] = hc_bytealign (w[32], w[33], offset);
      w[35] = hc_bytealign (w[31], w[32], offset);
      w[34] = hc_bytealign (w[30], w[31], offset);
      w[33] = hc_bytealign (w[29], w[30], offset);
      w[32] = hc_bytealign (w[28], w[29], offset);
      w[31] = hc_bytealign (w[27], w[28], offset);
      w[30] = hc_bytealign (w[26], w[27], offset);
      w[29] = hc_bytealign (w[25], w[26], offset);
      w[28] = hc_bytealign (w[24], w[25], offset);
      w[27] = hc_bytealign (w[23], w[24], offset);
      w[26] = hc_bytealign (w[22], w[23], offset);
      w[25] = hc_bytealign (w[21], w[22], offset);
      w[24] = hc_bytealign (w[20], w[21], offset);
      w[23] = hc_bytealign (w[19], w[20], offset);
      w[22] = hc_bytealign (w[18], w[19], offset);
      w[21] = hc_bytealign (w[17], w[18], offset);
      w[20] = hc_bytealign (w[16], w[17], offset);
      w[19] = hc_bytealign (w[15], w[16], offset);
      w[18] = hc_bytealign (w[14], w[15], offset);
      w[17] = hc_bytealign (w[13], w[14], offset);
      w[16] = hc_bytealign (w[12], w[13], offset);
      w[15] = hc_bytealign (w[11], w[12], offset);
      w[14] = hc_bytealign (w[10], w[11], offset);
      w[13] = hc_bytealign (w[ 9], w[10], offset);
      w[12] = hc_bytealign (w[ 8], w[ 9], offset);
      w[11] = hc_bytealign (w[ 7], w[ 8], offset);
      w[10] = hc_bytealign (w[ 6], w[ 7], offset);
      w[ 9] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 8] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 7] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 6] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 5] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 4] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 3] = hc_bytealign (    0, w[ 0], offset);
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  4:
      w[63] = hc_bytealign (w[58], w[59], offset);
      w[62] = hc_bytealign (w[57], w[58], offset);
      w[61] = hc_bytealign (w[56], w[57], offset);
      w[60] = hc_bytealign (w[55], w[56], offset);
      w[59] = hc_bytealign (w[54], w[55], offset);
      w[58] = hc_bytealign (w[53], w[54], offset);
      w[57] = hc_bytealign (w[52], w[53], offset);
      w[56] = hc_bytealign (w[51], w[52], offset);
      w[55] = hc_bytealign (w[50], w[51], offset);
      w[54] = hc_bytealign (w[49], w[50], offset);
      w[53] = hc_bytealign (w[48], w[49], offset);
      w[52] = hc_bytealign (w[47], w[48], offset);
      w[51] = hc_bytealign (w[46], w[47], offset);
      w[50] = hc_bytealign (w[45], w[46], offset);
      w[49] = hc_bytealign (w[44], w[45], offset);
      w[48] = hc_bytealign (w[43], w[44], offset);
      w[47] = hc_bytealign (w[42], w[43], offset);
      w[46] = hc_bytealign (w[41], w[42], offset);
      w[45] = hc_bytealign (w[40], w[41], offset);
      w[44] = hc_bytealign (w[39], w[40], offset);
      w[43] = hc_bytealign (w[38], w[39], offset);
      w[42] = hc_bytealign (w[37], w[38], offset);
      w[41] = hc_bytealign (w[36], w[37], offset);
      w[40] = hc_bytealign (w[35], w[36], offset);
      w[39] = hc_bytealign (w[34], w[35], offset);
      w[38] = hc_bytealign (w[33], w[34], offset);
      w[37] = hc_bytealign (w[32], w[33], offset);
      w[36] = hc_bytealign (w[31], w[32], offset);
      w[35] = hc_bytealign (w[30], w[31], offset);
      w[34] = hc_bytealign (w[29], w[30], offset);
      w[33] = hc_bytealign (w[28], w[29], offset);
      w[32] = hc_bytealign (w[27], w[28], offset);
      w[31] = hc_bytealign (w[26], w[27], offset);
      w[30] = hc_bytealign (w[25], w[26], offset);
      w[29] = hc_bytealign (w[24], w[25], offset);
      w[28] = hc_bytealign (w[23], w[24], offset);
      w[27] = hc_bytealign (w[22], w[23], offset);
      w[26] = hc_bytealign (w[21], w[22], offset);
      w[25] = hc_bytealign (w[20], w[21], offset);
      w[24] = hc_bytealign (w[19], w[20], offset);
      w[23] = hc_bytealign (w[18], w[19], offset);
      w[22] = hc_bytealign (w[17], w[18], offset);
      w[21] = hc_bytealign (w[16], w[17], offset);
      w[20] = hc_bytealign (w[15], w[16], offset);
      w[19] = hc_bytealign (w[14], w[15], offset);
      w[18] = hc_bytealign (w[13], w[14], offset);
      w[17] = hc_bytealign (w[12], w[13], offset);
      w[16] = hc_bytealign (w[11], w[12], offset);
      w[15] = hc_bytealign (w[10], w[11], offset);
      w[14] = hc_bytealign (w[ 9], w[10], offset);
      w[13] = hc_bytealign (w[ 8], w[ 9], offset);
      w[12] = hc_bytealign (w[ 7], w[ 8], offset);
      w[11] = hc_bytealign (w[ 6], w[ 7], offset);
      w[10] = hc_bytealign (w[ 5], w[ 6], offset);
      w[ 9] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 8] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 7] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 6] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 5] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 4] = hc_bytealign (    0, w[ 0], offset);
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  5:
      w[63] = hc_bytealign (w[57], w[58], offset);
      w[62] = hc_bytealign (w[56], w[57], offset);
      w[61] = hc_bytealign (w[55], w[56], offset);
      w[60] = hc_bytealign (w[54], w[55], offset);
      w[59] = hc_bytealign (w[53], w[54], offset);
      w[58] = hc_bytealign (w[52], w[53], offset);
      w[57] = hc_bytealign (w[51], w[52], offset);
      w[56] = hc_bytealign (w[50], w[51], offset);
      w[55] = hc_bytealign (w[49], w[50], offset);
      w[54] = hc_bytealign (w[48], w[49], offset);
      w[53] = hc_bytealign (w[47], w[48], offset);
      w[52] = hc_bytealign (w[46], w[47], offset);
      w[51] = hc_bytealign (w[45], w[46], offset);
      w[50] = hc_bytealign (w[44], w[45], offset);
      w[49] = hc_bytealign (w[43], w[44], offset);
      w[48] = hc_bytealign (w[42], w[43], offset);
      w[47] = hc_bytealign (w[41], w[42], offset);
      w[46] = hc_bytealign (w[40], w[41], offset);
      w[45] = hc_bytealign (w[39], w[40], offset);
      w[44] = hc_bytealign (w[38], w[39], offset);
      w[43] = hc_bytealign (w[37], w[38], offset);
      w[42] = hc_bytealign (w[36], w[37], offset);
      w[41] = hc_bytealign (w[35], w[36], offset);
      w[40] = hc_bytealign (w[34], w[35], offset);
      w[39] = hc_bytealign (w[33], w[34], offset);
      w[38] = hc_bytealign (w[32], w[33], offset);
      w[37] = hc_bytealign (w[31], w[32], offset);
      w[36] = hc_bytealign (w[30], w[31], offset);
      w[35] = hc_bytealign (w[29], w[30], offset);
      w[34] = hc_bytealign (w[28], w[29], offset);
      w[33] = hc_bytealign (w[27], w[28], offset);
      w[32] = hc_bytealign (w[26], w[27], offset);
      w[31] = hc_bytealign (w[25], w[26], offset);
      w[30] = hc_bytealign (w[24], w[25], offset);
      w[29] = hc_bytealign (w[23], w[24], offset);
      w[28] = hc_bytealign (w[22], w[23], offset);
      w[27] = hc_bytealign (w[21], w[22], offset);
      w[26] = hc_bytealign (w[20], w[21], offset);
      w[25] = hc_bytealign (w[19], w[20], offset);
      w[24] = hc_bytealign (w[18], w[19], offset);
      w[23] = hc_bytealign (w[17], w[18], offset);
      w[22] = hc_bytealign (w[16], w[17], offset);
      w[21] = hc_bytealign (w[15], w[16], offset);
      w[20] = hc_bytealign (w[14], w[15], offset);
      w[19] = hc_bytealign (w[13], w[14], offset);
      w[18] = hc_bytealign (w[12], w[13], offset);
      w[17] = hc_bytealign (w[11], w[12], offset);
      w[16] = hc_bytealign (w[10], w[11], offset);
      w[15] = hc_bytealign (w[ 9], w[10], offset);
      w[14] = hc_bytealign (w[ 8], w[ 9], offset);
      w[13] = hc_bytealign (w[ 7], w[ 8], offset);
      w[12] = hc_bytealign (w[ 6], w[ 7], offset);
      w[11] = hc_bytealign (w[ 5], w[ 6], offset);
      w[10] = hc_bytealign (w[ 4], w[ 5], offset);
      w[ 9] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 8] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 7] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 6] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 5] = hc_bytealign (    0, w[ 0], offset);
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  6:
      w[63] = hc_bytealign (w[56], w[57], offset);
      w[62] = hc_bytealign (w[55], w[56], offset);
      w[61] = hc_bytealign (w[54], w[55], offset);
      w[60] = hc_bytealign (w[53], w[54], offset);
      w[59] = hc_bytealign (w[52], w[53], offset);
      w[58] = hc_bytealign (w[51], w[52], offset);
      w[57] = hc_bytealign (w[50], w[51], offset);
      w[56] = hc_bytealign (w[49], w[50], offset);
      w[55] = hc_bytealign (w[48], w[49], offset);
      w[54] = hc_bytealign (w[47], w[48], offset);
      w[53] = hc_bytealign (w[46], w[47], offset);
      w[52] = hc_bytealign (w[45], w[46], offset);
      w[51] = hc_bytealign (w[44], w[45], offset);
      w[50] = hc_bytealign (w[43], w[44], offset);
      w[49] = hc_bytealign (w[42], w[43], offset);
      w[48] = hc_bytealign (w[41], w[42], offset);
      w[47] = hc_bytealign (w[40], w[41], offset);
      w[46] = hc_bytealign (w[39], w[40], offset);
      w[45] = hc_bytealign (w[38], w[39], offset);
      w[44] = hc_bytealign (w[37], w[38], offset);
      w[43] = hc_bytealign (w[36], w[37], offset);
      w[42] = hc_bytealign (w[35], w[36], offset);
      w[41] = hc_bytealign (w[34], w[35], offset);
      w[40] = hc_bytealign (w[33], w[34], offset);
      w[39] = hc_bytealign (w[32], w[33], offset);
      w[38] = hc_bytealign (w[31], w[32], offset);
      w[37] = hc_bytealign (w[30], w[31], offset);
      w[36] = hc_bytealign (w[29], w[30], offset);
      w[35] = hc_bytealign (w[28], w[29], offset);
      w[34] = hc_bytealign (w[27], w[28], offset);
      w[33] = hc_bytealign (w[26], w[27], offset);
      w[32] = hc_bytealign (w[25], w[26], offset);
      w[31] = hc_bytealign (w[24], w[25], offset);
      w[30] = hc_bytealign (w[23], w[24], offset);
      w[29] = hc_bytealign (w[22], w[23], offset);
      w[28] = hc_bytealign (w[21], w[22], offset);
      w[27] = hc_bytealign (w[20], w[21], offset);
      w[26] = hc_bytealign (w[19], w[20], offset);
      w[25] = hc_bytealign (w[18], w[19], offset);
      w[24] = hc_bytealign (w[17], w[18], offset);
      w[23] = hc_bytealign (w[16], w[17], offset);
      w[22] = hc_bytealign (w[15], w[16], offset);
      w[21] = hc_bytealign (w[14], w[15], offset);
      w[20] = hc_bytealign (w[13], w[14], offset);
      w[19] = hc_bytealign (w[12], w[13], offset);
      w[18] = hc_bytealign (w[11], w[12], offset);
      w[17] = hc_bytealign (w[10], w[11], offset);
      w[16] = hc_bytealign (w[ 9], w[10], offset);
      w[15] = hc_bytealign (w[ 8], w[ 9], offset);
      w[14] = hc_bytealign (w[ 7], w[ 8], offset);
      w[13] = hc_bytealign (w[ 6], w[ 7], offset);
      w[12] = hc_bytealign (w[ 5], w[ 6], offset);
      w[11] = hc_bytealign (w[ 4], w[ 5], offset);
      w[10] = hc_bytealign (w[ 3], w[ 4], offset);
      w[ 9] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 8] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 7] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 6] = hc_bytealign (    0, w[ 0], offset);
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  7:
      w[63] = hc_bytealign (w[55], w[56], offset);
      w[62] = hc_bytealign (w[54], w[55], offset);
      w[61] = hc_bytealign (w[53], w[54], offset);
      w[60] = hc_bytealign (w[52], w[53], offset);
      w[59] = hc_bytealign (w[51], w[52], offset);
      w[58] = hc_bytealign (w[50], w[51], offset);
      w[57] = hc_bytealign (w[49], w[50], offset);
      w[56] = hc_bytealign (w[48], w[49], offset);
      w[55] = hc_bytealign (w[47], w[48], offset);
      w[54] = hc_bytealign (w[46], w[47], offset);
      w[53] = hc_bytealign (w[45], w[46], offset);
      w[52] = hc_bytealign (w[44], w[45], offset);
      w[51] = hc_bytealign (w[43], w[44], offset);
      w[50] = hc_bytealign (w[42], w[43], offset);
      w[49] = hc_bytealign (w[41], w[42], offset);
      w[48] = hc_bytealign (w[40], w[41], offset);
      w[47] = hc_bytealign (w[39], w[40], offset);
      w[46] = hc_bytealign (w[38], w[39], offset);
      w[45] = hc_bytealign (w[37], w[38], offset);
      w[44] = hc_bytealign (w[36], w[37], offset);
      w[43] = hc_bytealign (w[35], w[36], offset);
      w[42] = hc_bytealign (w[34], w[35], offset);
      w[41] = hc_bytealign (w[33], w[34], offset);
      w[40] = hc_bytealign (w[32], w[33], offset);
      w[39] = hc_bytealign (w[31], w[32], offset);
      w[38] = hc_bytealign (w[30], w[31], offset);
      w[37] = hc_bytealign (w[29], w[30], offset);
      w[36] = hc_bytealign (w[28], w[29], offset);
      w[35] = hc_bytealign (w[27], w[28], offset);
      w[34] = hc_bytealign (w[26], w[27], offset);
      w[33] = hc_bytealign (w[25], w[26], offset);
      w[32] = hc_bytealign (w[24], w[25], offset);
      w[31] = hc_bytealign (w[23], w[24], offset);
      w[30] = hc_bytealign (w[22], w[23], offset);
      w[29] = hc_bytealign (w[21], w[22], offset);
      w[28] = hc_bytealign (w[20], w[21], offset);
      w[27] = hc_bytealign (w[19], w[20], offset);
      w[26] = hc_bytealign (w[18], w[19], offset);
      w[25] = hc_bytealign (w[17], w[18], offset);
      w[24] = hc_bytealign (w[16], w[17], offset);
      w[23] = hc_bytealign (w[15], w[16], offset);
      w[22] = hc_bytealign (w[14], w[15], offset);
      w[21] = hc_bytealign (w[13], w[14], offset);
      w[20] = hc_bytealign (w[12], w[13], offset);
      w[19] = hc_bytealign (w[11], w[12], offset);
      w[18] = hc_bytealign (w[10], w[11], offset);
      w[17] = hc_bytealign (w[ 9], w[10], offset);
      w[16] = hc_bytealign (w[ 8], w[ 9], offset);
      w[15] = hc_bytealign (w[ 7], w[ 8], offset);
      w[14] = hc_bytealign (w[ 6], w[ 7], offset);
      w[13] = hc_bytealign (w[ 5], w[ 6], offset);
      w[12] = hc_bytealign (w[ 4], w[ 5], offset);
      w[11] = hc_bytealign (w[ 3], w[ 4], offset);
      w[10] = hc_bytealign (w[ 2], w[ 3], offset);
      w[ 9] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 8] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 7] = hc_bytealign (    0, w[ 0], offset);
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  8:
      w[63] = hc_bytealign (w[54], w[55], offset);
      w[62] = hc_bytealign (w[53], w[54], offset);
      w[61] = hc_bytealign (w[52], w[53], offset);
      w[60] = hc_bytealign (w[51], w[52], offset);
      w[59] = hc_bytealign (w[50], w[51], offset);
      w[58] = hc_bytealign (w[49], w[50], offset);
      w[57] = hc_bytealign (w[48], w[49], offset);
      w[56] = hc_bytealign (w[47], w[48], offset);
      w[55] = hc_bytealign (w[46], w[47], offset);
      w[54] = hc_bytealign (w[45], w[46], offset);
      w[53] = hc_bytealign (w[44], w[45], offset);
      w[52] = hc_bytealign (w[43], w[44], offset);
      w[51] = hc_bytealign (w[42], w[43], offset);
      w[50] = hc_bytealign (w[41], w[42], offset);
      w[49] = hc_bytealign (w[40], w[41], offset);
      w[48] = hc_bytealign (w[39], w[40], offset);
      w[47] = hc_bytealign (w[38], w[39], offset);
      w[46] = hc_bytealign (w[37], w[38], offset);
      w[45] = hc_bytealign (w[36], w[37], offset);
      w[44] = hc_bytealign (w[35], w[36], offset);
      w[43] = hc_bytealign (w[34], w[35], offset);
      w[42] = hc_bytealign (w[33], w[34], offset);
      w[41] = hc_bytealign (w[32], w[33], offset);
      w[40] = hc_bytealign (w[31], w[32], offset);
      w[39] = hc_bytealign (w[30], w[31], offset);
      w[38] = hc_bytealign (w[29], w[30], offset);
      w[37] = hc_bytealign (w[28], w[29], offset);
      w[36] = hc_bytealign (w[27], w[28], offset);
      w[35] = hc_bytealign (w[26], w[27], offset);
      w[34] = hc_bytealign (w[25], w[26], offset);
      w[33] = hc_bytealign (w[24], w[25], offset);
      w[32] = hc_bytealign (w[23], w[24], offset);
      w[31] = hc_bytealign (w[22], w[23], offset);
      w[30] = hc_bytealign (w[21], w[22], offset);
      w[29] = hc_bytealign (w[20], w[21], offset);
      w[28] = hc_bytealign (w[19], w[20], offset);
      w[27] = hc_bytealign (w[18], w[19], offset);
      w[26] = hc_bytealign (w[17], w[18], offset);
      w[25] = hc_bytealign (w[16], w[17], offset);
      w[24] = hc_bytealign (w[15], w[16], offset);
      w[23] = hc_bytealign (w[14], w[15], offset);
      w[22] = hc_bytealign (w[13], w[14], offset);
      w[21] = hc_bytealign (w[12], w[13], offset);
      w[20] = hc_bytealign (w[11], w[12], offset);
      w[19] = hc_bytealign (w[10], w[11], offset);
      w[18] = hc_bytealign (w[ 9], w[10], offset);
      w[17] = hc_bytealign (w[ 8], w[ 9], offset);
      w[16] = hc_bytealign (w[ 7], w[ 8], offset);
      w[15] = hc_bytealign (w[ 6], w[ 7], offset);
      w[14] = hc_bytealign (w[ 5], w[ 6], offset);
      w[13] = hc_bytealign (w[ 4], w[ 5], offset);
      w[12] = hc_bytealign (w[ 3], w[ 4], offset);
      w[11] = hc_bytealign (w[ 2], w[ 3], offset);
      w[10] = hc_bytealign (w[ 1], w[ 2], offset);
      w[ 9] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 8] = hc_bytealign (    0, w[ 0], offset);
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  9:
      w[63] = hc_bytealign (w[53], w[54], offset);
      w[62] = hc_bytealign (w[52], w[53], offset);
      w[61] = hc_bytealign (w[51], w[52], offset);
      w[60] = hc_bytealign (w[50], w[51], offset);
      w[59] = hc_bytealign (w[49], w[50], offset);
      w[58] = hc_bytealign (w[48], w[49], offset);
      w[57] = hc_bytealign (w[47], w[48], offset);
      w[56] = hc_bytealign (w[46], w[47], offset);
      w[55] = hc_bytealign (w[45], w[46], offset);
      w[54] = hc_bytealign (w[44], w[45], offset);
      w[53] = hc_bytealign (w[43], w[44], offset);
      w[52] = hc_bytealign (w[42], w[43], offset);
      w[51] = hc_bytealign (w[41], w[42], offset);
      w[50] = hc_bytealign (w[40], w[41], offset);
      w[49] = hc_bytealign (w[39], w[40], offset);
      w[48] = hc_bytealign (w[38], w[39], offset);
      w[47] = hc_bytealign (w[37], w[38], offset);
      w[46] = hc_bytealign (w[36], w[37], offset);
      w[45] = hc_bytealign (w[35], w[36], offset);
      w[44] = hc_bytealign (w[34], w[35], offset);
      w[43] = hc_bytealign (w[33], w[34], offset);
      w[42] = hc_bytealign (w[32], w[33], offset);
      w[41] = hc_bytealign (w[31], w[32], offset);
      w[40] = hc_bytealign (w[30], w[31], offset);
      w[39] = hc_bytealign (w[29], w[30], offset);
      w[38] = hc_bytealign (w[28], w[29], offset);
      w[37] = hc_bytealign (w[27], w[28], offset);
      w[36] = hc_bytealign (w[26], w[27], offset);
      w[35] = hc_bytealign (w[25], w[26], offset);
      w[34] = hc_bytealign (w[24], w[25], offset);
      w[33] = hc_bytealign (w[23], w[24], offset);
      w[32] = hc_bytealign (w[22], w[23], offset);
      w[31] = hc_bytealign (w[21], w[22], offset);
      w[30] = hc_bytealign (w[20], w[21], offset);
      w[29] = hc_bytealign (w[19], w[20], offset);
      w[28] = hc_bytealign (w[18], w[19], offset);
      w[27] = hc_bytealign (w[17], w[18], offset);
      w[26] = hc_bytealign (w[16], w[17], offset);
      w[25] = hc_bytealign (w[15], w[16], offset);
      w[24] = hc_bytealign (w[14], w[15], offset);
      w[23] = hc_bytealign (w[13], w[14], offset);
      w[22] = hc_bytealign (w[12], w[13], offset);
      w[21] = hc_bytealign (w[11], w[12], offset);
      w[20] = hc_bytealign (w[10], w[11], offset);
      w[19] = hc_bytealign (w[ 9], w[10], offset);
      w[18] = hc_bytealign (w[ 8], w[ 9], offset);
      w[17] = hc_bytealign (w[ 7], w[ 8], offset);
      w[16] = hc_bytealign (w[ 6], w[ 7], offset);
      w[15] = hc_bytealign (w[ 5], w[ 6], offset);
      w[14] = hc_bytealign (w[ 4], w[ 5], offset);
      w[13] = hc_bytealign (w[ 3], w[ 4], offset);
      w[12] = hc_bytealign (w[ 2], w[ 3], offset);
      w[11] = hc_bytealign (w[ 1], w[ 2], offset);
      w[10] = hc_bytealign (w[ 0], w[ 1], offset);
      w[ 9] = hc_bytealign (    0, w[ 0], offset);
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 10:
      w[63] = hc_bytealign (w[52], w[53], offset);
      w[62] = hc_bytealign (w[51], w[52], offset);
      w[61] = hc_bytealign (w[50], w[51], offset);
      w[60] = hc_bytealign (w[49], w[50], offset);
      w[59] = hc_bytealign (w[48], w[49], offset);
      w[58] = hc_bytealign (w[47], w[48], offset);
      w[57] = hc_bytealign (w[46], w[47], offset);
      w[56] = hc_bytealign (w[45], w[46], offset);
      w[55] = hc_bytealign (w[44], w[45], offset);
      w[54] = hc_bytealign (w[43], w[44], offset);
      w[53] = hc_bytealign (w[42], w[43], offset);
      w[52] = hc_bytealign (w[41], w[42], offset);
      w[51] = hc_bytealign (w[40], w[41], offset);
      w[50] = hc_bytealign (w[39], w[40], offset);
      w[49] = hc_bytealign (w[38], w[39], offset);
      w[48] = hc_bytealign (w[37], w[38], offset);
      w[47] = hc_bytealign (w[36], w[37], offset);
      w[46] = hc_bytealign (w[35], w[36], offset);
      w[45] = hc_bytealign (w[34], w[35], offset);
      w[44] = hc_bytealign (w[33], w[34], offset);
      w[43] = hc_bytealign (w[32], w[33], offset);
      w[42] = hc_bytealign (w[31], w[32], offset);
      w[41] = hc_bytealign (w[30], w[31], offset);
      w[40] = hc_bytealign (w[29], w[30], offset);
      w[39] = hc_bytealign (w[28], w[29], offset);
      w[38] = hc_bytealign (w[27], w[28], offset);
      w[37] = hc_bytealign (w[26], w[27], offset);
      w[36] = hc_bytealign (w[25], w[26], offset);
      w[35] = hc_bytealign (w[24], w[25], offset);
      w[34] = hc_bytealign (w[23], w[24], offset);
      w[33] = hc_bytealign (w[22], w[23], offset);
      w[32] = hc_bytealign (w[21], w[22], offset);
      w[31] = hc_bytealign (w[20], w[21], offset);
      w[30] = hc_bytealign (w[19], w[20], offset);
      w[29] = hc_bytealign (w[18], w[19], offset);
      w[28] = hc_bytealign (w[17], w[18], offset);
      w[27] = hc_bytealign (w[16], w[17], offset);
      w[26] = hc_bytealign (w[15], w[16], offset);
      w[25] = hc_bytealign (w[14], w[15], offset);
      w[24] = hc_bytealign (w[13], w[14], offset);
      w[23] = hc_bytealign (w[12], w[13], offset);
      w[22] = hc_bytealign (w[11], w[12], offset);
      w[21] = hc_bytealign (w[10], w[11], offset);
      w[20] = hc_bytealign (w[ 9], w[10], offset);
      w[19] = hc_bytealign (w[ 8], w[ 9], offset);
      w[18] = hc_bytealign (w[ 7], w[ 8], offset);
      w[17] = hc_bytealign (w[ 6], w[ 7], offset);
      w[16] = hc_bytealign (w[ 5], w[ 6], offset);
      w[15] = hc_bytealign (w[ 4], w[ 5], offset);
      w[14] = hc_bytealign (w[ 3], w[ 4], offset);
      w[13] = hc_bytealign (w[ 2], w[ 3], offset);
      w[12] = hc_bytealign (w[ 1], w[ 2], offset);
      w[11] = hc_bytealign (w[ 0], w[ 1], offset);
      w[10] = hc_bytealign (    0, w[ 0], offset);
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 11:
      w[63] = hc_bytealign (w[51], w[52], offset);
      w[62] = hc_bytealign (w[50], w[51], offset);
      w[61] = hc_bytealign (w[49], w[50], offset);
      w[60] = hc_bytealign (w[48], w[49], offset);
      w[59] = hc_bytealign (w[47], w[48], offset);
      w[58] = hc_bytealign (w[46], w[47], offset);
      w[57] = hc_bytealign (w[45], w[46], offset);
      w[56] = hc_bytealign (w[44], w[45], offset);
      w[55] = hc_bytealign (w[43], w[44], offset);
      w[54] = hc_bytealign (w[42], w[43], offset);
      w[53] = hc_bytealign (w[41], w[42], offset);
      w[52] = hc_bytealign (w[40], w[41], offset);
      w[51] = hc_bytealign (w[39], w[40], offset);
      w[50] = hc_bytealign (w[38], w[39], offset);
      w[49] = hc_bytealign (w[37], w[38], offset);
      w[48] = hc_bytealign (w[36], w[37], offset);
      w[47] = hc_bytealign (w[35], w[36], offset);
      w[46] = hc_bytealign (w[34], w[35], offset);
      w[45] = hc_bytealign (w[33], w[34], offset);
      w[44] = hc_bytealign (w[32], w[33], offset);
      w[43] = hc_bytealign (w[31], w[32], offset);
      w[42] = hc_bytealign (w[30], w[31], offset);
      w[41] = hc_bytealign (w[29], w[30], offset);
      w[40] = hc_bytealign (w[28], w[29], offset);
      w[39] = hc_bytealign (w[27], w[28], offset);
      w[38] = hc_bytealign (w[26], w[27], offset);
      w[37] = hc_bytealign (w[25], w[26], offset);
      w[36] = hc_bytealign (w[24], w[25], offset);
      w[35] = hc_bytealign (w[23], w[24], offset);
      w[34] = hc_bytealign (w[22], w[23], offset);
      w[33] = hc_bytealign (w[21], w[22], offset);
      w[32] = hc_bytealign (w[20], w[21], offset);
      w[31] = hc_bytealign (w[19], w[20], offset);
      w[30] = hc_bytealign (w[18], w[19], offset);
      w[29] = hc_bytealign (w[17], w[18], offset);
      w[28] = hc_bytealign (w[16], w[17], offset);
      w[27] = hc_bytealign (w[15], w[16], offset);
      w[26] = hc_bytealign (w[14], w[15], offset);
      w[25] = hc_bytealign (w[13], w[14], offset);
      w[24] = hc_bytealign (w[12], w[13], offset);
      w[23] = hc_bytealign (w[11], w[12], offset);
      w[22] = hc_bytealign (w[10], w[11], offset);
      w[21] = hc_bytealign (w[ 9], w[10], offset);
      w[20] = hc_bytealign (w[ 8], w[ 9], offset);
      w[19] = hc_bytealign (w[ 7], w[ 8], offset);
      w[18] = hc_bytealign (w[ 6], w[ 7], offset);
      w[17] = hc_bytealign (w[ 5], w[ 6], offset);
      w[16] = hc_bytealign (w[ 4], w[ 5], offset);
      w[15] = hc_bytealign (w[ 3], w[ 4], offset);
      w[14] = hc_bytealign (w[ 2], w[ 3], offset);
      w[13] = hc_bytealign (w[ 1], w[ 2], offset);
      w[12] = hc_bytealign (w[ 0], w[ 1], offset);
      w[11] = hc_bytealign (    0, w[ 0], offset);
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 12:
      w[63] = hc_bytealign (w[50], w[51], offset);
      w[62] = hc_bytealign (w[49], w[50], offset);
      w[61] = hc_bytealign (w[48], w[49], offset);
      w[60] = hc_bytealign (w[47], w[48], offset);
      w[59] = hc_bytealign (w[46], w[47], offset);
      w[58] = hc_bytealign (w[45], w[46], offset);
      w[57] = hc_bytealign (w[44], w[45], offset);
      w[56] = hc_bytealign (w[43], w[44], offset);
      w[55] = hc_bytealign (w[42], w[43], offset);
      w[54] = hc_bytealign (w[41], w[42], offset);
      w[53] = hc_bytealign (w[40], w[41], offset);
      w[52] = hc_bytealign (w[39], w[40], offset);
      w[51] = hc_bytealign (w[38], w[39], offset);
      w[50] = hc_bytealign (w[37], w[38], offset);
      w[49] = hc_bytealign (w[36], w[37], offset);
      w[48] = hc_bytealign (w[35], w[36], offset);
      w[47] = hc_bytealign (w[34], w[35], offset);
      w[46] = hc_bytealign (w[33], w[34], offset);
      w[45] = hc_bytealign (w[32], w[33], offset);
      w[44] = hc_bytealign (w[31], w[32], offset);
      w[43] = hc_bytealign (w[30], w[31], offset);
      w[42] = hc_bytealign (w[29], w[30], offset);
      w[41] = hc_bytealign (w[28], w[29], offset);
      w[40] = hc_bytealign (w[27], w[28], offset);
      w[39] = hc_bytealign (w[26], w[27], offset);
      w[38] = hc_bytealign (w[25], w[26], offset);
      w[37] = hc_bytealign (w[24], w[25], offset);
      w[36] = hc_bytealign (w[23], w[24], offset);
      w[35] = hc_bytealign (w[22], w[23], offset);
      w[34] = hc_bytealign (w[21], w[22], offset);
      w[33] = hc_bytealign (w[20], w[21], offset);
      w[32] = hc_bytealign (w[19], w[20], offset);
      w[31] = hc_bytealign (w[18], w[19], offset);
      w[30] = hc_bytealign (w[17], w[18], offset);
      w[29] = hc_bytealign (w[16], w[17], offset);
      w[28] = hc_bytealign (w[15], w[16], offset);
      w[27] = hc_bytealign (w[14], w[15], offset);
      w[26] = hc_bytealign (w[13], w[14], offset);
      w[25] = hc_bytealign (w[12], w[13], offset);
      w[24] = hc_bytealign (w[11], w[12], offset);
      w[23] = hc_bytealign (w[10], w[11], offset);
      w[22] = hc_bytealign (w[ 9], w[10], offset);
      w[21] = hc_bytealign (w[ 8], w[ 9], offset);
      w[20] = hc_bytealign (w[ 7], w[ 8], offset);
      w[19] = hc_bytealign (w[ 6], w[ 7], offset);
      w[18] = hc_bytealign (w[ 5], w[ 6], offset);
      w[17] = hc_bytealign (w[ 4], w[ 5], offset);
      w[16] = hc_bytealign (w[ 3], w[ 4], offset);
      w[15] = hc_bytealign (w[ 2], w[ 3], offset);
      w[14] = hc_bytealign (w[ 1], w[ 2], offset);
      w[13] = hc_bytealign (w[ 0], w[ 1], offset);
      w[12] = hc_bytealign (    0, w[ 0], offset);
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 13:
      w[63] = hc_bytealign (w[49], w[50], offset);
      w[62] = hc_bytealign (w[48], w[49], offset);
      w[61] = hc_bytealign (w[47], w[48], offset);
      w[60] = hc_bytealign (w[46], w[47], offset);
      w[59] = hc_bytealign (w[45], w[46], offset);
      w[58] = hc_bytealign (w[44], w[45], offset);
      w[57] = hc_bytealign (w[43], w[44], offset);
      w[56] = hc_bytealign (w[42], w[43], offset);
      w[55] = hc_bytealign (w[41], w[42], offset);
      w[54] = hc_bytealign (w[40], w[41], offset);
      w[53] = hc_bytealign (w[39], w[40], offset);
      w[52] = hc_bytealign (w[38], w[39], offset);
      w[51] = hc_bytealign (w[37], w[38], offset);
      w[50] = hc_bytealign (w[36], w[37], offset);
      w[49] = hc_bytealign (w[35], w[36], offset);
      w[48] = hc_bytealign (w[34], w[35], offset);
      w[47] = hc_bytealign (w[33], w[34], offset);
      w[46] = hc_bytealign (w[32], w[33], offset);
      w[45] = hc_bytealign (w[31], w[32], offset);
      w[44] = hc_bytealign (w[30], w[31], offset);
      w[43] = hc_bytealign (w[29], w[30], offset);
      w[42] = hc_bytealign (w[28], w[29], offset);
      w[41] = hc_bytealign (w[27], w[28], offset);
      w[40] = hc_bytealign (w[26], w[27], offset);
      w[39] = hc_bytealign (w[25], w[26], offset);
      w[38] = hc_bytealign (w[24], w[25], offset);
      w[37] = hc_bytealign (w[23], w[24], offset);
      w[36] = hc_bytealign (w[22], w[23], offset);
      w[35] = hc_bytealign (w[21], w[22], offset);
      w[34] = hc_bytealign (w[20], w[21], offset);
      w[33] = hc_bytealign (w[19], w[20], offset);
      w[32] = hc_bytealign (w[18], w[19], offset);
      w[31] = hc_bytealign (w[17], w[18], offset);
      w[30] = hc_bytealign (w[16], w[17], offset);
      w[29] = hc_bytealign (w[15], w[16], offset);
      w[28] = hc_bytealign (w[14], w[15], offset);
      w[27] = hc_bytealign (w[13], w[14], offset);
      w[26] = hc_bytealign (w[12], w[13], offset);
      w[25] = hc_bytealign (w[11], w[12], offset);
      w[24] = hc_bytealign (w[10], w[11], offset);
      w[23] = hc_bytealign (w[ 9], w[10], offset);
      w[22] = hc_bytealign (w[ 8], w[ 9], offset);
      w[21] = hc_bytealign (w[ 7], w[ 8], offset);
      w[20] = hc_bytealign (w[ 6], w[ 7], offset);
      w[19] = hc_bytealign (w[ 5], w[ 6], offset);
      w[18] = hc_bytealign (w[ 4], w[ 5], offset);
      w[17] = hc_bytealign (w[ 3], w[ 4], offset);
      w[16] = hc_bytealign (w[ 2], w[ 3], offset);
      w[15] = hc_bytealign (w[ 1], w[ 2], offset);
      w[14] = hc_bytealign (w[ 0], w[ 1], offset);
      w[13] = hc_bytealign (    0, w[ 0], offset);
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 14:
      w[63] = hc_bytealign (w[48], w[49], offset);
      w[62] = hc_bytealign (w[47], w[48], offset);
      w[61] = hc_bytealign (w[46], w[47], offset);
      w[60] = hc_bytealign (w[45], w[46], offset);
      w[59] = hc_bytealign (w[44], w[45], offset);
      w[58] = hc_bytealign (w[43], w[44], offset);
      w[57] = hc_bytealign (w[42], w[43], offset);
      w[56] = hc_bytealign (w[41], w[42], offset);
      w[55] = hc_bytealign (w[40], w[41], offset);
      w[54] = hc_bytealign (w[39], w[40], offset);
      w[53] = hc_bytealign (w[38], w[39], offset);
      w[52] = hc_bytealign (w[37], w[38], offset);
      w[51] = hc_bytealign (w[36], w[37], offset);
      w[50] = hc_bytealign (w[35], w[36], offset);
      w[49] = hc_bytealign (w[34], w[35], offset);
      w[48] = hc_bytealign (w[33], w[34], offset);
      w[47] = hc_bytealign (w[32], w[33], offset);
      w[46] = hc_bytealign (w[31], w[32], offset);
      w[45] = hc_bytealign (w[30], w[31], offset);
      w[44] = hc_bytealign (w[29], w[30], offset);
      w[43] = hc_bytealign (w[28], w[29], offset);
      w[42] = hc_bytealign (w[27], w[28], offset);
      w[41] = hc_bytealign (w[26], w[27], offset);
      w[40] = hc_bytealign (w[25], w[26], offset);
      w[39] = hc_bytealign (w[24], w[25], offset);
      w[38] = hc_bytealign (w[23], w[24], offset);
      w[37] = hc_bytealign (w[22], w[23], offset);
      w[36] = hc_bytealign (w[21], w[22], offset);
      w[35] = hc_bytealign (w[20], w[21], offset);
      w[34] = hc_bytealign (w[19], w[20], offset);
      w[33] = hc_bytealign (w[18], w[19], offset);
      w[32] = hc_bytealign (w[17], w[18], offset);
      w[31] = hc_bytealign (w[16], w[17], offset);
      w[30] = hc_bytealign (w[15], w[16], offset);
      w[29] = hc_bytealign (w[14], w[15], offset);
      w[28] = hc_bytealign (w[13], w[14], offset);
      w[27] = hc_bytealign (w[12], w[13], offset);
      w[26] = hc_bytealign (w[11], w[12], offset);
      w[25] = hc_bytealign (w[10], w[11], offset);
      w[24] = hc_bytealign (w[ 9], w[10], offset);
      w[23] = hc_bytealign (w[ 8], w[ 9], offset);
      w[22] = hc_bytealign (w[ 7], w[ 8], offset);
      w[21] = hc_bytealign (w[ 6], w[ 7], offset);
      w[20] = hc_bytealign (w[ 5], w[ 6], offset);
      w[19] = hc_bytealign (w[ 4], w[ 5], offset);
      w[18] = hc_bytealign (w[ 3], w[ 4], offset);
      w[17] = hc_bytealign (w[ 2], w[ 3], offset);
      w[16] = hc_bytealign (w[ 1], w[ 2], offset);
      w[15] = hc_bytealign (w[ 0], w[ 1], offset);
      w[14] = hc_bytealign (    0, w[ 0], offset);
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 15:
      w[63] = hc_bytealign (w[47], w[48], offset);
      w[62] = hc_bytealign (w[46], w[47], offset);
      w[61] = hc_bytealign (w[45], w[46], offset);
      w[60] = hc_bytealign (w[44], w[45], offset);
      w[59] = hc_bytealign (w[43], w[44], offset);
      w[58] = hc_bytealign (w[42], w[43], offset);
      w[57] = hc_bytealign (w[41], w[42], offset);
      w[56] = hc_bytealign (w[40], w[41], offset);
      w[55] = hc_bytealign (w[39], w[40], offset);
      w[54] = hc_bytealign (w[38], w[39], offset);
      w[53] = hc_bytealign (w[37], w[38], offset);
      w[52] = hc_bytealign (w[36], w[37], offset);
      w[51] = hc_bytealign (w[35], w[36], offset);
      w[50] = hc_bytealign (w[34], w[35], offset);
      w[49] = hc_bytealign (w[33], w[34], offset);
      w[48] = hc_bytealign (w[32], w[33], offset);
      w[47] = hc_bytealign (w[31], w[32], offset);
      w[46] = hc_bytealign (w[30], w[31], offset);
      w[45] = hc_bytealign (w[29], w[30], offset);
      w[44] = hc_bytealign (w[28], w[29], offset);
      w[43] = hc_bytealign (w[27], w[28], offset);
      w[42] = hc_bytealign (w[26], w[27], offset);
      w[41] = hc_bytealign (w[25], w[26], offset);
      w[40] = hc_bytealign (w[24], w[25], offset);
      w[39] = hc_bytealign (w[23], w[24], offset);
      w[38] = hc_bytealign (w[22], w[23], offset);
      w[37] = hc_bytealign (w[21], w[22], offset);
      w[36] = hc_bytealign (w[20], w[21], offset);
      w[35] = hc_bytealign (w[19], w[20], offset);
      w[34] = hc_bytealign (w[18], w[19], offset);
      w[33] = hc_bytealign (w[17], w[18], offset);
      w[32] = hc_bytealign (w[16], w[17], offset);
      w[31] = hc_bytealign (w[15], w[16], offset);
      w[30] = hc_bytealign (w[14], w[15], offset);
      w[29] = hc_bytealign (w[13], w[14], offset);
      w[28] = hc_bytealign (w[12], w[13], offset);
      w[27] = hc_bytealign (w[11], w[12], offset);
      w[26] = hc_bytealign (w[10], w[11], offset);
      w[25] = hc_bytealign (w[ 9], w[10], offset);
      w[24] = hc_bytealign (w[ 8], w[ 9], offset);
      w[23] = hc_bytealign (w[ 7], w[ 8], offset);
      w[22] = hc_bytealign (w[ 6], w[ 7], offset);
      w[21] = hc_bytealign (w[ 5], w[ 6], offset);
      w[20] = hc_bytealign (w[ 4], w[ 5], offset);
      w[19] = hc_bytealign (w[ 3], w[ 4], offset);
      w[18] = hc_bytealign (w[ 2], w[ 3], offset);
      w[17] = hc_bytealign (w[ 1], w[ 2], offset);
      w[16] = hc_bytealign (w[ 0], w[ 1], offset);
      w[15] = hc_bytealign (    0, w[ 0], offset);
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 16:
      w[63] = hc_bytealign (w[46], w[47], offset);
      w[62] = hc_bytealign (w[45], w[46], offset);
      w[61] = hc_bytealign (w[44], w[45], offset);
      w[60] = hc_bytealign (w[43], w[44], offset);
      w[59] = hc_bytealign (w[42], w[43], offset);
      w[58] = hc_bytealign (w[41], w[42], offset);
      w[57] = hc_bytealign (w[40], w[41], offset);
      w[56] = hc_bytealign (w[39], w[40], offset);
      w[55] = hc_bytealign (w[38], w[39], offset);
      w[54] = hc_bytealign (w[37], w[38], offset);
      w[53] = hc_bytealign (w[36], w[37], offset);
      w[52] = hc_bytealign (w[35], w[36], offset);
      w[51] = hc_bytealign (w[34], w[35], offset);
      w[50] = hc_bytealign (w[33], w[34], offset);
      w[49] = hc_bytealign (w[32], w[33], offset);
      w[48] = hc_bytealign (w[31], w[32], offset);
      w[47] = hc_bytealign (w[30], w[31], offset);
      w[46] = hc_bytealign (w[29], w[30], offset);
      w[45] = hc_bytealign (w[28], w[29], offset);
      w[44] = hc_bytealign (w[27], w[28], offset);
      w[43] = hc_bytealign (w[26], w[27], offset);
      w[42] = hc_bytealign (w[25], w[26], offset);
      w[41] = hc_bytealign (w[24], w[25], offset);
      w[40] = hc_bytealign (w[23], w[24], offset);
      w[39] = hc_bytealign (w[22], w[23], offset);
      w[38] = hc_bytealign (w[21], w[22], offset);
      w[37] = hc_bytealign (w[20], w[21], offset);
      w[36] = hc_bytealign (w[19], w[20], offset);
      w[35] = hc_bytealign (w[18], w[19], offset);
      w[34] = hc_bytealign (w[17], w[18], offset);
      w[33] = hc_bytealign (w[16], w[17], offset);
      w[32] = hc_bytealign (w[15], w[16], offset);
      w[31] = hc_bytealign (w[14], w[15], offset);
      w[30] = hc_bytealign (w[13], w[14], offset);
      w[29] = hc_bytealign (w[12], w[13], offset);
      w[28] = hc_bytealign (w[11], w[12], offset);
      w[27] = hc_bytealign (w[10], w[11], offset);
      w[26] = hc_bytealign (w[ 9], w[10], offset);
      w[25] = hc_bytealign (w[ 8], w[ 9], offset);
      w[24] = hc_bytealign (w[ 7], w[ 8], offset);
      w[23] = hc_bytealign (w[ 6], w[ 7], offset);
      w[22] = hc_bytealign (w[ 5], w[ 6], offset);
      w[21] = hc_bytealign (w[ 4], w[ 5], offset);
      w[20] = hc_bytealign (w[ 3], w[ 4], offset);
      w[19] = hc_bytealign (w[ 2], w[ 3], offset);
      w[18] = hc_bytealign (w[ 1], w[ 2], offset);
      w[17] = hc_bytealign (w[ 0], w[ 1], offset);
      w[16] = hc_bytealign (    0, w[ 0], offset);
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 17:
      w[63] = hc_bytealign (w[45], w[46], offset);
      w[62] = hc_bytealign (w[44], w[45], offset);
      w[61] = hc_bytealign (w[43], w[44], offset);
      w[60] = hc_bytealign (w[42], w[43], offset);
      w[59] = hc_bytealign (w[41], w[42], offset);
      w[58] = hc_bytealign (w[40], w[41], offset);
      w[57] = hc_bytealign (w[39], w[40], offset);
      w[56] = hc_bytealign (w[38], w[39], offset);
      w[55] = hc_bytealign (w[37], w[38], offset);
      w[54] = hc_bytealign (w[36], w[37], offset);
      w[53] = hc_bytealign (w[35], w[36], offset);
      w[52] = hc_bytealign (w[34], w[35], offset);
      w[51] = hc_bytealign (w[33], w[34], offset);
      w[50] = hc_bytealign (w[32], w[33], offset);
      w[49] = hc_bytealign (w[31], w[32], offset);
      w[48] = hc_bytealign (w[30], w[31], offset);
      w[47] = hc_bytealign (w[29], w[30], offset);
      w[46] = hc_bytealign (w[28], w[29], offset);
      w[45] = hc_bytealign (w[27], w[28], offset);
      w[44] = hc_bytealign (w[26], w[27], offset);
      w[43] = hc_bytealign (w[25], w[26], offset);
      w[42] = hc_bytealign (w[24], w[25], offset);
      w[41] = hc_bytealign (w[23], w[24], offset);
      w[40] = hc_bytealign (w[22], w[23], offset);
      w[39] = hc_bytealign (w[21], w[22], offset);
      w[38] = hc_bytealign (w[20], w[21], offset);
      w[37] = hc_bytealign (w[19], w[20], offset);
      w[36] = hc_bytealign (w[18], w[19], offset);
      w[35] = hc_bytealign (w[17], w[18], offset);
      w[34] = hc_bytealign (w[16], w[17], offset);
      w[33] = hc_bytealign (w[15], w[16], offset);
      w[32] = hc_bytealign (w[14], w[15], offset);
      w[31] = hc_bytealign (w[13], w[14], offset);
      w[30] = hc_bytealign (w[12], w[13], offset);
      w[29] = hc_bytealign (w[11], w[12], offset);
      w[28] = hc_bytealign (w[10], w[11], offset);
      w[27] = hc_bytealign (w[ 9], w[10], offset);
      w[26] = hc_bytealign (w[ 8], w[ 9], offset);
      w[25] = hc_bytealign (w[ 7], w[ 8], offset);
      w[24] = hc_bytealign (w[ 6], w[ 7], offset);
      w[23] = hc_bytealign (w[ 5], w[ 6], offset);
      w[22] = hc_bytealign (w[ 4], w[ 5], offset);
      w[21] = hc_bytealign (w[ 3], w[ 4], offset);
      w[20] = hc_bytealign (w[ 2], w[ 3], offset);
      w[19] = hc_bytealign (w[ 1], w[ 2], offset);
      w[18] = hc_bytealign (w[ 0], w[ 1], offset);
      w[17] = hc_bytealign (    0, w[ 0], offset);
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 18:
      w[63] = hc_bytealign (w[44], w[45], offset);
      w[62] = hc_bytealign (w[43], w[44], offset);
      w[61] = hc_bytealign (w[42], w[43], offset);
      w[60] = hc_bytealign (w[41], w[42], offset);
      w[59] = hc_bytealign (w[40], w[41], offset);
      w[58] = hc_bytealign (w[39], w[40], offset);
      w[57] = hc_bytealign (w[38], w[39], offset);
      w[56] = hc_bytealign (w[37], w[38], offset);
      w[55] = hc_bytealign (w[36], w[37], offset);
      w[54] = hc_bytealign (w[35], w[36], offset);
      w[53] = hc_bytealign (w[34], w[35], offset);
      w[52] = hc_bytealign (w[33], w[34], offset);
      w[51] = hc_bytealign (w[32], w[33], offset);
      w[50] = hc_bytealign (w[31], w[32], offset);
      w[49] = hc_bytealign (w[30], w[31], offset);
      w[48] = hc_bytealign (w[29], w[30], offset);
      w[47] = hc_bytealign (w[28], w[29], offset);
      w[46] = hc_bytealign (w[27], w[28], offset);
      w[45] = hc_bytealign (w[26], w[27], offset);
      w[44] = hc_bytealign (w[25], w[26], offset);
      w[43] = hc_bytealign (w[24], w[25], offset);
      w[42] = hc_bytealign (w[23], w[24], offset);
      w[41] = hc_bytealign (w[22], w[23], offset);
      w[40] = hc_bytealign (w[21], w[22], offset);
      w[39] = hc_bytealign (w[20], w[21], offset);
      w[38] = hc_bytealign (w[19], w[20], offset);
      w[37] = hc_bytealign (w[18], w[19], offset);
      w[36] = hc_bytealign (w[17], w[18], offset);
      w[35] = hc_bytealign (w[16], w[17], offset);
      w[34] = hc_bytealign (w[15], w[16], offset);
      w[33] = hc_bytealign (w[14], w[15], offset);
      w[32] = hc_bytealign (w[13], w[14], offset);
      w[31] = hc_bytealign (w[12], w[13], offset);
      w[30] = hc_bytealign (w[11], w[12], offset);
      w[29] = hc_bytealign (w[10], w[11], offset);
      w[28] = hc_bytealign (w[ 9], w[10], offset);
      w[27] = hc_bytealign (w[ 8], w[ 9], offset);
      w[26] = hc_bytealign (w[ 7], w[ 8], offset);
      w[25] = hc_bytealign (w[ 6], w[ 7], offset);
      w[24] = hc_bytealign (w[ 5], w[ 6], offset);
      w[23] = hc_bytealign (w[ 4], w[ 5], offset);
      w[22] = hc_bytealign (w[ 3], w[ 4], offset);
      w[21] = hc_bytealign (w[ 2], w[ 3], offset);
      w[20] = hc_bytealign (w[ 1], w[ 2], offset);
      w[19] = hc_bytealign (w[ 0], w[ 1], offset);
      w[18] = hc_bytealign (    0, w[ 0], offset);
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 19:
      w[63] = hc_bytealign (w[43], w[44], offset);
      w[62] = hc_bytealign (w[42], w[43], offset);
      w[61] = hc_bytealign (w[41], w[42], offset);
      w[60] = hc_bytealign (w[40], w[41], offset);
      w[59] = hc_bytealign (w[39], w[40], offset);
      w[58] = hc_bytealign (w[38], w[39], offset);
      w[57] = hc_bytealign (w[37], w[38], offset);
      w[56] = hc_bytealign (w[36], w[37], offset);
      w[55] = hc_bytealign (w[35], w[36], offset);
      w[54] = hc_bytealign (w[34], w[35], offset);
      w[53] = hc_bytealign (w[33], w[34], offset);
      w[52] = hc_bytealign (w[32], w[33], offset);
      w[51] = hc_bytealign (w[31], w[32], offset);
      w[50] = hc_bytealign (w[30], w[31], offset);
      w[49] = hc_bytealign (w[29], w[30], offset);
      w[48] = hc_bytealign (w[28], w[29], offset);
      w[47] = hc_bytealign (w[27], w[28], offset);
      w[46] = hc_bytealign (w[26], w[27], offset);
      w[45] = hc_bytealign (w[25], w[26], offset);
      w[44] = hc_bytealign (w[24], w[25], offset);
      w[43] = hc_bytealign (w[23], w[24], offset);
      w[42] = hc_bytealign (w[22], w[23], offset);
      w[41] = hc_bytealign (w[21], w[22], offset);
      w[40] = hc_bytealign (w[20], w[21], offset);
      w[39] = hc_bytealign (w[19], w[20], offset);
      w[38] = hc_bytealign (w[18], w[19], offset);
      w[37] = hc_bytealign (w[17], w[18], offset);
      w[36] = hc_bytealign (w[16], w[17], offset);
      w[35] = hc_bytealign (w[15], w[16], offset);
      w[34] = hc_bytealign (w[14], w[15], offset);
      w[33] = hc_bytealign (w[13], w[14], offset);
      w[32] = hc_bytealign (w[12], w[13], offset);
      w[31] = hc_bytealign (w[11], w[12], offset);
      w[30] = hc_bytealign (w[10], w[11], offset);
      w[29] = hc_bytealign (w[ 9], w[10], offset);
      w[28] = hc_bytealign (w[ 8], w[ 9], offset);
      w[27] = hc_bytealign (w[ 7], w[ 8], offset);
      w[26] = hc_bytealign (w[ 6], w[ 7], offset);
      w[25] = hc_bytealign (w[ 5], w[ 6], offset);
      w[24] = hc_bytealign (w[ 4], w[ 5], offset);
      w[23] = hc_bytealign (w[ 3], w[ 4], offset);
      w[22] = hc_bytealign (w[ 2], w[ 3], offset);
      w[21] = hc_bytealign (w[ 1], w[ 2], offset);
      w[20] = hc_bytealign (w[ 0], w[ 1], offset);
      w[19] = hc_bytealign (    0, w[ 0], offset);
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 20:
      w[63] = hc_bytealign (w[42], w[43], offset);
      w[62] = hc_bytealign (w[41], w[42], offset);
      w[61] = hc_bytealign (w[40], w[41], offset);
      w[60] = hc_bytealign (w[39], w[40], offset);
      w[59] = hc_bytealign (w[38], w[39], offset);
      w[58] = hc_bytealign (w[37], w[38], offset);
      w[57] = hc_bytealign (w[36], w[37], offset);
      w[56] = hc_bytealign (w[35], w[36], offset);
      w[55] = hc_bytealign (w[34], w[35], offset);
      w[54] = hc_bytealign (w[33], w[34], offset);
      w[53] = hc_bytealign (w[32], w[33], offset);
      w[52] = hc_bytealign (w[31], w[32], offset);
      w[51] = hc_bytealign (w[30], w[31], offset);
      w[50] = hc_bytealign (w[29], w[30], offset);
      w[49] = hc_bytealign (w[28], w[29], offset);
      w[48] = hc_bytealign (w[27], w[28], offset);
      w[47] = hc_bytealign (w[26], w[27], offset);
      w[46] = hc_bytealign (w[25], w[26], offset);
      w[45] = hc_bytealign (w[24], w[25], offset);
      w[44] = hc_bytealign (w[23], w[24], offset);
      w[43] = hc_bytealign (w[22], w[23], offset);
      w[42] = hc_bytealign (w[21], w[22], offset);
      w[41] = hc_bytealign (w[20], w[21], offset);
      w[40] = hc_bytealign (w[19], w[20], offset);
      w[39] = hc_bytealign (w[18], w[19], offset);
      w[38] = hc_bytealign (w[17], w[18], offset);
      w[37] = hc_bytealign (w[16], w[17], offset);
      w[36] = hc_bytealign (w[15], w[16], offset);
      w[35] = hc_bytealign (w[14], w[15], offset);
      w[34] = hc_bytealign (w[13], w[14], offset);
      w[33] = hc_bytealign (w[12], w[13], offset);
      w[32] = hc_bytealign (w[11], w[12], offset);
      w[31] = hc_bytealign (w[10], w[11], offset);
      w[30] = hc_bytealign (w[ 9], w[10], offset);
      w[29] = hc_bytealign (w[ 8], w[ 9], offset);
      w[28] = hc_bytealign (w[ 7], w[ 8], offset);
      w[27] = hc_bytealign (w[ 6], w[ 7], offset);
      w[26] = hc_bytealign (w[ 5], w[ 6], offset);
      w[25] = hc_bytealign (w[ 4], w[ 5], offset);
      w[24] = hc_bytealign (w[ 3], w[ 4], offset);
      w[23] = hc_bytealign (w[ 2], w[ 3], offset);
      w[22] = hc_bytealign (w[ 1], w[ 2], offset);
      w[21] = hc_bytealign (w[ 0], w[ 1], offset);
      w[20] = hc_bytealign (    0, w[ 0], offset);
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 21:
      w[63] = hc_bytealign (w[41], w[42], offset);
      w[62] = hc_bytealign (w[40], w[41], offset);
      w[61] = hc_bytealign (w[39], w[40], offset);
      w[60] = hc_bytealign (w[38], w[39], offset);
      w[59] = hc_bytealign (w[37], w[38], offset);
      w[58] = hc_bytealign (w[36], w[37], offset);
      w[57] = hc_bytealign (w[35], w[36], offset);
      w[56] = hc_bytealign (w[34], w[35], offset);
      w[55] = hc_bytealign (w[33], w[34], offset);
      w[54] = hc_bytealign (w[32], w[33], offset);
      w[53] = hc_bytealign (w[31], w[32], offset);
      w[52] = hc_bytealign (w[30], w[31], offset);
      w[51] = hc_bytealign (w[29], w[30], offset);
      w[50] = hc_bytealign (w[28], w[29], offset);
      w[49] = hc_bytealign (w[27], w[28], offset);
      w[48] = hc_bytealign (w[26], w[27], offset);
      w[47] = hc_bytealign (w[25], w[26], offset);
      w[46] = hc_bytealign (w[24], w[25], offset);
      w[45] = hc_bytealign (w[23], w[24], offset);
      w[44] = hc_bytealign (w[22], w[23], offset);
      w[43] = hc_bytealign (w[21], w[22], offset);
      w[42] = hc_bytealign (w[20], w[21], offset);
      w[41] = hc_bytealign (w[19], w[20], offset);
      w[40] = hc_bytealign (w[18], w[19], offset);
      w[39] = hc_bytealign (w[17], w[18], offset);
      w[38] = hc_bytealign (w[16], w[17], offset);
      w[37] = hc_bytealign (w[15], w[16], offset);
      w[36] = hc_bytealign (w[14], w[15], offset);
      w[35] = hc_bytealign (w[13], w[14], offset);
      w[34] = hc_bytealign (w[12], w[13], offset);
      w[33] = hc_bytealign (w[11], w[12], offset);
      w[32] = hc_bytealign (w[10], w[11], offset);
      w[31] = hc_bytealign (w[ 9], w[10], offset);
      w[30] = hc_bytealign (w[ 8], w[ 9], offset);
      w[29] = hc_bytealign (w[ 7], w[ 8], offset);
      w[28] = hc_bytealign (w[ 6], w[ 7], offset);
      w[27] = hc_bytealign (w[ 5], w[ 6], offset);
      w[26] = hc_bytealign (w[ 4], w[ 5], offset);
      w[25] = hc_bytealign (w[ 3], w[ 4], offset);
      w[24] = hc_bytealign (w[ 2], w[ 3], offset);
      w[23] = hc_bytealign (w[ 1], w[ 2], offset);
      w[22] = hc_bytealign (w[ 0], w[ 1], offset);
      w[21] = hc_bytealign (    0, w[ 0], offset);
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 22:
      w[63] = hc_bytealign (w[40], w[41], offset);
      w[62] = hc_bytealign (w[39], w[40], offset);
      w[61] = hc_bytealign (w[38], w[39], offset);
      w[60] = hc_bytealign (w[37], w[38], offset);
      w[59] = hc_bytealign (w[36], w[37], offset);
      w[58] = hc_bytealign (w[35], w[36], offset);
      w[57] = hc_bytealign (w[34], w[35], offset);
      w[56] = hc_bytealign (w[33], w[34], offset);
      w[55] = hc_bytealign (w[32], w[33], offset);
      w[54] = hc_bytealign (w[31], w[32], offset);
      w[53] = hc_bytealign (w[30], w[31], offset);
      w[52] = hc_bytealign (w[29], w[30], offset);
      w[51] = hc_bytealign (w[28], w[29], offset);
      w[50] = hc_bytealign (w[27], w[28], offset);
      w[49] = hc_bytealign (w[26], w[27], offset);
      w[48] = hc_bytealign (w[25], w[26], offset);
      w[47] = hc_bytealign (w[24], w[25], offset);
      w[46] = hc_bytealign (w[23], w[24], offset);
      w[45] = hc_bytealign (w[22], w[23], offset);
      w[44] = hc_bytealign (w[21], w[22], offset);
      w[43] = hc_bytealign (w[20], w[21], offset);
      w[42] = hc_bytealign (w[19], w[20], offset);
      w[41] = hc_bytealign (w[18], w[19], offset);
      w[40] = hc_bytealign (w[17], w[18], offset);
      w[39] = hc_bytealign (w[16], w[17], offset);
      w[38] = hc_bytealign (w[15], w[16], offset);
      w[37] = hc_bytealign (w[14], w[15], offset);
      w[36] = hc_bytealign (w[13], w[14], offset);
      w[35] = hc_bytealign (w[12], w[13], offset);
      w[34] = hc_bytealign (w[11], w[12], offset);
      w[33] = hc_bytealign (w[10], w[11], offset);
      w[32] = hc_bytealign (w[ 9], w[10], offset);
      w[31] = hc_bytealign (w[ 8], w[ 9], offset);
      w[30] = hc_bytealign (w[ 7], w[ 8], offset);
      w[29] = hc_bytealign (w[ 6], w[ 7], offset);
      w[28] = hc_bytealign (w[ 5], w[ 6], offset);
      w[27] = hc_bytealign (w[ 4], w[ 5], offset);
      w[26] = hc_bytealign (w[ 3], w[ 4], offset);
      w[25] = hc_bytealign (w[ 2], w[ 3], offset);
      w[24] = hc_bytealign (w[ 1], w[ 2], offset);
      w[23] = hc_bytealign (w[ 0], w[ 1], offset);
      w[22] = hc_bytealign (    0, w[ 0], offset);
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 23:
      w[63] = hc_bytealign (w[39], w[40], offset);
      w[62] = hc_bytealign (w[38], w[39], offset);
      w[61] = hc_bytealign (w[37], w[38], offset);
      w[60] = hc_bytealign (w[36], w[37], offset);
      w[59] = hc_bytealign (w[35], w[36], offset);
      w[58] = hc_bytealign (w[34], w[35], offset);
      w[57] = hc_bytealign (w[33], w[34], offset);
      w[56] = hc_bytealign (w[32], w[33], offset);
      w[55] = hc_bytealign (w[31], w[32], offset);
      w[54] = hc_bytealign (w[30], w[31], offset);
      w[53] = hc_bytealign (w[29], w[30], offset);
      w[52] = hc_bytealign (w[28], w[29], offset);
      w[51] = hc_bytealign (w[27], w[28], offset);
      w[50] = hc_bytealign (w[26], w[27], offset);
      w[49] = hc_bytealign (w[25], w[26], offset);
      w[48] = hc_bytealign (w[24], w[25], offset);
      w[47] = hc_bytealign (w[23], w[24], offset);
      w[46] = hc_bytealign (w[22], w[23], offset);
      w[45] = hc_bytealign (w[21], w[22], offset);
      w[44] = hc_bytealign (w[20], w[21], offset);
      w[43] = hc_bytealign (w[19], w[20], offset);
      w[42] = hc_bytealign (w[18], w[19], offset);
      w[41] = hc_bytealign (w[17], w[18], offset);
      w[40] = hc_bytealign (w[16], w[17], offset);
      w[39] = hc_bytealign (w[15], w[16], offset);
      w[38] = hc_bytealign (w[14], w[15], offset);
      w[37] = hc_bytealign (w[13], w[14], offset);
      w[36] = hc_bytealign (w[12], w[13], offset);
      w[35] = hc_bytealign (w[11], w[12], offset);
      w[34] = hc_bytealign (w[10], w[11], offset);
      w[33] = hc_bytealign (w[ 9], w[10], offset);
      w[32] = hc_bytealign (w[ 8], w[ 9], offset);
      w[31] = hc_bytealign (w[ 7], w[ 8], offset);
      w[30] = hc_bytealign (w[ 6], w[ 7], offset);
      w[29] = hc_bytealign (w[ 5], w[ 6], offset);
      w[28] = hc_bytealign (w[ 4], w[ 5], offset);
      w[27] = hc_bytealign (w[ 3], w[ 4], offset);
      w[26] = hc_bytealign (w[ 2], w[ 3], offset);
      w[25] = hc_bytealign (w[ 1], w[ 2], offset);
      w[24] = hc_bytealign (w[ 0], w[ 1], offset);
      w[23] = hc_bytealign (    0, w[ 0], offset);
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 24:
      w[63] = hc_bytealign (w[38], w[39], offset);
      w[62] = hc_bytealign (w[37], w[38], offset);
      w[61] = hc_bytealign (w[36], w[37], offset);
      w[60] = hc_bytealign (w[35], w[36], offset);
      w[59] = hc_bytealign (w[34], w[35], offset);
      w[58] = hc_bytealign (w[33], w[34], offset);
      w[57] = hc_bytealign (w[32], w[33], offset);
      w[56] = hc_bytealign (w[31], w[32], offset);
      w[55] = hc_bytealign (w[30], w[31], offset);
      w[54] = hc_bytealign (w[29], w[30], offset);
      w[53] = hc_bytealign (w[28], w[29], offset);
      w[52] = hc_bytealign (w[27], w[28], offset);
      w[51] = hc_bytealign (w[26], w[27], offset);
      w[50] = hc_bytealign (w[25], w[26], offset);
      w[49] = hc_bytealign (w[24], w[25], offset);
      w[48] = hc_bytealign (w[23], w[24], offset);
      w[47] = hc_bytealign (w[22], w[23], offset);
      w[46] = hc_bytealign (w[21], w[22], offset);
      w[45] = hc_bytealign (w[20], w[21], offset);
      w[44] = hc_bytealign (w[19], w[20], offset);
      w[43] = hc_bytealign (w[18], w[19], offset);
      w[42] = hc_bytealign (w[17], w[18], offset);
      w[41] = hc_bytealign (w[16], w[17], offset);
      w[40] = hc_bytealign (w[15], w[16], offset);
      w[39] = hc_bytealign (w[14], w[15], offset);
      w[38] = hc_bytealign (w[13], w[14], offset);
      w[37] = hc_bytealign (w[12], w[13], offset);
      w[36] = hc_bytealign (w[11], w[12], offset);
      w[35] = hc_bytealign (w[10], w[11], offset);
      w[34] = hc_bytealign (w[ 9], w[10], offset);
      w[33] = hc_bytealign (w[ 8], w[ 9], offset);
      w[32] = hc_bytealign (w[ 7], w[ 8], offset);
      w[31] = hc_bytealign (w[ 6], w[ 7], offset);
      w[30] = hc_bytealign (w[ 5], w[ 6], offset);
      w[29] = hc_bytealign (w[ 4], w[ 5], offset);
      w[28] = hc_bytealign (w[ 3], w[ 4], offset);
      w[27] = hc_bytealign (w[ 2], w[ 3], offset);
      w[26] = hc_bytealign (w[ 1], w[ 2], offset);
      w[25] = hc_bytealign (w[ 0], w[ 1], offset);
      w[24] = hc_bytealign (    0, w[ 0], offset);
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 25:
      w[63] = hc_bytealign (w[37], w[38], offset);
      w[62] = hc_bytealign (w[36], w[37], offset);
      w[61] = hc_bytealign (w[35], w[36], offset);
      w[60] = hc_bytealign (w[34], w[35], offset);
      w[59] = hc_bytealign (w[33], w[34], offset);
      w[58] = hc_bytealign (w[32], w[33], offset);
      w[57] = hc_bytealign (w[31], w[32], offset);
      w[56] = hc_bytealign (w[30], w[31], offset);
      w[55] = hc_bytealign (w[29], w[30], offset);
      w[54] = hc_bytealign (w[28], w[29], offset);
      w[53] = hc_bytealign (w[27], w[28], offset);
      w[52] = hc_bytealign (w[26], w[27], offset);
      w[51] = hc_bytealign (w[25], w[26], offset);
      w[50] = hc_bytealign (w[24], w[25], offset);
      w[49] = hc_bytealign (w[23], w[24], offset);
      w[48] = hc_bytealign (w[22], w[23], offset);
      w[47] = hc_bytealign (w[21], w[22], offset);
      w[46] = hc_bytealign (w[20], w[21], offset);
      w[45] = hc_bytealign (w[19], w[20], offset);
      w[44] = hc_bytealign (w[18], w[19], offset);
      w[43] = hc_bytealign (w[17], w[18], offset);
      w[42] = hc_bytealign (w[16], w[17], offset);
      w[41] = hc_bytealign (w[15], w[16], offset);
      w[40] = hc_bytealign (w[14], w[15], offset);
      w[39] = hc_bytealign (w[13], w[14], offset);
      w[38] = hc_bytealign (w[12], w[13], offset);
      w[37] = hc_bytealign (w[11], w[12], offset);
      w[36] = hc_bytealign (w[10], w[11], offset);
      w[35] = hc_bytealign (w[ 9], w[10], offset);
      w[34] = hc_bytealign (w[ 8], w[ 9], offset);
      w[33] = hc_bytealign (w[ 7], w[ 8], offset);
      w[32] = hc_bytealign (w[ 6], w[ 7], offset);
      w[31] = hc_bytealign (w[ 5], w[ 6], offset);
      w[30] = hc_bytealign (w[ 4], w[ 5], offset);
      w[29] = hc_bytealign (w[ 3], w[ 4], offset);
      w[28] = hc_bytealign (w[ 2], w[ 3], offset);
      w[27] = hc_bytealign (w[ 1], w[ 2], offset);
      w[26] = hc_bytealign (w[ 0], w[ 1], offset);
      w[25] = hc_bytealign (    0, w[ 0], offset);
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 26:
      w[63] = hc_bytealign (w[36], w[37], offset);
      w[62] = hc_bytealign (w[35], w[36], offset);
      w[61] = hc_bytealign (w[34], w[35], offset);
      w[60] = hc_bytealign (w[33], w[34], offset);
      w[59] = hc_bytealign (w[32], w[33], offset);
      w[58] = hc_bytealign (w[31], w[32], offset);
      w[57] = hc_bytealign (w[30], w[31], offset);
      w[56] = hc_bytealign (w[29], w[30], offset);
      w[55] = hc_bytealign (w[28], w[29], offset);
      w[54] = hc_bytealign (w[27], w[28], offset);
      w[53] = hc_bytealign (w[26], w[27], offset);
      w[52] = hc_bytealign (w[25], w[26], offset);
      w[51] = hc_bytealign (w[24], w[25], offset);
      w[50] = hc_bytealign (w[23], w[24], offset);
      w[49] = hc_bytealign (w[22], w[23], offset);
      w[48] = hc_bytealign (w[21], w[22], offset);
      w[47] = hc_bytealign (w[20], w[21], offset);
      w[46] = hc_bytealign (w[19], w[20], offset);
      w[45] = hc_bytealign (w[18], w[19], offset);
      w[44] = hc_bytealign (w[17], w[18], offset);
      w[43] = hc_bytealign (w[16], w[17], offset);
      w[42] = hc_bytealign (w[15], w[16], offset);
      w[41] = hc_bytealign (w[14], w[15], offset);
      w[40] = hc_bytealign (w[13], w[14], offset);
      w[39] = hc_bytealign (w[12], w[13], offset);
      w[38] = hc_bytealign (w[11], w[12], offset);
      w[37] = hc_bytealign (w[10], w[11], offset);
      w[36] = hc_bytealign (w[ 9], w[10], offset);
      w[35] = hc_bytealign (w[ 8], w[ 9], offset);
      w[34] = hc_bytealign (w[ 7], w[ 8], offset);
      w[33] = hc_bytealign (w[ 6], w[ 7], offset);
      w[32] = hc_bytealign (w[ 5], w[ 6], offset);
      w[31] = hc_bytealign (w[ 4], w[ 5], offset);
      w[30] = hc_bytealign (w[ 3], w[ 4], offset);
      w[29] = hc_bytealign (w[ 2], w[ 3], offset);
      w[28] = hc_bytealign (w[ 1], w[ 2], offset);
      w[27] = hc_bytealign (w[ 0], w[ 1], offset);
      w[26] = hc_bytealign (    0, w[ 0], offset);
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 27:
      w[63] = hc_bytealign (w[35], w[36], offset);
      w[62] = hc_bytealign (w[34], w[35], offset);
      w[61] = hc_bytealign (w[33], w[34], offset);
      w[60] = hc_bytealign (w[32], w[33], offset);
      w[59] = hc_bytealign (w[31], w[32], offset);
      w[58] = hc_bytealign (w[30], w[31], offset);
      w[57] = hc_bytealign (w[29], w[30], offset);
      w[56] = hc_bytealign (w[28], w[29], offset);
      w[55] = hc_bytealign (w[27], w[28], offset);
      w[54] = hc_bytealign (w[26], w[27], offset);
      w[53] = hc_bytealign (w[25], w[26], offset);
      w[52] = hc_bytealign (w[24], w[25], offset);
      w[51] = hc_bytealign (w[23], w[24], offset);
      w[50] = hc_bytealign (w[22], w[23], offset);
      w[49] = hc_bytealign (w[21], w[22], offset);
      w[48] = hc_bytealign (w[20], w[21], offset);
      w[47] = hc_bytealign (w[19], w[20], offset);
      w[46] = hc_bytealign (w[18], w[19], offset);
      w[45] = hc_bytealign (w[17], w[18], offset);
      w[44] = hc_bytealign (w[16], w[17], offset);
      w[43] = hc_bytealign (w[15], w[16], offset);
      w[42] = hc_bytealign (w[14], w[15], offset);
      w[41] = hc_bytealign (w[13], w[14], offset);
      w[40] = hc_bytealign (w[12], w[13], offset);
      w[39] = hc_bytealign (w[11], w[12], offset);
      w[38] = hc_bytealign (w[10], w[11], offset);
      w[37] = hc_bytealign (w[ 9], w[10], offset);
      w[36] = hc_bytealign (w[ 8], w[ 9], offset);
      w[35] = hc_bytealign (w[ 7], w[ 8], offset);
      w[34] = hc_bytealign (w[ 6], w[ 7], offset);
      w[33] = hc_bytealign (w[ 5], w[ 6], offset);
      w[32] = hc_bytealign (w[ 4], w[ 5], offset);
      w[31] = hc_bytealign (w[ 3], w[ 4], offset);
      w[30] = hc_bytealign (w[ 2], w[ 3], offset);
      w[29] = hc_bytealign (w[ 1], w[ 2], offset);
      w[28] = hc_bytealign (w[ 0], w[ 1], offset);
      w[27] = hc_bytealign (    0, w[ 0], offset);
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 28:
      w[63] = hc_bytealign (w[34], w[35], offset);
      w[62] = hc_bytealign (w[33], w[34], offset);
      w[61] = hc_bytealign (w[32], w[33], offset);
      w[60] = hc_bytealign (w[31], w[32], offset);
      w[59] = hc_bytealign (w[30], w[31], offset);
      w[58] = hc_bytealign (w[29], w[30], offset);
      w[57] = hc_bytealign (w[28], w[29], offset);
      w[56] = hc_bytealign (w[27], w[28], offset);
      w[55] = hc_bytealign (w[26], w[27], offset);
      w[54] = hc_bytealign (w[25], w[26], offset);
      w[53] = hc_bytealign (w[24], w[25], offset);
      w[52] = hc_bytealign (w[23], w[24], offset);
      w[51] = hc_bytealign (w[22], w[23], offset);
      w[50] = hc_bytealign (w[21], w[22], offset);
      w[49] = hc_bytealign (w[20], w[21], offset);
      w[48] = hc_bytealign (w[19], w[20], offset);
      w[47] = hc_bytealign (w[18], w[19], offset);
      w[46] = hc_bytealign (w[17], w[18], offset);
      w[45] = hc_bytealign (w[16], w[17], offset);
      w[44] = hc_bytealign (w[15], w[16], offset);
      w[43] = hc_bytealign (w[14], w[15], offset);
      w[42] = hc_bytealign (w[13], w[14], offset);
      w[41] = hc_bytealign (w[12], w[13], offset);
      w[40] = hc_bytealign (w[11], w[12], offset);
      w[39] = hc_bytealign (w[10], w[11], offset);
      w[38] = hc_bytealign (w[ 9], w[10], offset);
      w[37] = hc_bytealign (w[ 8], w[ 9], offset);
      w[36] = hc_bytealign (w[ 7], w[ 8], offset);
      w[35] = hc_bytealign (w[ 6], w[ 7], offset);
      w[34] = hc_bytealign (w[ 5], w[ 6], offset);
      w[33] = hc_bytealign (w[ 4], w[ 5], offset);
      w[32] = hc_bytealign (w[ 3], w[ 4], offset);
      w[31] = hc_bytealign (w[ 2], w[ 3], offset);
      w[30] = hc_bytealign (w[ 1], w[ 2], offset);
      w[29] = hc_bytealign (w[ 0], w[ 1], offset);
      w[28] = hc_bytealign (    0, w[ 0], offset);
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 29:
      w[63] = hc_bytealign (w[33], w[34], offset);
      w[62] = hc_bytealign (w[32], w[33], offset);
      w[61] = hc_bytealign (w[31], w[32], offset);
      w[60] = hc_bytealign (w[30], w[31], offset);
      w[59] = hc_bytealign (w[29], w[30], offset);
      w[58] = hc_bytealign (w[28], w[29], offset);
      w[57] = hc_bytealign (w[27], w[28], offset);
      w[56] = hc_bytealign (w[26], w[27], offset);
      w[55] = hc_bytealign (w[25], w[26], offset);
      w[54] = hc_bytealign (w[24], w[25], offset);
      w[53] = hc_bytealign (w[23], w[24], offset);
      w[52] = hc_bytealign (w[22], w[23], offset);
      w[51] = hc_bytealign (w[21], w[22], offset);
      w[50] = hc_bytealign (w[20], w[21], offset);
      w[49] = hc_bytealign (w[19], w[20], offset);
      w[48] = hc_bytealign (w[18], w[19], offset);
      w[47] = hc_bytealign (w[17], w[18], offset);
      w[46] = hc_bytealign (w[16], w[17], offset);
      w[45] = hc_bytealign (w[15], w[16], offset);
      w[44] = hc_bytealign (w[14], w[15], offset);
      w[43] = hc_bytealign (w[13], w[14], offset);
      w[42] = hc_bytealign (w[12], w[13], offset);
      w[41] = hc_bytealign (w[11], w[12], offset);
      w[40] = hc_bytealign (w[10], w[11], offset);
      w[39] = hc_bytealign (w[ 9], w[10], offset);
      w[38] = hc_bytealign (w[ 8], w[ 9], offset);
      w[37] = hc_bytealign (w[ 7], w[ 8], offset);
      w[36] = hc_bytealign (w[ 6], w[ 7], offset);
      w[35] = hc_bytealign (w[ 5], w[ 6], offset);
      w[34] = hc_bytealign (w[ 4], w[ 5], offset);
      w[33] = hc_bytealign (w[ 3], w[ 4], offset);
      w[32] = hc_bytealign (w[ 2], w[ 3], offset);
      w[31] = hc_bytealign (w[ 1], w[ 2], offset);
      w[30] = hc_bytealign (w[ 0], w[ 1], offset);
      w[29] = hc_bytealign (    0, w[ 0], offset);
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 30:
      w[63] = hc_bytealign (w[32], w[33], offset);
      w[62] = hc_bytealign (w[31], w[32], offset);
      w[61] = hc_bytealign (w[30], w[31], offset);
      w[60] = hc_bytealign (w[29], w[30], offset);
      w[59] = hc_bytealign (w[28], w[29], offset);
      w[58] = hc_bytealign (w[27], w[28], offset);
      w[57] = hc_bytealign (w[26], w[27], offset);
      w[56] = hc_bytealign (w[25], w[26], offset);
      w[55] = hc_bytealign (w[24], w[25], offset);
      w[54] = hc_bytealign (w[23], w[24], offset);
      w[53] = hc_bytealign (w[22], w[23], offset);
      w[52] = hc_bytealign (w[21], w[22], offset);
      w[51] = hc_bytealign (w[20], w[21], offset);
      w[50] = hc_bytealign (w[19], w[20], offset);
      w[49] = hc_bytealign (w[18], w[19], offset);
      w[48] = hc_bytealign (w[17], w[18], offset);
      w[47] = hc_bytealign (w[16], w[17], offset);
      w[46] = hc_bytealign (w[15], w[16], offset);
      w[45] = hc_bytealign (w[14], w[15], offset);
      w[44] = hc_bytealign (w[13], w[14], offset);
      w[43] = hc_bytealign (w[12], w[13], offset);
      w[42] = hc_bytealign (w[11], w[12], offset);
      w[41] = hc_bytealign (w[10], w[11], offset);
      w[40] = hc_bytealign (w[ 9], w[10], offset);
      w[39] = hc_bytealign (w[ 8], w[ 9], offset);
      w[38] = hc_bytealign (w[ 7], w[ 8], offset);
      w[37] = hc_bytealign (w[ 6], w[ 7], offset);
      w[36] = hc_bytealign (w[ 5], w[ 6], offset);
      w[35] = hc_bytealign (w[ 4], w[ 5], offset);
      w[34] = hc_bytealign (w[ 3], w[ 4], offset);
      w[33] = hc_bytealign (w[ 2], w[ 3], offset);
      w[32] = hc_bytealign (w[ 1], w[ 2], offset);
      w[31] = hc_bytealign (w[ 0], w[ 1], offset);
      w[30] = hc_bytealign (    0, w[ 0], offset);
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 31:
      w[63] = hc_bytealign (w[31], w[32], offset);
      w[62] = hc_bytealign (w[30], w[31], offset);
      w[61] = hc_bytealign (w[29], w[30], offset);
      w[60] = hc_bytealign (w[28], w[29], offset);
      w[59] = hc_bytealign (w[27], w[28], offset);
      w[58] = hc_bytealign (w[26], w[27], offset);
      w[57] = hc_bytealign (w[25], w[26], offset);
      w[56] = hc_bytealign (w[24], w[25], offset);
      w[55] = hc_bytealign (w[23], w[24], offset);
      w[54] = hc_bytealign (w[22], w[23], offset);
      w[53] = hc_bytealign (w[21], w[22], offset);
      w[52] = hc_bytealign (w[20], w[21], offset);
      w[51] = hc_bytealign (w[19], w[20], offset);
      w[50] = hc_bytealign (w[18], w[19], offset);
      w[49] = hc_bytealign (w[17], w[18], offset);
      w[48] = hc_bytealign (w[16], w[17], offset);
      w[47] = hc_bytealign (w[15], w[16], offset);
      w[46] = hc_bytealign (w[14], w[15], offset);
      w[45] = hc_bytealign (w[13], w[14], offset);
      w[44] = hc_bytealign (w[12], w[13], offset);
      w[43] = hc_bytealign (w[11], w[12], offset);
      w[42] = hc_bytealign (w[10], w[11], offset);
      w[41] = hc_bytealign (w[ 9], w[10], offset);
      w[40] = hc_bytealign (w[ 8], w[ 9], offset);
      w[39] = hc_bytealign (w[ 7], w[ 8], offset);
      w[38] = hc_bytealign (w[ 6], w[ 7], offset);
      w[37] = hc_bytealign (w[ 5], w[ 6], offset);
      w[36] = hc_bytealign (w[ 4], w[ 5], offset);
      w[35] = hc_bytealign (w[ 3], w[ 4], offset);
      w[34] = hc_bytealign (w[ 2], w[ 3], offset);
      w[33] = hc_bytealign (w[ 1], w[ 2], offset);
      w[32] = hc_bytealign (w[ 0], w[ 1], offset);
      w[31] = hc_bytealign (    0, w[ 0], offset);
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 32:
      w[63] = hc_bytealign (w[30], w[31], offset);
      w[62] = hc_bytealign (w[29], w[30], offset);
      w[61] = hc_bytealign (w[28], w[29], offset);
      w[60] = hc_bytealign (w[27], w[28], offset);
      w[59] = hc_bytealign (w[26], w[27], offset);
      w[58] = hc_bytealign (w[25], w[26], offset);
      w[57] = hc_bytealign (w[24], w[25], offset);
      w[56] = hc_bytealign (w[23], w[24], offset);
      w[55] = hc_bytealign (w[22], w[23], offset);
      w[54] = hc_bytealign (w[21], w[22], offset);
      w[53] = hc_bytealign (w[20], w[21], offset);
      w[52] = hc_bytealign (w[19], w[20], offset);
      w[51] = hc_bytealign (w[18], w[19], offset);
      w[50] = hc_bytealign (w[17], w[18], offset);
      w[49] = hc_bytealign (w[16], w[17], offset);
      w[48] = hc_bytealign (w[15], w[16], offset);
      w[47] = hc_bytealign (w[14], w[15], offset);
      w[46] = hc_bytealign (w[13], w[14], offset);
      w[45] = hc_bytealign (w[12], w[13], offset);
      w[44] = hc_bytealign (w[11], w[12], offset);
      w[43] = hc_bytealign (w[10], w[11], offset);
      w[42] = hc_bytealign (w[ 9], w[10], offset);
      w[41] = hc_bytealign (w[ 8], w[ 9], offset);
      w[40] = hc_bytealign (w[ 7], w[ 8], offset);
      w[39] = hc_bytealign (w[ 6], w[ 7], offset);
      w[38] = hc_bytealign (w[ 5], w[ 6], offset);
      w[37] = hc_bytealign (w[ 4], w[ 5], offset);
      w[36] = hc_bytealign (w[ 3], w[ 4], offset);
      w[35] = hc_bytealign (w[ 2], w[ 3], offset);
      w[34] = hc_bytealign (w[ 1], w[ 2], offset);
      w[33] = hc_bytealign (w[ 0], w[ 1], offset);
      w[32] = hc_bytealign (    0, w[ 0], offset);
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 33:
      w[63] = hc_bytealign (w[29], w[30], offset);
      w[62] = hc_bytealign (w[28], w[29], offset);
      w[61] = hc_bytealign (w[27], w[28], offset);
      w[60] = hc_bytealign (w[26], w[27], offset);
      w[59] = hc_bytealign (w[25], w[26], offset);
      w[58] = hc_bytealign (w[24], w[25], offset);
      w[57] = hc_bytealign (w[23], w[24], offset);
      w[56] = hc_bytealign (w[22], w[23], offset);
      w[55] = hc_bytealign (w[21], w[22], offset);
      w[54] = hc_bytealign (w[20], w[21], offset);
      w[53] = hc_bytealign (w[19], w[20], offset);
      w[52] = hc_bytealign (w[18], w[19], offset);
      w[51] = hc_bytealign (w[17], w[18], offset);
      w[50] = hc_bytealign (w[16], w[17], offset);
      w[49] = hc_bytealign (w[15], w[16], offset);
      w[48] = hc_bytealign (w[14], w[15], offset);
      w[47] = hc_bytealign (w[13], w[14], offset);
      w[46] = hc_bytealign (w[12], w[13], offset);
      w[45] = hc_bytealign (w[11], w[12], offset);
      w[44] = hc_bytealign (w[10], w[11], offset);
      w[43] = hc_bytealign (w[ 9], w[10], offset);
      w[42] = hc_bytealign (w[ 8], w[ 9], offset);
      w[41] = hc_bytealign (w[ 7], w[ 8], offset);
      w[40] = hc_bytealign (w[ 6], w[ 7], offset);
      w[39] = hc_bytealign (w[ 5], w[ 6], offset);
      w[38] = hc_bytealign (w[ 4], w[ 5], offset);
      w[37] = hc_bytealign (w[ 3], w[ 4], offset);
      w[36] = hc_bytealign (w[ 2], w[ 3], offset);
      w[35] = hc_bytealign (w[ 1], w[ 2], offset);
      w[34] = hc_bytealign (w[ 0], w[ 1], offset);
      w[33] = hc_bytealign (    0, w[ 0], offset);
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 34:
      w[63] = hc_bytealign (w[28], w[29], offset);
      w[62] = hc_bytealign (w[27], w[28], offset);
      w[61] = hc_bytealign (w[26], w[27], offset);
      w[60] = hc_bytealign (w[25], w[26], offset);
      w[59] = hc_bytealign (w[24], w[25], offset);
      w[58] = hc_bytealign (w[23], w[24], offset);
      w[57] = hc_bytealign (w[22], w[23], offset);
      w[56] = hc_bytealign (w[21], w[22], offset);
      w[55] = hc_bytealign (w[20], w[21], offset);
      w[54] = hc_bytealign (w[19], w[20], offset);
      w[53] = hc_bytealign (w[18], w[19], offset);
      w[52] = hc_bytealign (w[17], w[18], offset);
      w[51] = hc_bytealign (w[16], w[17], offset);
      w[50] = hc_bytealign (w[15], w[16], offset);
      w[49] = hc_bytealign (w[14], w[15], offset);
      w[48] = hc_bytealign (w[13], w[14], offset);
      w[47] = hc_bytealign (w[12], w[13], offset);
      w[46] = hc_bytealign (w[11], w[12], offset);
      w[45] = hc_bytealign (w[10], w[11], offset);
      w[44] = hc_bytealign (w[ 9], w[10], offset);
      w[43] = hc_bytealign (w[ 8], w[ 9], offset);
      w[42] = hc_bytealign (w[ 7], w[ 8], offset);
      w[41] = hc_bytealign (w[ 6], w[ 7], offset);
      w[40] = hc_bytealign (w[ 5], w[ 6], offset);
      w[39] = hc_bytealign (w[ 4], w[ 5], offset);
      w[38] = hc_bytealign (w[ 3], w[ 4], offset);
      w[37] = hc_bytealign (w[ 2], w[ 3], offset);
      w[36] = hc_bytealign (w[ 1], w[ 2], offset);
      w[35] = hc_bytealign (w[ 0], w[ 1], offset);
      w[34] = hc_bytealign (    0, w[ 0], offset);
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 35:
      w[63] = hc_bytealign (w[27], w[28], offset);
      w[62] = hc_bytealign (w[26], w[27], offset);
      w[61] = hc_bytealign (w[25], w[26], offset);
      w[60] = hc_bytealign (w[24], w[25], offset);
      w[59] = hc_bytealign (w[23], w[24], offset);
      w[58] = hc_bytealign (w[22], w[23], offset);
      w[57] = hc_bytealign (w[21], w[22], offset);
      w[56] = hc_bytealign (w[20], w[21], offset);
      w[55] = hc_bytealign (w[19], w[20], offset);
      w[54] = hc_bytealign (w[18], w[19], offset);
      w[53] = hc_bytealign (w[17], w[18], offset);
      w[52] = hc_bytealign (w[16], w[17], offset);
      w[51] = hc_bytealign (w[15], w[16], offset);
      w[50] = hc_bytealign (w[14], w[15], offset);
      w[49] = hc_bytealign (w[13], w[14], offset);
      w[48] = hc_bytealign (w[12], w[13], offset);
      w[47] = hc_bytealign (w[11], w[12], offset);
      w[46] = hc_bytealign (w[10], w[11], offset);
      w[45] = hc_bytealign (w[ 9], w[10], offset);
      w[44] = hc_bytealign (w[ 8], w[ 9], offset);
      w[43] = hc_bytealign (w[ 7], w[ 8], offset);
      w[42] = hc_bytealign (w[ 6], w[ 7], offset);
      w[41] = hc_bytealign (w[ 5], w[ 6], offset);
      w[40] = hc_bytealign (w[ 4], w[ 5], offset);
      w[39] = hc_bytealign (w[ 3], w[ 4], offset);
      w[38] = hc_bytealign (w[ 2], w[ 3], offset);
      w[37] = hc_bytealign (w[ 1], w[ 2], offset);
      w[36] = hc_bytealign (w[ 0], w[ 1], offset);
      w[35] = hc_bytealign (    0, w[ 0], offset);
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 36:
      w[63] = hc_bytealign (w[26], w[27], offset);
      w[62] = hc_bytealign (w[25], w[26], offset);
      w[61] = hc_bytealign (w[24], w[25], offset);
      w[60] = hc_bytealign (w[23], w[24], offset);
      w[59] = hc_bytealign (w[22], w[23], offset);
      w[58] = hc_bytealign (w[21], w[22], offset);
      w[57] = hc_bytealign (w[20], w[21], offset);
      w[56] = hc_bytealign (w[19], w[20], offset);
      w[55] = hc_bytealign (w[18], w[19], offset);
      w[54] = hc_bytealign (w[17], w[18], offset);
      w[53] = hc_bytealign (w[16], w[17], offset);
      w[52] = hc_bytealign (w[15], w[16], offset);
      w[51] = hc_bytealign (w[14], w[15], offset);
      w[50] = hc_bytealign (w[13], w[14], offset);
      w[49] = hc_bytealign (w[12], w[13], offset);
      w[48] = hc_bytealign (w[11], w[12], offset);
      w[47] = hc_bytealign (w[10], w[11], offset);
      w[46] = hc_bytealign (w[ 9], w[10], offset);
      w[45] = hc_bytealign (w[ 8], w[ 9], offset);
      w[44] = hc_bytealign (w[ 7], w[ 8], offset);
      w[43] = hc_bytealign (w[ 6], w[ 7], offset);
      w[42] = hc_bytealign (w[ 5], w[ 6], offset);
      w[41] = hc_bytealign (w[ 4], w[ 5], offset);
      w[40] = hc_bytealign (w[ 3], w[ 4], offset);
      w[39] = hc_bytealign (w[ 2], w[ 3], offset);
      w[38] = hc_bytealign (w[ 1], w[ 2], offset);
      w[37] = hc_bytealign (w[ 0], w[ 1], offset);
      w[36] = hc_bytealign (    0, w[ 0], offset);
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 37:
      w[63] = hc_bytealign (w[25], w[26], offset);
      w[62] = hc_bytealign (w[24], w[25], offset);
      w[61] = hc_bytealign (w[23], w[24], offset);
      w[60] = hc_bytealign (w[22], w[23], offset);
      w[59] = hc_bytealign (w[21], w[22], offset);
      w[58] = hc_bytealign (w[20], w[21], offset);
      w[57] = hc_bytealign (w[19], w[20], offset);
      w[56] = hc_bytealign (w[18], w[19], offset);
      w[55] = hc_bytealign (w[17], w[18], offset);
      w[54] = hc_bytealign (w[16], w[17], offset);
      w[53] = hc_bytealign (w[15], w[16], offset);
      w[52] = hc_bytealign (w[14], w[15], offset);
      w[51] = hc_bytealign (w[13], w[14], offset);
      w[50] = hc_bytealign (w[12], w[13], offset);
      w[49] = hc_bytealign (w[11], w[12], offset);
      w[48] = hc_bytealign (w[10], w[11], offset);
      w[47] = hc_bytealign (w[ 9], w[10], offset);
      w[46] = hc_bytealign (w[ 8], w[ 9], offset);
      w[45] = hc_bytealign (w[ 7], w[ 8], offset);
      w[44] = hc_bytealign (w[ 6], w[ 7], offset);
      w[43] = hc_bytealign (w[ 5], w[ 6], offset);
      w[42] = hc_bytealign (w[ 4], w[ 5], offset);
      w[41] = hc_bytealign (w[ 3], w[ 4], offset);
      w[40] = hc_bytealign (w[ 2], w[ 3], offset);
      w[39] = hc_bytealign (w[ 1], w[ 2], offset);
      w[38] = hc_bytealign (w[ 0], w[ 1], offset);
      w[37] = hc_bytealign (    0, w[ 0], offset);
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 38:
      w[63] = hc_bytealign (w[24], w[25], offset);
      w[62] = hc_bytealign (w[23], w[24], offset);
      w[61] = hc_bytealign (w[22], w[23], offset);
      w[60] = hc_bytealign (w[21], w[22], offset);
      w[59] = hc_bytealign (w[20], w[21], offset);
      w[58] = hc_bytealign (w[19], w[20], offset);
      w[57] = hc_bytealign (w[18], w[19], offset);
      w[56] = hc_bytealign (w[17], w[18], offset);
      w[55] = hc_bytealign (w[16], w[17], offset);
      w[54] = hc_bytealign (w[15], w[16], offset);
      w[53] = hc_bytealign (w[14], w[15], offset);
      w[52] = hc_bytealign (w[13], w[14], offset);
      w[51] = hc_bytealign (w[12], w[13], offset);
      w[50] = hc_bytealign (w[11], w[12], offset);
      w[49] = hc_bytealign (w[10], w[11], offset);
      w[48] = hc_bytealign (w[ 9], w[10], offset);
      w[47] = hc_bytealign (w[ 8], w[ 9], offset);
      w[46] = hc_bytealign (w[ 7], w[ 8], offset);
      w[45] = hc_bytealign (w[ 6], w[ 7], offset);
      w[44] = hc_bytealign (w[ 5], w[ 6], offset);
      w[43] = hc_bytealign (w[ 4], w[ 5], offset);
      w[42] = hc_bytealign (w[ 3], w[ 4], offset);
      w[41] = hc_bytealign (w[ 2], w[ 3], offset);
      w[40] = hc_bytealign (w[ 1], w[ 2], offset);
      w[39] = hc_bytealign (w[ 0], w[ 1], offset);
      w[38] = hc_bytealign (    0, w[ 0], offset);
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 39:
      w[63] = hc_bytealign (w[23], w[24], offset);
      w[62] = hc_bytealign (w[22], w[23], offset);
      w[61] = hc_bytealign (w[21], w[22], offset);
      w[60] = hc_bytealign (w[20], w[21], offset);
      w[59] = hc_bytealign (w[19], w[20], offset);
      w[58] = hc_bytealign (w[18], w[19], offset);
      w[57] = hc_bytealign (w[17], w[18], offset);
      w[56] = hc_bytealign (w[16], w[17], offset);
      w[55] = hc_bytealign (w[15], w[16], offset);
      w[54] = hc_bytealign (w[14], w[15], offset);
      w[53] = hc_bytealign (w[13], w[14], offset);
      w[52] = hc_bytealign (w[12], w[13], offset);
      w[51] = hc_bytealign (w[11], w[12], offset);
      w[50] = hc_bytealign (w[10], w[11], offset);
      w[49] = hc_bytealign (w[ 9], w[10], offset);
      w[48] = hc_bytealign (w[ 8], w[ 9], offset);
      w[47] = hc_bytealign (w[ 7], w[ 8], offset);
      w[46] = hc_bytealign (w[ 6], w[ 7], offset);
      w[45] = hc_bytealign (w[ 5], w[ 6], offset);
      w[44] = hc_bytealign (w[ 4], w[ 5], offset);
      w[43] = hc_bytealign (w[ 3], w[ 4], offset);
      w[42] = hc_bytealign (w[ 2], w[ 3], offset);
      w[41] = hc_bytealign (w[ 1], w[ 2], offset);
      w[40] = hc_bytealign (w[ 0], w[ 1], offset);
      w[39] = hc_bytealign (    0, w[ 0], offset);
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 40:
      w[63] = hc_bytealign (w[22], w[23], offset);
      w[62] = hc_bytealign (w[21], w[22], offset);
      w[61] = hc_bytealign (w[20], w[21], offset);
      w[60] = hc_bytealign (w[19], w[20], offset);
      w[59] = hc_bytealign (w[18], w[19], offset);
      w[58] = hc_bytealign (w[17], w[18], offset);
      w[57] = hc_bytealign (w[16], w[17], offset);
      w[56] = hc_bytealign (w[15], w[16], offset);
      w[55] = hc_bytealign (w[14], w[15], offset);
      w[54] = hc_bytealign (w[13], w[14], offset);
      w[53] = hc_bytealign (w[12], w[13], offset);
      w[52] = hc_bytealign (w[11], w[12], offset);
      w[51] = hc_bytealign (w[10], w[11], offset);
      w[50] = hc_bytealign (w[ 9], w[10], offset);
      w[49] = hc_bytealign (w[ 8], w[ 9], offset);
      w[48] = hc_bytealign (w[ 7], w[ 8], offset);
      w[47] = hc_bytealign (w[ 6], w[ 7], offset);
      w[46] = hc_bytealign (w[ 5], w[ 6], offset);
      w[45] = hc_bytealign (w[ 4], w[ 5], offset);
      w[44] = hc_bytealign (w[ 3], w[ 4], offset);
      w[43] = hc_bytealign (w[ 2], w[ 3], offset);
      w[42] = hc_bytealign (w[ 1], w[ 2], offset);
      w[41] = hc_bytealign (w[ 0], w[ 1], offset);
      w[40] = hc_bytealign (    0, w[ 0], offset);
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 41:
      w[63] = hc_bytealign (w[21], w[22], offset);
      w[62] = hc_bytealign (w[20], w[21], offset);
      w[61] = hc_bytealign (w[19], w[20], offset);
      w[60] = hc_bytealign (w[18], w[19], offset);
      w[59] = hc_bytealign (w[17], w[18], offset);
      w[58] = hc_bytealign (w[16], w[17], offset);
      w[57] = hc_bytealign (w[15], w[16], offset);
      w[56] = hc_bytealign (w[14], w[15], offset);
      w[55] = hc_bytealign (w[13], w[14], offset);
      w[54] = hc_bytealign (w[12], w[13], offset);
      w[53] = hc_bytealign (w[11], w[12], offset);
      w[52] = hc_bytealign (w[10], w[11], offset);
      w[51] = hc_bytealign (w[ 9], w[10], offset);
      w[50] = hc_bytealign (w[ 8], w[ 9], offset);
      w[49] = hc_bytealign (w[ 7], w[ 8], offset);
      w[48] = hc_bytealign (w[ 6], w[ 7], offset);
      w[47] = hc_bytealign (w[ 5], w[ 6], offset);
      w[46] = hc_bytealign (w[ 4], w[ 5], offset);
      w[45] = hc_bytealign (w[ 3], w[ 4], offset);
      w[44] = hc_bytealign (w[ 2], w[ 3], offset);
      w[43] = hc_bytealign (w[ 1], w[ 2], offset);
      w[42] = hc_bytealign (w[ 0], w[ 1], offset);
      w[41] = hc_bytealign (    0, w[ 0], offset);
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 42:
      w[63] = hc_bytealign (w[20], w[21], offset);
      w[62] = hc_bytealign (w[19], w[20], offset);
      w[61] = hc_bytealign (w[18], w[19], offset);
      w[60] = hc_bytealign (w[17], w[18], offset);
      w[59] = hc_bytealign (w[16], w[17], offset);
      w[58] = hc_bytealign (w[15], w[16], offset);
      w[57] = hc_bytealign (w[14], w[15], offset);
      w[56] = hc_bytealign (w[13], w[14], offset);
      w[55] = hc_bytealign (w[12], w[13], offset);
      w[54] = hc_bytealign (w[11], w[12], offset);
      w[53] = hc_bytealign (w[10], w[11], offset);
      w[52] = hc_bytealign (w[ 9], w[10], offset);
      w[51] = hc_bytealign (w[ 8], w[ 9], offset);
      w[50] = hc_bytealign (w[ 7], w[ 8], offset);
      w[49] = hc_bytealign (w[ 6], w[ 7], offset);
      w[48] = hc_bytealign (w[ 5], w[ 6], offset);
      w[47] = hc_bytealign (w[ 4], w[ 5], offset);
      w[46] = hc_bytealign (w[ 3], w[ 4], offset);
      w[45] = hc_bytealign (w[ 2], w[ 3], offset);
      w[44] = hc_bytealign (w[ 1], w[ 2], offset);
      w[43] = hc_bytealign (w[ 0], w[ 1], offset);
      w[42] = hc_bytealign (    0, w[ 0], offset);
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 43:
      w[63] = hc_bytealign (w[19], w[20], offset);
      w[62] = hc_bytealign (w[18], w[19], offset);
      w[61] = hc_bytealign (w[17], w[18], offset);
      w[60] = hc_bytealign (w[16], w[17], offset);
      w[59] = hc_bytealign (w[15], w[16], offset);
      w[58] = hc_bytealign (w[14], w[15], offset);
      w[57] = hc_bytealign (w[13], w[14], offset);
      w[56] = hc_bytealign (w[12], w[13], offset);
      w[55] = hc_bytealign (w[11], w[12], offset);
      w[54] = hc_bytealign (w[10], w[11], offset);
      w[53] = hc_bytealign (w[ 9], w[10], offset);
      w[52] = hc_bytealign (w[ 8], w[ 9], offset);
      w[51] = hc_bytealign (w[ 7], w[ 8], offset);
      w[50] = hc_bytealign (w[ 6], w[ 7], offset);
      w[49] = hc_bytealign (w[ 5], w[ 6], offset);
      w[48] = hc_bytealign (w[ 4], w[ 5], offset);
      w[47] = hc_bytealign (w[ 3], w[ 4], offset);
      w[46] = hc_bytealign (w[ 2], w[ 3], offset);
      w[45] = hc_bytealign (w[ 1], w[ 2], offset);
      w[44] = hc_bytealign (w[ 0], w[ 1], offset);
      w[43] = hc_bytealign (    0, w[ 0], offset);
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 44:
      w[63] = hc_bytealign (w[18], w[19], offset);
      w[62] = hc_bytealign (w[17], w[18], offset);
      w[61] = hc_bytealign (w[16], w[17], offset);
      w[60] = hc_bytealign (w[15], w[16], offset);
      w[59] = hc_bytealign (w[14], w[15], offset);
      w[58] = hc_bytealign (w[13], w[14], offset);
      w[57] = hc_bytealign (w[12], w[13], offset);
      w[56] = hc_bytealign (w[11], w[12], offset);
      w[55] = hc_bytealign (w[10], w[11], offset);
      w[54] = hc_bytealign (w[ 9], w[10], offset);
      w[53] = hc_bytealign (w[ 8], w[ 9], offset);
      w[52] = hc_bytealign (w[ 7], w[ 8], offset);
      w[51] = hc_bytealign (w[ 6], w[ 7], offset);
      w[50] = hc_bytealign (w[ 5], w[ 6], offset);
      w[49] = hc_bytealign (w[ 4], w[ 5], offset);
      w[48] = hc_bytealign (w[ 3], w[ 4], offset);
      w[47] = hc_bytealign (w[ 2], w[ 3], offset);
      w[46] = hc_bytealign (w[ 1], w[ 2], offset);
      w[45] = hc_bytealign (w[ 0], w[ 1], offset);
      w[44] = hc_bytealign (    0, w[ 0], offset);
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 45:
      w[63] = hc_bytealign (w[17], w[18], offset);
      w[62] = hc_bytealign (w[16], w[17], offset);
      w[61] = hc_bytealign (w[15], w[16], offset);
      w[60] = hc_bytealign (w[14], w[15], offset);
      w[59] = hc_bytealign (w[13], w[14], offset);
      w[58] = hc_bytealign (w[12], w[13], offset);
      w[57] = hc_bytealign (w[11], w[12], offset);
      w[56] = hc_bytealign (w[10], w[11], offset);
      w[55] = hc_bytealign (w[ 9], w[10], offset);
      w[54] = hc_bytealign (w[ 8], w[ 9], offset);
      w[53] = hc_bytealign (w[ 7], w[ 8], offset);
      w[52] = hc_bytealign (w[ 6], w[ 7], offset);
      w[51] = hc_bytealign (w[ 5], w[ 6], offset);
      w[50] = hc_bytealign (w[ 4], w[ 5], offset);
      w[49] = hc_bytealign (w[ 3], w[ 4], offset);
      w[48] = hc_bytealign (w[ 2], w[ 3], offset);
      w[47] = hc_bytealign (w[ 1], w[ 2], offset);
      w[46] = hc_bytealign (w[ 0], w[ 1], offset);
      w[45] = hc_bytealign (    0, w[ 0], offset);
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 46:
      w[63] = hc_bytealign (w[16], w[17], offset);
      w[62] = hc_bytealign (w[15], w[16], offset);
      w[61] = hc_bytealign (w[14], w[15], offset);
      w[60] = hc_bytealign (w[13], w[14], offset);
      w[59] = hc_bytealign (w[12], w[13], offset);
      w[58] = hc_bytealign (w[11], w[12], offset);
      w[57] = hc_bytealign (w[10], w[11], offset);
      w[56] = hc_bytealign (w[ 9], w[10], offset);
      w[55] = hc_bytealign (w[ 8], w[ 9], offset);
      w[54] = hc_bytealign (w[ 7], w[ 8], offset);
      w[53] = hc_bytealign (w[ 6], w[ 7], offset);
      w[52] = hc_bytealign (w[ 5], w[ 6], offset);
      w[51] = hc_bytealign (w[ 4], w[ 5], offset);
      w[50] = hc_bytealign (w[ 3], w[ 4], offset);
      w[49] = hc_bytealign (w[ 2], w[ 3], offset);
      w[48] = hc_bytealign (w[ 1], w[ 2], offset);
      w[47] = hc_bytealign (w[ 0], w[ 1], offset);
      w[46] = hc_bytealign (    0, w[ 0], offset);
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 47:
      w[63] = hc_bytealign (w[15], w[16], offset);
      w[62] = hc_bytealign (w[14], w[15], offset);
      w[61] = hc_bytealign (w[13], w[14], offset);
      w[60] = hc_bytealign (w[12], w[13], offset);
      w[59] = hc_bytealign (w[11], w[12], offset);
      w[58] = hc_bytealign (w[10], w[11], offset);
      w[57] = hc_bytealign (w[ 9], w[10], offset);
      w[56] = hc_bytealign (w[ 8], w[ 9], offset);
      w[55] = hc_bytealign (w[ 7], w[ 8], offset);
      w[54] = hc_bytealign (w[ 6], w[ 7], offset);
      w[53] = hc_bytealign (w[ 5], w[ 6], offset);
      w[52] = hc_bytealign (w[ 4], w[ 5], offset);
      w[51] = hc_bytealign (w[ 3], w[ 4], offset);
      w[50] = hc_bytealign (w[ 2], w[ 3], offset);
      w[49] = hc_bytealign (w[ 1], w[ 2], offset);
      w[48] = hc_bytealign (w[ 0], w[ 1], offset);
      w[47] = hc_bytealign (    0, w[ 0], offset);
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 48:
      w[63] = hc_bytealign (w[14], w[15], offset);
      w[62] = hc_bytealign (w[13], w[14], offset);
      w[61] = hc_bytealign (w[12], w[13], offset);
      w[60] = hc_bytealign (w[11], w[12], offset);
      w[59] = hc_bytealign (w[10], w[11], offset);
      w[58] = hc_bytealign (w[ 9], w[10], offset);
      w[57] = hc_bytealign (w[ 8], w[ 9], offset);
      w[56] = hc_bytealign (w[ 7], w[ 8], offset);
      w[55] = hc_bytealign (w[ 6], w[ 7], offset);
      w[54] = hc_bytealign (w[ 5], w[ 6], offset);
      w[53] = hc_bytealign (w[ 4], w[ 5], offset);
      w[52] = hc_bytealign (w[ 3], w[ 4], offset);
      w[51] = hc_bytealign (w[ 2], w[ 3], offset);
      w[50] = hc_bytealign (w[ 1], w[ 2], offset);
      w[49] = hc_bytealign (w[ 0], w[ 1], offset);
      w[48] = hc_bytealign (    0, w[ 0], offset);
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 49:
      w[63] = hc_bytealign (w[13], w[14], offset);
      w[62] = hc_bytealign (w[12], w[13], offset);
      w[61] = hc_bytealign (w[11], w[12], offset);
      w[60] = hc_bytealign (w[10], w[11], offset);
      w[59] = hc_bytealign (w[ 9], w[10], offset);
      w[58] = hc_bytealign (w[ 8], w[ 9], offset);
      w[57] = hc_bytealign (w[ 7], w[ 8], offset);
      w[56] = hc_bytealign (w[ 6], w[ 7], offset);
      w[55] = hc_bytealign (w[ 5], w[ 6], offset);
      w[54] = hc_bytealign (w[ 4], w[ 5], offset);
      w[53] = hc_bytealign (w[ 3], w[ 4], offset);
      w[52] = hc_bytealign (w[ 2], w[ 3], offset);
      w[51] = hc_bytealign (w[ 1], w[ 2], offset);
      w[50] = hc_bytealign (w[ 0], w[ 1], offset);
      w[49] = hc_bytealign (    0, w[ 0], offset);
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 50:
      w[63] = hc_bytealign (w[12], w[13], offset);
      w[62] = hc_bytealign (w[11], w[12], offset);
      w[61] = hc_bytealign (w[10], w[11], offset);
      w[60] = hc_bytealign (w[ 9], w[10], offset);
      w[59] = hc_bytealign (w[ 8], w[ 9], offset);
      w[58] = hc_bytealign (w[ 7], w[ 8], offset);
      w[57] = hc_bytealign (w[ 6], w[ 7], offset);
      w[56] = hc_bytealign (w[ 5], w[ 6], offset);
      w[55] = hc_bytealign (w[ 4], w[ 5], offset);
      w[54] = hc_bytealign (w[ 3], w[ 4], offset);
      w[53] = hc_bytealign (w[ 2], w[ 3], offset);
      w[52] = hc_bytealign (w[ 1], w[ 2], offset);
      w[51] = hc_bytealign (w[ 0], w[ 1], offset);
      w[50] = hc_bytealign (    0, w[ 0], offset);
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 51:
      w[63] = hc_bytealign (w[11], w[12], offset);
      w[62] = hc_bytealign (w[10], w[11], offset);
      w[61] = hc_bytealign (w[ 9], w[10], offset);
      w[60] = hc_bytealign (w[ 8], w[ 9], offset);
      w[59] = hc_bytealign (w[ 7], w[ 8], offset);
      w[58] = hc_bytealign (w[ 6], w[ 7], offset);
      w[57] = hc_bytealign (w[ 5], w[ 6], offset);
      w[56] = hc_bytealign (w[ 4], w[ 5], offset);
      w[55] = hc_bytealign (w[ 3], w[ 4], offset);
      w[54] = hc_bytealign (w[ 2], w[ 3], offset);
      w[53] = hc_bytealign (w[ 1], w[ 2], offset);
      w[52] = hc_bytealign (w[ 0], w[ 1], offset);
      w[51] = hc_bytealign (    0, w[ 0], offset);
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 52:
      w[63] = hc_bytealign (w[10], w[11], offset);
      w[62] = hc_bytealign (w[ 9], w[10], offset);
      w[61] = hc_bytealign (w[ 8], w[ 9], offset);
      w[60] = hc_bytealign (w[ 7], w[ 8], offset);
      w[59] = hc_bytealign (w[ 6], w[ 7], offset);
      w[58] = hc_bytealign (w[ 5], w[ 6], offset);
      w[57] = hc_bytealign (w[ 4], w[ 5], offset);
      w[56] = hc_bytealign (w[ 3], w[ 4], offset);
      w[55] = hc_bytealign (w[ 2], w[ 3], offset);
      w[54] = hc_bytealign (w[ 1], w[ 2], offset);
      w[53] = hc_bytealign (w[ 0], w[ 1], offset);
      w[52] = hc_bytealign (    0, w[ 0], offset);
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 53:
      w[63] = hc_bytealign (w[ 9], w[10], offset);
      w[62] = hc_bytealign (w[ 8], w[ 9], offset);
      w[61] = hc_bytealign (w[ 7], w[ 8], offset);
      w[60] = hc_bytealign (w[ 6], w[ 7], offset);
      w[59] = hc_bytealign (w[ 5], w[ 6], offset);
      w[58] = hc_bytealign (w[ 4], w[ 5], offset);
      w[57] = hc_bytealign (w[ 3], w[ 4], offset);
      w[56] = hc_bytealign (w[ 2], w[ 3], offset);
      w[55] = hc_bytealign (w[ 1], w[ 2], offset);
      w[54] = hc_bytealign (w[ 0], w[ 1], offset);
      w[53] = hc_bytealign (    0, w[ 0], offset);
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 54:
      w[63] = hc_bytealign (w[ 8], w[ 9], offset);
      w[62] = hc_bytealign (w[ 7], w[ 8], offset);
      w[61] = hc_bytealign (w[ 6], w[ 7], offset);
      w[60] = hc_bytealign (w[ 5], w[ 6], offset);
      w[59] = hc_bytealign (w[ 4], w[ 5], offset);
      w[58] = hc_bytealign (w[ 3], w[ 4], offset);
      w[57] = hc_bytealign (w[ 2], w[ 3], offset);
      w[56] = hc_bytealign (w[ 1], w[ 2], offset);
      w[55] = hc_bytealign (w[ 0], w[ 1], offset);
      w[54] = hc_bytealign (    0, w[ 0], offset);
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 55:
      w[63] = hc_bytealign (w[ 7], w[ 8], offset);
      w[62] = hc_bytealign (w[ 6], w[ 7], offset);
      w[61] = hc_bytealign (w[ 5], w[ 6], offset);
      w[60] = hc_bytealign (w[ 4], w[ 5], offset);
      w[59] = hc_bytealign (w[ 3], w[ 4], offset);
      w[58] = hc_bytealign (w[ 2], w[ 3], offset);
      w[57] = hc_bytealign (w[ 1], w[ 2], offset);
      w[56] = hc_bytealign (w[ 0], w[ 1], offset);
      w[55] = hc_bytealign (    0, w[ 0], offset);
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 56:
      w[63] = hc_bytealign (w[ 6], w[ 7], offset);
      w[62] = hc_bytealign (w[ 5], w[ 6], offset);
      w[61] = hc_bytealign (w[ 4], w[ 5], offset);
      w[60] = hc_bytealign (w[ 3], w[ 4], offset);
      w[59] = hc_bytealign (w[ 2], w[ 3], offset);
      w[58] = hc_bytealign (w[ 1], w[ 2], offset);
      w[57] = hc_bytealign (w[ 0], w[ 1], offset);
      w[56] = hc_bytealign (    0, w[ 0], offset);
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 57:
      w[63] = hc_bytealign (w[ 5], w[ 6], offset);
      w[62] = hc_bytealign (w[ 4], w[ 5], offset);
      w[61] = hc_bytealign (w[ 3], w[ 4], offset);
      w[60] = hc_bytealign (w[ 2], w[ 3], offset);
      w[59] = hc_bytealign (w[ 1], w[ 2], offset);
      w[58] = hc_bytealign (w[ 0], w[ 1], offset);
      w[57] = hc_bytealign (    0, w[ 0], offset);
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 58:
      w[63] = hc_bytealign (w[ 4], w[ 5], offset);
      w[62] = hc_bytealign (w[ 3], w[ 4], offset);
      w[61] = hc_bytealign (w[ 2], w[ 3], offset);
      w[60] = hc_bytealign (w[ 1], w[ 2], offset);
      w[59] = hc_bytealign (w[ 0], w[ 1], offset);
      w[58] = hc_bytealign (    0, w[ 0], offset);
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 59:
      w[63] = hc_bytealign (w[ 3], w[ 4], offset);
      w[62] = hc_bytealign (w[ 2], w[ 3], offset);
      w[61] = hc_bytealign (w[ 1], w[ 2], offset);
      w[60] = hc_bytealign (w[ 0], w[ 1], offset);
      w[59] = hc_bytealign (    0, w[ 0], offset);
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 60:
      w[63] = hc_bytealign (w[ 2], w[ 3], offset);
      w[62] = hc_bytealign (w[ 1], w[ 2], offset);
      w[61] = hc_bytealign (w[ 0], w[ 1], offset);
      w[60] = hc_bytealign (    0, w[ 0], offset);
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 61:
      w[63] = hc_bytealign (w[ 1], w[ 2], offset);
      w[62] = hc_bytealign (w[ 0], w[ 1], offset);
      w[61] = hc_bytealign (    0, w[ 0], offset);
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 62:
      w[63] = hc_bytealign (w[ 0], w[ 1], offset);
      w[62] = hc_bytealign (    0, w[ 0], offset);
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 63:
      w[63] = hc_bytealign (    0, w[ 0], offset);
      w[62] = 0;
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;
  }
  #endif

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV

  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  #if defined IS_NV
  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
  #endif

  #if (defined IS_AMD || defined IS_HIP)
  const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
  #endif

  switch (offset_switch)
  {
    case  0:
      w[63] = hc_byte_perm (w[62], w[63], selector);
      w[62] = hc_byte_perm (w[61], w[62], selector);
      w[61] = hc_byte_perm (w[60], w[61], selector);
      w[60] = hc_byte_perm (w[59], w[60], selector);
      w[59] = hc_byte_perm (w[58], w[59], selector);
      w[58] = hc_byte_perm (w[57], w[58], selector);
      w[57] = hc_byte_perm (w[56], w[57], selector);
      w[56] = hc_byte_perm (w[55], w[56], selector);
      w[55] = hc_byte_perm (w[54], w[55], selector);
      w[54] = hc_byte_perm (w[53], w[54], selector);
      w[53] = hc_byte_perm (w[52], w[53], selector);
      w[52] = hc_byte_perm (w[51], w[52], selector);
      w[51] = hc_byte_perm (w[50], w[51], selector);
      w[50] = hc_byte_perm (w[49], w[50], selector);
      w[49] = hc_byte_perm (w[48], w[49], selector);
      w[48] = hc_byte_perm (w[47], w[48], selector);
      w[47] = hc_byte_perm (w[46], w[47], selector);
      w[46] = hc_byte_perm (w[45], w[46], selector);
      w[45] = hc_byte_perm (w[44], w[45], selector);
      w[44] = hc_byte_perm (w[43], w[44], selector);
      w[43] = hc_byte_perm (w[42], w[43], selector);
      w[42] = hc_byte_perm (w[41], w[42], selector);
      w[41] = hc_byte_perm (w[40], w[41], selector);
      w[40] = hc_byte_perm (w[39], w[40], selector);
      w[39] = hc_byte_perm (w[38], w[39], selector);
      w[38] = hc_byte_perm (w[37], w[38], selector);
      w[37] = hc_byte_perm (w[36], w[37], selector);
      w[36] = hc_byte_perm (w[35], w[36], selector);
      w[35] = hc_byte_perm (w[34], w[35], selector);
      w[34] = hc_byte_perm (w[33], w[34], selector);
      w[33] = hc_byte_perm (w[32], w[33], selector);
      w[32] = hc_byte_perm (w[31], w[32], selector);
      w[31] = hc_byte_perm (w[30], w[31], selector);
      w[30] = hc_byte_perm (w[29], w[30], selector);
      w[29] = hc_byte_perm (w[28], w[29], selector);
      w[28] = hc_byte_perm (w[27], w[28], selector);
      w[27] = hc_byte_perm (w[26], w[27], selector);
      w[26] = hc_byte_perm (w[25], w[26], selector);
      w[25] = hc_byte_perm (w[24], w[25], selector);
      w[24] = hc_byte_perm (w[23], w[24], selector);
      w[23] = hc_byte_perm (w[22], w[23], selector);
      w[22] = hc_byte_perm (w[21], w[22], selector);
      w[21] = hc_byte_perm (w[20], w[21], selector);
      w[20] = hc_byte_perm (w[19], w[20], selector);
      w[19] = hc_byte_perm (w[18], w[19], selector);
      w[18] = hc_byte_perm (w[17], w[18], selector);
      w[17] = hc_byte_perm (w[16], w[17], selector);
      w[16] = hc_byte_perm (w[15], w[16], selector);
      w[15] = hc_byte_perm (w[14], w[15], selector);
      w[14] = hc_byte_perm (w[13], w[14], selector);
      w[13] = hc_byte_perm (w[12], w[13], selector);
      w[12] = hc_byte_perm (w[11], w[12], selector);
      w[11] = hc_byte_perm (w[10], w[11], selector);
      w[10] = hc_byte_perm (w[ 9], w[10], selector);
      w[ 9] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[ 8] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[ 7] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[ 6] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[ 5] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 4] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 3] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 2] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 1] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 0] = hc_byte_perm (    0, w[ 0], selector);

      break;

    case  1:
      w[63] = hc_byte_perm (w[61], w[62], selector);
      w[62] = hc_byte_perm (w[60], w[61], selector);
      w[61] = hc_byte_perm (w[59], w[60], selector);
      w[60] = hc_byte_perm (w[58], w[59], selector);
      w[59] = hc_byte_perm (w[57], w[58], selector);
      w[58] = hc_byte_perm (w[56], w[57], selector);
      w[57] = hc_byte_perm (w[55], w[56], selector);
      w[56] = hc_byte_perm (w[54], w[55], selector);
      w[55] = hc_byte_perm (w[53], w[54], selector);
      w[54] = hc_byte_perm (w[52], w[53], selector);
      w[53] = hc_byte_perm (w[51], w[52], selector);
      w[52] = hc_byte_perm (w[50], w[51], selector);
      w[51] = hc_byte_perm (w[49], w[50], selector);
      w[50] = hc_byte_perm (w[48], w[49], selector);
      w[49] = hc_byte_perm (w[47], w[48], selector);
      w[48] = hc_byte_perm (w[46], w[47], selector);
      w[47] = hc_byte_perm (w[45], w[46], selector);
      w[46] = hc_byte_perm (w[44], w[45], selector);
      w[45] = hc_byte_perm (w[43], w[44], selector);
      w[44] = hc_byte_perm (w[42], w[43], selector);
      w[43] = hc_byte_perm (w[41], w[42], selector);
      w[42] = hc_byte_perm (w[40], w[41], selector);
      w[41] = hc_byte_perm (w[39], w[40], selector);
      w[40] = hc_byte_perm (w[38], w[39], selector);
      w[39] = hc_byte_perm (w[37], w[38], selector);
      w[38] = hc_byte_perm (w[36], w[37], selector);
      w[37] = hc_byte_perm (w[35], w[36], selector);
      w[36] = hc_byte_perm (w[34], w[35], selector);
      w[35] = hc_byte_perm (w[33], w[34], selector);
      w[34] = hc_byte_perm (w[32], w[33], selector);
      w[33] = hc_byte_perm (w[31], w[32], selector);
      w[32] = hc_byte_perm (w[30], w[31], selector);
      w[31] = hc_byte_perm (w[29], w[30], selector);
      w[30] = hc_byte_perm (w[28], w[29], selector);
      w[29] = hc_byte_perm (w[27], w[28], selector);
      w[28] = hc_byte_perm (w[26], w[27], selector);
      w[27] = hc_byte_perm (w[25], w[26], selector);
      w[26] = hc_byte_perm (w[24], w[25], selector);
      w[25] = hc_byte_perm (w[23], w[24], selector);
      w[24] = hc_byte_perm (w[22], w[23], selector);
      w[23] = hc_byte_perm (w[21], w[22], selector);
      w[22] = hc_byte_perm (w[20], w[21], selector);
      w[21] = hc_byte_perm (w[19], w[20], selector);
      w[20] = hc_byte_perm (w[18], w[19], selector);
      w[19] = hc_byte_perm (w[17], w[18], selector);
      w[18] = hc_byte_perm (w[16], w[17], selector);
      w[17] = hc_byte_perm (w[15], w[16], selector);
      w[16] = hc_byte_perm (w[14], w[15], selector);
      w[15] = hc_byte_perm (w[13], w[14], selector);
      w[14] = hc_byte_perm (w[12], w[13], selector);
      w[13] = hc_byte_perm (w[11], w[12], selector);
      w[12] = hc_byte_perm (w[10], w[11], selector);
      w[11] = hc_byte_perm (w[ 9], w[10], selector);
      w[10] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[ 9] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[ 8] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[ 7] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[ 6] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 5] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 4] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 3] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 2] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 1] = hc_byte_perm (    0, w[ 0], selector);
      w[ 0] = 0;

      break;

    case  2:
      w[63] = hc_byte_perm (w[60], w[61], selector);
      w[62] = hc_byte_perm (w[59], w[60], selector);
      w[61] = hc_byte_perm (w[58], w[59], selector);
      w[60] = hc_byte_perm (w[57], w[58], selector);
      w[59] = hc_byte_perm (w[56], w[57], selector);
      w[58] = hc_byte_perm (w[55], w[56], selector);
      w[57] = hc_byte_perm (w[54], w[55], selector);
      w[56] = hc_byte_perm (w[53], w[54], selector);
      w[55] = hc_byte_perm (w[52], w[53], selector);
      w[54] = hc_byte_perm (w[51], w[52], selector);
      w[53] = hc_byte_perm (w[50], w[51], selector);
      w[52] = hc_byte_perm (w[49], w[50], selector);
      w[51] = hc_byte_perm (w[48], w[49], selector);
      w[50] = hc_byte_perm (w[47], w[48], selector);
      w[49] = hc_byte_perm (w[46], w[47], selector);
      w[48] = hc_byte_perm (w[45], w[46], selector);
      w[47] = hc_byte_perm (w[44], w[45], selector);
      w[46] = hc_byte_perm (w[43], w[44], selector);
      w[45] = hc_byte_perm (w[42], w[43], selector);
      w[44] = hc_byte_perm (w[41], w[42], selector);
      w[43] = hc_byte_perm (w[40], w[41], selector);
      w[42] = hc_byte_perm (w[39], w[40], selector);
      w[41] = hc_byte_perm (w[38], w[39], selector);
      w[40] = hc_byte_perm (w[37], w[38], selector);
      w[39] = hc_byte_perm (w[36], w[37], selector);
      w[38] = hc_byte_perm (w[35], w[36], selector);
      w[37] = hc_byte_perm (w[34], w[35], selector);
      w[36] = hc_byte_perm (w[33], w[34], selector);
      w[35] = hc_byte_perm (w[32], w[33], selector);
      w[34] = hc_byte_perm (w[31], w[32], selector);
      w[33] = hc_byte_perm (w[30], w[31], selector);
      w[32] = hc_byte_perm (w[29], w[30], selector);
      w[31] = hc_byte_perm (w[28], w[29], selector);
      w[30] = hc_byte_perm (w[27], w[28], selector);
      w[29] = hc_byte_perm (w[26], w[27], selector);
      w[28] = hc_byte_perm (w[25], w[26], selector);
      w[27] = hc_byte_perm (w[24], w[25], selector);
      w[26] = hc_byte_perm (w[23], w[24], selector);
      w[25] = hc_byte_perm (w[22], w[23], selector);
      w[24] = hc_byte_perm (w[21], w[22], selector);
      w[23] = hc_byte_perm (w[20], w[21], selector);
      w[22] = hc_byte_perm (w[19], w[20], selector);
      w[21] = hc_byte_perm (w[18], w[19], selector);
      w[20] = hc_byte_perm (w[17], w[18], selector);
      w[19] = hc_byte_perm (w[16], w[17], selector);
      w[18] = hc_byte_perm (w[15], w[16], selector);
      w[17] = hc_byte_perm (w[14], w[15], selector);
      w[16] = hc_byte_perm (w[13], w[14], selector);
      w[15] = hc_byte_perm (w[12], w[13], selector);
      w[14] = hc_byte_perm (w[11], w[12], selector);
      w[13] = hc_byte_perm (w[10], w[11], selector);
      w[12] = hc_byte_perm (w[ 9], w[10], selector);
      w[11] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[10] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[ 9] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[ 8] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[ 7] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 6] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 5] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 4] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 3] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 2] = hc_byte_perm (    0, w[ 0], selector);
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  3:
      w[63] = hc_byte_perm (w[59], w[60], selector);
      w[62] = hc_byte_perm (w[58], w[59], selector);
      w[61] = hc_byte_perm (w[57], w[58], selector);
      w[60] = hc_byte_perm (w[56], w[57], selector);
      w[59] = hc_byte_perm (w[55], w[56], selector);
      w[58] = hc_byte_perm (w[54], w[55], selector);
      w[57] = hc_byte_perm (w[53], w[54], selector);
      w[56] = hc_byte_perm (w[52], w[53], selector);
      w[55] = hc_byte_perm (w[51], w[52], selector);
      w[54] = hc_byte_perm (w[50], w[51], selector);
      w[53] = hc_byte_perm (w[49], w[50], selector);
      w[52] = hc_byte_perm (w[48], w[49], selector);
      w[51] = hc_byte_perm (w[47], w[48], selector);
      w[50] = hc_byte_perm (w[46], w[47], selector);
      w[49] = hc_byte_perm (w[45], w[46], selector);
      w[48] = hc_byte_perm (w[44], w[45], selector);
      w[47] = hc_byte_perm (w[43], w[44], selector);
      w[46] = hc_byte_perm (w[42], w[43], selector);
      w[45] = hc_byte_perm (w[41], w[42], selector);
      w[44] = hc_byte_perm (w[40], w[41], selector);
      w[43] = hc_byte_perm (w[39], w[40], selector);
      w[42] = hc_byte_perm (w[38], w[39], selector);
      w[41] = hc_byte_perm (w[37], w[38], selector);
      w[40] = hc_byte_perm (w[36], w[37], selector);
      w[39] = hc_byte_perm (w[35], w[36], selector);
      w[38] = hc_byte_perm (w[34], w[35], selector);
      w[37] = hc_byte_perm (w[33], w[34], selector);
      w[36] = hc_byte_perm (w[32], w[33], selector);
      w[35] = hc_byte_perm (w[31], w[32], selector);
      w[34] = hc_byte_perm (w[30], w[31], selector);
      w[33] = hc_byte_perm (w[29], w[30], selector);
      w[32] = hc_byte_perm (w[28], w[29], selector);
      w[31] = hc_byte_perm (w[27], w[28], selector);
      w[30] = hc_byte_perm (w[26], w[27], selector);
      w[29] = hc_byte_perm (w[25], w[26], selector);
      w[28] = hc_byte_perm (w[24], w[25], selector);
      w[27] = hc_byte_perm (w[23], w[24], selector);
      w[26] = hc_byte_perm (w[22], w[23], selector);
      w[25] = hc_byte_perm (w[21], w[22], selector);
      w[24] = hc_byte_perm (w[20], w[21], selector);
      w[23] = hc_byte_perm (w[19], w[20], selector);
      w[22] = hc_byte_perm (w[18], w[19], selector);
      w[21] = hc_byte_perm (w[17], w[18], selector);
      w[20] = hc_byte_perm (w[16], w[17], selector);
      w[19] = hc_byte_perm (w[15], w[16], selector);
      w[18] = hc_byte_perm (w[14], w[15], selector);
      w[17] = hc_byte_perm (w[13], w[14], selector);
      w[16] = hc_byte_perm (w[12], w[13], selector);
      w[15] = hc_byte_perm (w[11], w[12], selector);
      w[14] = hc_byte_perm (w[10], w[11], selector);
      w[13] = hc_byte_perm (w[ 9], w[10], selector);
      w[12] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[11] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[10] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[ 9] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[ 8] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 7] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 6] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 5] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 4] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 3] = hc_byte_perm (    0, w[ 0], selector);
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  4:
      w[63] = hc_byte_perm (w[58], w[59], selector);
      w[62] = hc_byte_perm (w[57], w[58], selector);
      w[61] = hc_byte_perm (w[56], w[57], selector);
      w[60] = hc_byte_perm (w[55], w[56], selector);
      w[59] = hc_byte_perm (w[54], w[55], selector);
      w[58] = hc_byte_perm (w[53], w[54], selector);
      w[57] = hc_byte_perm (w[52], w[53], selector);
      w[56] = hc_byte_perm (w[51], w[52], selector);
      w[55] = hc_byte_perm (w[50], w[51], selector);
      w[54] = hc_byte_perm (w[49], w[50], selector);
      w[53] = hc_byte_perm (w[48], w[49], selector);
      w[52] = hc_byte_perm (w[47], w[48], selector);
      w[51] = hc_byte_perm (w[46], w[47], selector);
      w[50] = hc_byte_perm (w[45], w[46], selector);
      w[49] = hc_byte_perm (w[44], w[45], selector);
      w[48] = hc_byte_perm (w[43], w[44], selector);
      w[47] = hc_byte_perm (w[42], w[43], selector);
      w[46] = hc_byte_perm (w[41], w[42], selector);
      w[45] = hc_byte_perm (w[40], w[41], selector);
      w[44] = hc_byte_perm (w[39], w[40], selector);
      w[43] = hc_byte_perm (w[38], w[39], selector);
      w[42] = hc_byte_perm (w[37], w[38], selector);
      w[41] = hc_byte_perm (w[36], w[37], selector);
      w[40] = hc_byte_perm (w[35], w[36], selector);
      w[39] = hc_byte_perm (w[34], w[35], selector);
      w[38] = hc_byte_perm (w[33], w[34], selector);
      w[37] = hc_byte_perm (w[32], w[33], selector);
      w[36] = hc_byte_perm (w[31], w[32], selector);
      w[35] = hc_byte_perm (w[30], w[31], selector);
      w[34] = hc_byte_perm (w[29], w[30], selector);
      w[33] = hc_byte_perm (w[28], w[29], selector);
      w[32] = hc_byte_perm (w[27], w[28], selector);
      w[31] = hc_byte_perm (w[26], w[27], selector);
      w[30] = hc_byte_perm (w[25], w[26], selector);
      w[29] = hc_byte_perm (w[24], w[25], selector);
      w[28] = hc_byte_perm (w[23], w[24], selector);
      w[27] = hc_byte_perm (w[22], w[23], selector);
      w[26] = hc_byte_perm (w[21], w[22], selector);
      w[25] = hc_byte_perm (w[20], w[21], selector);
      w[24] = hc_byte_perm (w[19], w[20], selector);
      w[23] = hc_byte_perm (w[18], w[19], selector);
      w[22] = hc_byte_perm (w[17], w[18], selector);
      w[21] = hc_byte_perm (w[16], w[17], selector);
      w[20] = hc_byte_perm (w[15], w[16], selector);
      w[19] = hc_byte_perm (w[14], w[15], selector);
      w[18] = hc_byte_perm (w[13], w[14], selector);
      w[17] = hc_byte_perm (w[12], w[13], selector);
      w[16] = hc_byte_perm (w[11], w[12], selector);
      w[15] = hc_byte_perm (w[10], w[11], selector);
      w[14] = hc_byte_perm (w[ 9], w[10], selector);
      w[13] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[12] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[11] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[10] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[ 9] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 8] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 7] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 6] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 5] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 4] = hc_byte_perm (    0, w[ 0], selector);
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  5:
      w[63] = hc_byte_perm (w[57], w[58], selector);
      w[62] = hc_byte_perm (w[56], w[57], selector);
      w[61] = hc_byte_perm (w[55], w[56], selector);
      w[60] = hc_byte_perm (w[54], w[55], selector);
      w[59] = hc_byte_perm (w[53], w[54], selector);
      w[58] = hc_byte_perm (w[52], w[53], selector);
      w[57] = hc_byte_perm (w[51], w[52], selector);
      w[56] = hc_byte_perm (w[50], w[51], selector);
      w[55] = hc_byte_perm (w[49], w[50], selector);
      w[54] = hc_byte_perm (w[48], w[49], selector);
      w[53] = hc_byte_perm (w[47], w[48], selector);
      w[52] = hc_byte_perm (w[46], w[47], selector);
      w[51] = hc_byte_perm (w[45], w[46], selector);
      w[50] = hc_byte_perm (w[44], w[45], selector);
      w[49] = hc_byte_perm (w[43], w[44], selector);
      w[48] = hc_byte_perm (w[42], w[43], selector);
      w[47] = hc_byte_perm (w[41], w[42], selector);
      w[46] = hc_byte_perm (w[40], w[41], selector);
      w[45] = hc_byte_perm (w[39], w[40], selector);
      w[44] = hc_byte_perm (w[38], w[39], selector);
      w[43] = hc_byte_perm (w[37], w[38], selector);
      w[42] = hc_byte_perm (w[36], w[37], selector);
      w[41] = hc_byte_perm (w[35], w[36], selector);
      w[40] = hc_byte_perm (w[34], w[35], selector);
      w[39] = hc_byte_perm (w[33], w[34], selector);
      w[38] = hc_byte_perm (w[32], w[33], selector);
      w[37] = hc_byte_perm (w[31], w[32], selector);
      w[36] = hc_byte_perm (w[30], w[31], selector);
      w[35] = hc_byte_perm (w[29], w[30], selector);
      w[34] = hc_byte_perm (w[28], w[29], selector);
      w[33] = hc_byte_perm (w[27], w[28], selector);
      w[32] = hc_byte_perm (w[26], w[27], selector);
      w[31] = hc_byte_perm (w[25], w[26], selector);
      w[30] = hc_byte_perm (w[24], w[25], selector);
      w[29] = hc_byte_perm (w[23], w[24], selector);
      w[28] = hc_byte_perm (w[22], w[23], selector);
      w[27] = hc_byte_perm (w[21], w[22], selector);
      w[26] = hc_byte_perm (w[20], w[21], selector);
      w[25] = hc_byte_perm (w[19], w[20], selector);
      w[24] = hc_byte_perm (w[18], w[19], selector);
      w[23] = hc_byte_perm (w[17], w[18], selector);
      w[22] = hc_byte_perm (w[16], w[17], selector);
      w[21] = hc_byte_perm (w[15], w[16], selector);
      w[20] = hc_byte_perm (w[14], w[15], selector);
      w[19] = hc_byte_perm (w[13], w[14], selector);
      w[18] = hc_byte_perm (w[12], w[13], selector);
      w[17] = hc_byte_perm (w[11], w[12], selector);
      w[16] = hc_byte_perm (w[10], w[11], selector);
      w[15] = hc_byte_perm (w[ 9], w[10], selector);
      w[14] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[13] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[12] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[11] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[10] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[ 9] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 8] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 7] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 6] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 5] = hc_byte_perm (    0, w[ 0], selector);
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  6:
      w[63] = hc_byte_perm (w[56], w[57], selector);
      w[62] = hc_byte_perm (w[55], w[56], selector);
      w[61] = hc_byte_perm (w[54], w[55], selector);
      w[60] = hc_byte_perm (w[53], w[54], selector);
      w[59] = hc_byte_perm (w[52], w[53], selector);
      w[58] = hc_byte_perm (w[51], w[52], selector);
      w[57] = hc_byte_perm (w[50], w[51], selector);
      w[56] = hc_byte_perm (w[49], w[50], selector);
      w[55] = hc_byte_perm (w[48], w[49], selector);
      w[54] = hc_byte_perm (w[47], w[48], selector);
      w[53] = hc_byte_perm (w[46], w[47], selector);
      w[52] = hc_byte_perm (w[45], w[46], selector);
      w[51] = hc_byte_perm (w[44], w[45], selector);
      w[50] = hc_byte_perm (w[43], w[44], selector);
      w[49] = hc_byte_perm (w[42], w[43], selector);
      w[48] = hc_byte_perm (w[41], w[42], selector);
      w[47] = hc_byte_perm (w[40], w[41], selector);
      w[46] = hc_byte_perm (w[39], w[40], selector);
      w[45] = hc_byte_perm (w[38], w[39], selector);
      w[44] = hc_byte_perm (w[37], w[38], selector);
      w[43] = hc_byte_perm (w[36], w[37], selector);
      w[42] = hc_byte_perm (w[35], w[36], selector);
      w[41] = hc_byte_perm (w[34], w[35], selector);
      w[40] = hc_byte_perm (w[33], w[34], selector);
      w[39] = hc_byte_perm (w[32], w[33], selector);
      w[38] = hc_byte_perm (w[31], w[32], selector);
      w[37] = hc_byte_perm (w[30], w[31], selector);
      w[36] = hc_byte_perm (w[29], w[30], selector);
      w[35] = hc_byte_perm (w[28], w[29], selector);
      w[34] = hc_byte_perm (w[27], w[28], selector);
      w[33] = hc_byte_perm (w[26], w[27], selector);
      w[32] = hc_byte_perm (w[25], w[26], selector);
      w[31] = hc_byte_perm (w[24], w[25], selector);
      w[30] = hc_byte_perm (w[23], w[24], selector);
      w[29] = hc_byte_perm (w[22], w[23], selector);
      w[28] = hc_byte_perm (w[21], w[22], selector);
      w[27] = hc_byte_perm (w[20], w[21], selector);
      w[26] = hc_byte_perm (w[19], w[20], selector);
      w[25] = hc_byte_perm (w[18], w[19], selector);
      w[24] = hc_byte_perm (w[17], w[18], selector);
      w[23] = hc_byte_perm (w[16], w[17], selector);
      w[22] = hc_byte_perm (w[15], w[16], selector);
      w[21] = hc_byte_perm (w[14], w[15], selector);
      w[20] = hc_byte_perm (w[13], w[14], selector);
      w[19] = hc_byte_perm (w[12], w[13], selector);
      w[18] = hc_byte_perm (w[11], w[12], selector);
      w[17] = hc_byte_perm (w[10], w[11], selector);
      w[16] = hc_byte_perm (w[ 9], w[10], selector);
      w[15] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[14] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[13] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[12] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[11] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[10] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[ 9] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 8] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 7] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 6] = hc_byte_perm (    0, w[ 0], selector);
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  7:
      w[63] = hc_byte_perm (w[55], w[56], selector);
      w[62] = hc_byte_perm (w[54], w[55], selector);
      w[61] = hc_byte_perm (w[53], w[54], selector);
      w[60] = hc_byte_perm (w[52], w[53], selector);
      w[59] = hc_byte_perm (w[51], w[52], selector);
      w[58] = hc_byte_perm (w[50], w[51], selector);
      w[57] = hc_byte_perm (w[49], w[50], selector);
      w[56] = hc_byte_perm (w[48], w[49], selector);
      w[55] = hc_byte_perm (w[47], w[48], selector);
      w[54] = hc_byte_perm (w[46], w[47], selector);
      w[53] = hc_byte_perm (w[45], w[46], selector);
      w[52] = hc_byte_perm (w[44], w[45], selector);
      w[51] = hc_byte_perm (w[43], w[44], selector);
      w[50] = hc_byte_perm (w[42], w[43], selector);
      w[49] = hc_byte_perm (w[41], w[42], selector);
      w[48] = hc_byte_perm (w[40], w[41], selector);
      w[47] = hc_byte_perm (w[39], w[40], selector);
      w[46] = hc_byte_perm (w[38], w[39], selector);
      w[45] = hc_byte_perm (w[37], w[38], selector);
      w[44] = hc_byte_perm (w[36], w[37], selector);
      w[43] = hc_byte_perm (w[35], w[36], selector);
      w[42] = hc_byte_perm (w[34], w[35], selector);
      w[41] = hc_byte_perm (w[33], w[34], selector);
      w[40] = hc_byte_perm (w[32], w[33], selector);
      w[39] = hc_byte_perm (w[31], w[32], selector);
      w[38] = hc_byte_perm (w[30], w[31], selector);
      w[37] = hc_byte_perm (w[29], w[30], selector);
      w[36] = hc_byte_perm (w[28], w[29], selector);
      w[35] = hc_byte_perm (w[27], w[28], selector);
      w[34] = hc_byte_perm (w[26], w[27], selector);
      w[33] = hc_byte_perm (w[25], w[26], selector);
      w[32] = hc_byte_perm (w[24], w[25], selector);
      w[31] = hc_byte_perm (w[23], w[24], selector);
      w[30] = hc_byte_perm (w[22], w[23], selector);
      w[29] = hc_byte_perm (w[21], w[22], selector);
      w[28] = hc_byte_perm (w[20], w[21], selector);
      w[27] = hc_byte_perm (w[19], w[20], selector);
      w[26] = hc_byte_perm (w[18], w[19], selector);
      w[25] = hc_byte_perm (w[17], w[18], selector);
      w[24] = hc_byte_perm (w[16], w[17], selector);
      w[23] = hc_byte_perm (w[15], w[16], selector);
      w[22] = hc_byte_perm (w[14], w[15], selector);
      w[21] = hc_byte_perm (w[13], w[14], selector);
      w[20] = hc_byte_perm (w[12], w[13], selector);
      w[19] = hc_byte_perm (w[11], w[12], selector);
      w[18] = hc_byte_perm (w[10], w[11], selector);
      w[17] = hc_byte_perm (w[ 9], w[10], selector);
      w[16] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[15] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[14] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[13] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[12] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[11] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[10] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[ 9] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 8] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 7] = hc_byte_perm (    0, w[ 0], selector);
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  8:
      w[63] = hc_byte_perm (w[54], w[55], selector);
      w[62] = hc_byte_perm (w[53], w[54], selector);
      w[61] = hc_byte_perm (w[52], w[53], selector);
      w[60] = hc_byte_perm (w[51], w[52], selector);
      w[59] = hc_byte_perm (w[50], w[51], selector);
      w[58] = hc_byte_perm (w[49], w[50], selector);
      w[57] = hc_byte_perm (w[48], w[49], selector);
      w[56] = hc_byte_perm (w[47], w[48], selector);
      w[55] = hc_byte_perm (w[46], w[47], selector);
      w[54] = hc_byte_perm (w[45], w[46], selector);
      w[53] = hc_byte_perm (w[44], w[45], selector);
      w[52] = hc_byte_perm (w[43], w[44], selector);
      w[51] = hc_byte_perm (w[42], w[43], selector);
      w[50] = hc_byte_perm (w[41], w[42], selector);
      w[49] = hc_byte_perm (w[40], w[41], selector);
      w[48] = hc_byte_perm (w[39], w[40], selector);
      w[47] = hc_byte_perm (w[38], w[39], selector);
      w[46] = hc_byte_perm (w[37], w[38], selector);
      w[45] = hc_byte_perm (w[36], w[37], selector);
      w[44] = hc_byte_perm (w[35], w[36], selector);
      w[43] = hc_byte_perm (w[34], w[35], selector);
      w[42] = hc_byte_perm (w[33], w[34], selector);
      w[41] = hc_byte_perm (w[32], w[33], selector);
      w[40] = hc_byte_perm (w[31], w[32], selector);
      w[39] = hc_byte_perm (w[30], w[31], selector);
      w[38] = hc_byte_perm (w[29], w[30], selector);
      w[37] = hc_byte_perm (w[28], w[29], selector);
      w[36] = hc_byte_perm (w[27], w[28], selector);
      w[35] = hc_byte_perm (w[26], w[27], selector);
      w[34] = hc_byte_perm (w[25], w[26], selector);
      w[33] = hc_byte_perm (w[24], w[25], selector);
      w[32] = hc_byte_perm (w[23], w[24], selector);
      w[31] = hc_byte_perm (w[22], w[23], selector);
      w[30] = hc_byte_perm (w[21], w[22], selector);
      w[29] = hc_byte_perm (w[20], w[21], selector);
      w[28] = hc_byte_perm (w[19], w[20], selector);
      w[27] = hc_byte_perm (w[18], w[19], selector);
      w[26] = hc_byte_perm (w[17], w[18], selector);
      w[25] = hc_byte_perm (w[16], w[17], selector);
      w[24] = hc_byte_perm (w[15], w[16], selector);
      w[23] = hc_byte_perm (w[14], w[15], selector);
      w[22] = hc_byte_perm (w[13], w[14], selector);
      w[21] = hc_byte_perm (w[12], w[13], selector);
      w[20] = hc_byte_perm (w[11], w[12], selector);
      w[19] = hc_byte_perm (w[10], w[11], selector);
      w[18] = hc_byte_perm (w[ 9], w[10], selector);
      w[17] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[16] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[15] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[14] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[13] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[12] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[11] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[10] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[ 9] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 8] = hc_byte_perm (    0, w[ 0], selector);
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  9:
      w[63] = hc_byte_perm (w[53], w[54], selector);
      w[62] = hc_byte_perm (w[52], w[53], selector);
      w[61] = hc_byte_perm (w[51], w[52], selector);
      w[60] = hc_byte_perm (w[50], w[51], selector);
      w[59] = hc_byte_perm (w[49], w[50], selector);
      w[58] = hc_byte_perm (w[48], w[49], selector);
      w[57] = hc_byte_perm (w[47], w[48], selector);
      w[56] = hc_byte_perm (w[46], w[47], selector);
      w[55] = hc_byte_perm (w[45], w[46], selector);
      w[54] = hc_byte_perm (w[44], w[45], selector);
      w[53] = hc_byte_perm (w[43], w[44], selector);
      w[52] = hc_byte_perm (w[42], w[43], selector);
      w[51] = hc_byte_perm (w[41], w[42], selector);
      w[50] = hc_byte_perm (w[40], w[41], selector);
      w[49] = hc_byte_perm (w[39], w[40], selector);
      w[48] = hc_byte_perm (w[38], w[39], selector);
      w[47] = hc_byte_perm (w[37], w[38], selector);
      w[46] = hc_byte_perm (w[36], w[37], selector);
      w[45] = hc_byte_perm (w[35], w[36], selector);
      w[44] = hc_byte_perm (w[34], w[35], selector);
      w[43] = hc_byte_perm (w[33], w[34], selector);
      w[42] = hc_byte_perm (w[32], w[33], selector);
      w[41] = hc_byte_perm (w[31], w[32], selector);
      w[40] = hc_byte_perm (w[30], w[31], selector);
      w[39] = hc_byte_perm (w[29], w[30], selector);
      w[38] = hc_byte_perm (w[28], w[29], selector);
      w[37] = hc_byte_perm (w[27], w[28], selector);
      w[36] = hc_byte_perm (w[26], w[27], selector);
      w[35] = hc_byte_perm (w[25], w[26], selector);
      w[34] = hc_byte_perm (w[24], w[25], selector);
      w[33] = hc_byte_perm (w[23], w[24], selector);
      w[32] = hc_byte_perm (w[22], w[23], selector);
      w[31] = hc_byte_perm (w[21], w[22], selector);
      w[30] = hc_byte_perm (w[20], w[21], selector);
      w[29] = hc_byte_perm (w[19], w[20], selector);
      w[28] = hc_byte_perm (w[18], w[19], selector);
      w[27] = hc_byte_perm (w[17], w[18], selector);
      w[26] = hc_byte_perm (w[16], w[17], selector);
      w[25] = hc_byte_perm (w[15], w[16], selector);
      w[24] = hc_byte_perm (w[14], w[15], selector);
      w[23] = hc_byte_perm (w[13], w[14], selector);
      w[22] = hc_byte_perm (w[12], w[13], selector);
      w[21] = hc_byte_perm (w[11], w[12], selector);
      w[20] = hc_byte_perm (w[10], w[11], selector);
      w[19] = hc_byte_perm (w[ 9], w[10], selector);
      w[18] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[17] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[16] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[15] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[14] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[13] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[12] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[11] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[10] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[ 9] = hc_byte_perm (    0, w[ 0], selector);
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 10:
      w[63] = hc_byte_perm (w[52], w[53], selector);
      w[62] = hc_byte_perm (w[51], w[52], selector);
      w[61] = hc_byte_perm (w[50], w[51], selector);
      w[60] = hc_byte_perm (w[49], w[50], selector);
      w[59] = hc_byte_perm (w[48], w[49], selector);
      w[58] = hc_byte_perm (w[47], w[48], selector);
      w[57] = hc_byte_perm (w[46], w[47], selector);
      w[56] = hc_byte_perm (w[45], w[46], selector);
      w[55] = hc_byte_perm (w[44], w[45], selector);
      w[54] = hc_byte_perm (w[43], w[44], selector);
      w[53] = hc_byte_perm (w[42], w[43], selector);
      w[52] = hc_byte_perm (w[41], w[42], selector);
      w[51] = hc_byte_perm (w[40], w[41], selector);
      w[50] = hc_byte_perm (w[39], w[40], selector);
      w[49] = hc_byte_perm (w[38], w[39], selector);
      w[48] = hc_byte_perm (w[37], w[38], selector);
      w[47] = hc_byte_perm (w[36], w[37], selector);
      w[46] = hc_byte_perm (w[35], w[36], selector);
      w[45] = hc_byte_perm (w[34], w[35], selector);
      w[44] = hc_byte_perm (w[33], w[34], selector);
      w[43] = hc_byte_perm (w[32], w[33], selector);
      w[42] = hc_byte_perm (w[31], w[32], selector);
      w[41] = hc_byte_perm (w[30], w[31], selector);
      w[40] = hc_byte_perm (w[29], w[30], selector);
      w[39] = hc_byte_perm (w[28], w[29], selector);
      w[38] = hc_byte_perm (w[27], w[28], selector);
      w[37] = hc_byte_perm (w[26], w[27], selector);
      w[36] = hc_byte_perm (w[25], w[26], selector);
      w[35] = hc_byte_perm (w[24], w[25], selector);
      w[34] = hc_byte_perm (w[23], w[24], selector);
      w[33] = hc_byte_perm (w[22], w[23], selector);
      w[32] = hc_byte_perm (w[21], w[22], selector);
      w[31] = hc_byte_perm (w[20], w[21], selector);
      w[30] = hc_byte_perm (w[19], w[20], selector);
      w[29] = hc_byte_perm (w[18], w[19], selector);
      w[28] = hc_byte_perm (w[17], w[18], selector);
      w[27] = hc_byte_perm (w[16], w[17], selector);
      w[26] = hc_byte_perm (w[15], w[16], selector);
      w[25] = hc_byte_perm (w[14], w[15], selector);
      w[24] = hc_byte_perm (w[13], w[14], selector);
      w[23] = hc_byte_perm (w[12], w[13], selector);
      w[22] = hc_byte_perm (w[11], w[12], selector);
      w[21] = hc_byte_perm (w[10], w[11], selector);
      w[20] = hc_byte_perm (w[ 9], w[10], selector);
      w[19] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[18] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[17] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[16] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[15] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[14] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[13] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[12] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[11] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[10] = hc_byte_perm (    0, w[ 0], selector);
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 11:
      w[63] = hc_byte_perm (w[51], w[52], selector);
      w[62] = hc_byte_perm (w[50], w[51], selector);
      w[61] = hc_byte_perm (w[49], w[50], selector);
      w[60] = hc_byte_perm (w[48], w[49], selector);
      w[59] = hc_byte_perm (w[47], w[48], selector);
      w[58] = hc_byte_perm (w[46], w[47], selector);
      w[57] = hc_byte_perm (w[45], w[46], selector);
      w[56] = hc_byte_perm (w[44], w[45], selector);
      w[55] = hc_byte_perm (w[43], w[44], selector);
      w[54] = hc_byte_perm (w[42], w[43], selector);
      w[53] = hc_byte_perm (w[41], w[42], selector);
      w[52] = hc_byte_perm (w[40], w[41], selector);
      w[51] = hc_byte_perm (w[39], w[40], selector);
      w[50] = hc_byte_perm (w[38], w[39], selector);
      w[49] = hc_byte_perm (w[37], w[38], selector);
      w[48] = hc_byte_perm (w[36], w[37], selector);
      w[47] = hc_byte_perm (w[35], w[36], selector);
      w[46] = hc_byte_perm (w[34], w[35], selector);
      w[45] = hc_byte_perm (w[33], w[34], selector);
      w[44] = hc_byte_perm (w[32], w[33], selector);
      w[43] = hc_byte_perm (w[31], w[32], selector);
      w[42] = hc_byte_perm (w[30], w[31], selector);
      w[41] = hc_byte_perm (w[29], w[30], selector);
      w[40] = hc_byte_perm (w[28], w[29], selector);
      w[39] = hc_byte_perm (w[27], w[28], selector);
      w[38] = hc_byte_perm (w[26], w[27], selector);
      w[37] = hc_byte_perm (w[25], w[26], selector);
      w[36] = hc_byte_perm (w[24], w[25], selector);
      w[35] = hc_byte_perm (w[23], w[24], selector);
      w[34] = hc_byte_perm (w[22], w[23], selector);
      w[33] = hc_byte_perm (w[21], w[22], selector);
      w[32] = hc_byte_perm (w[20], w[21], selector);
      w[31] = hc_byte_perm (w[19], w[20], selector);
      w[30] = hc_byte_perm (w[18], w[19], selector);
      w[29] = hc_byte_perm (w[17], w[18], selector);
      w[28] = hc_byte_perm (w[16], w[17], selector);
      w[27] = hc_byte_perm (w[15], w[16], selector);
      w[26] = hc_byte_perm (w[14], w[15], selector);
      w[25] = hc_byte_perm (w[13], w[14], selector);
      w[24] = hc_byte_perm (w[12], w[13], selector);
      w[23] = hc_byte_perm (w[11], w[12], selector);
      w[22] = hc_byte_perm (w[10], w[11], selector);
      w[21] = hc_byte_perm (w[ 9], w[10], selector);
      w[20] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[19] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[18] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[17] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[16] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[15] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[14] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[13] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[12] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[11] = hc_byte_perm (    0, w[ 0], selector);
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 12:
      w[63] = hc_byte_perm (w[50], w[51], selector);
      w[62] = hc_byte_perm (w[49], w[50], selector);
      w[61] = hc_byte_perm (w[48], w[49], selector);
      w[60] = hc_byte_perm (w[47], w[48], selector);
      w[59] = hc_byte_perm (w[46], w[47], selector);
      w[58] = hc_byte_perm (w[45], w[46], selector);
      w[57] = hc_byte_perm (w[44], w[45], selector);
      w[56] = hc_byte_perm (w[43], w[44], selector);
      w[55] = hc_byte_perm (w[42], w[43], selector);
      w[54] = hc_byte_perm (w[41], w[42], selector);
      w[53] = hc_byte_perm (w[40], w[41], selector);
      w[52] = hc_byte_perm (w[39], w[40], selector);
      w[51] = hc_byte_perm (w[38], w[39], selector);
      w[50] = hc_byte_perm (w[37], w[38], selector);
      w[49] = hc_byte_perm (w[36], w[37], selector);
      w[48] = hc_byte_perm (w[35], w[36], selector);
      w[47] = hc_byte_perm (w[34], w[35], selector);
      w[46] = hc_byte_perm (w[33], w[34], selector);
      w[45] = hc_byte_perm (w[32], w[33], selector);
      w[44] = hc_byte_perm (w[31], w[32], selector);
      w[43] = hc_byte_perm (w[30], w[31], selector);
      w[42] = hc_byte_perm (w[29], w[30], selector);
      w[41] = hc_byte_perm (w[28], w[29], selector);
      w[40] = hc_byte_perm (w[27], w[28], selector);
      w[39] = hc_byte_perm (w[26], w[27], selector);
      w[38] = hc_byte_perm (w[25], w[26], selector);
      w[37] = hc_byte_perm (w[24], w[25], selector);
      w[36] = hc_byte_perm (w[23], w[24], selector);
      w[35] = hc_byte_perm (w[22], w[23], selector);
      w[34] = hc_byte_perm (w[21], w[22], selector);
      w[33] = hc_byte_perm (w[20], w[21], selector);
      w[32] = hc_byte_perm (w[19], w[20], selector);
      w[31] = hc_byte_perm (w[18], w[19], selector);
      w[30] = hc_byte_perm (w[17], w[18], selector);
      w[29] = hc_byte_perm (w[16], w[17], selector);
      w[28] = hc_byte_perm (w[15], w[16], selector);
      w[27] = hc_byte_perm (w[14], w[15], selector);
      w[26] = hc_byte_perm (w[13], w[14], selector);
      w[25] = hc_byte_perm (w[12], w[13], selector);
      w[24] = hc_byte_perm (w[11], w[12], selector);
      w[23] = hc_byte_perm (w[10], w[11], selector);
      w[22] = hc_byte_perm (w[ 9], w[10], selector);
      w[21] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[20] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[19] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[18] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[17] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[16] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[15] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[14] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[13] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[12] = hc_byte_perm (    0, w[ 0], selector);
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 13:
      w[63] = hc_byte_perm (w[49], w[50], selector);
      w[62] = hc_byte_perm (w[48], w[49], selector);
      w[61] = hc_byte_perm (w[47], w[48], selector);
      w[60] = hc_byte_perm (w[46], w[47], selector);
      w[59] = hc_byte_perm (w[45], w[46], selector);
      w[58] = hc_byte_perm (w[44], w[45], selector);
      w[57] = hc_byte_perm (w[43], w[44], selector);
      w[56] = hc_byte_perm (w[42], w[43], selector);
      w[55] = hc_byte_perm (w[41], w[42], selector);
      w[54] = hc_byte_perm (w[40], w[41], selector);
      w[53] = hc_byte_perm (w[39], w[40], selector);
      w[52] = hc_byte_perm (w[38], w[39], selector);
      w[51] = hc_byte_perm (w[37], w[38], selector);
      w[50] = hc_byte_perm (w[36], w[37], selector);
      w[49] = hc_byte_perm (w[35], w[36], selector);
      w[48] = hc_byte_perm (w[34], w[35], selector);
      w[47] = hc_byte_perm (w[33], w[34], selector);
      w[46] = hc_byte_perm (w[32], w[33], selector);
      w[45] = hc_byte_perm (w[31], w[32], selector);
      w[44] = hc_byte_perm (w[30], w[31], selector);
      w[43] = hc_byte_perm (w[29], w[30], selector);
      w[42] = hc_byte_perm (w[28], w[29], selector);
      w[41] = hc_byte_perm (w[27], w[28], selector);
      w[40] = hc_byte_perm (w[26], w[27], selector);
      w[39] = hc_byte_perm (w[25], w[26], selector);
      w[38] = hc_byte_perm (w[24], w[25], selector);
      w[37] = hc_byte_perm (w[23], w[24], selector);
      w[36] = hc_byte_perm (w[22], w[23], selector);
      w[35] = hc_byte_perm (w[21], w[22], selector);
      w[34] = hc_byte_perm (w[20], w[21], selector);
      w[33] = hc_byte_perm (w[19], w[20], selector);
      w[32] = hc_byte_perm (w[18], w[19], selector);
      w[31] = hc_byte_perm (w[17], w[18], selector);
      w[30] = hc_byte_perm (w[16], w[17], selector);
      w[29] = hc_byte_perm (w[15], w[16], selector);
      w[28] = hc_byte_perm (w[14], w[15], selector);
      w[27] = hc_byte_perm (w[13], w[14], selector);
      w[26] = hc_byte_perm (w[12], w[13], selector);
      w[25] = hc_byte_perm (w[11], w[12], selector);
      w[24] = hc_byte_perm (w[10], w[11], selector);
      w[23] = hc_byte_perm (w[ 9], w[10], selector);
      w[22] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[21] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[20] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[19] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[18] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[17] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[16] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[15] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[14] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[13] = hc_byte_perm (    0, w[ 0], selector);
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 14:
      w[63] = hc_byte_perm (w[48], w[49], selector);
      w[62] = hc_byte_perm (w[47], w[48], selector);
      w[61] = hc_byte_perm (w[46], w[47], selector);
      w[60] = hc_byte_perm (w[45], w[46], selector);
      w[59] = hc_byte_perm (w[44], w[45], selector);
      w[58] = hc_byte_perm (w[43], w[44], selector);
      w[57] = hc_byte_perm (w[42], w[43], selector);
      w[56] = hc_byte_perm (w[41], w[42], selector);
      w[55] = hc_byte_perm (w[40], w[41], selector);
      w[54] = hc_byte_perm (w[39], w[40], selector);
      w[53] = hc_byte_perm (w[38], w[39], selector);
      w[52] = hc_byte_perm (w[37], w[38], selector);
      w[51] = hc_byte_perm (w[36], w[37], selector);
      w[50] = hc_byte_perm (w[35], w[36], selector);
      w[49] = hc_byte_perm (w[34], w[35], selector);
      w[48] = hc_byte_perm (w[33], w[34], selector);
      w[47] = hc_byte_perm (w[32], w[33], selector);
      w[46] = hc_byte_perm (w[31], w[32], selector);
      w[45] = hc_byte_perm (w[30], w[31], selector);
      w[44] = hc_byte_perm (w[29], w[30], selector);
      w[43] = hc_byte_perm (w[28], w[29], selector);
      w[42] = hc_byte_perm (w[27], w[28], selector);
      w[41] = hc_byte_perm (w[26], w[27], selector);
      w[40] = hc_byte_perm (w[25], w[26], selector);
      w[39] = hc_byte_perm (w[24], w[25], selector);
      w[38] = hc_byte_perm (w[23], w[24], selector);
      w[37] = hc_byte_perm (w[22], w[23], selector);
      w[36] = hc_byte_perm (w[21], w[22], selector);
      w[35] = hc_byte_perm (w[20], w[21], selector);
      w[34] = hc_byte_perm (w[19], w[20], selector);
      w[33] = hc_byte_perm (w[18], w[19], selector);
      w[32] = hc_byte_perm (w[17], w[18], selector);
      w[31] = hc_byte_perm (w[16], w[17], selector);
      w[30] = hc_byte_perm (w[15], w[16], selector);
      w[29] = hc_byte_perm (w[14], w[15], selector);
      w[28] = hc_byte_perm (w[13], w[14], selector);
      w[27] = hc_byte_perm (w[12], w[13], selector);
      w[26] = hc_byte_perm (w[11], w[12], selector);
      w[25] = hc_byte_perm (w[10], w[11], selector);
      w[24] = hc_byte_perm (w[ 9], w[10], selector);
      w[23] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[22] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[21] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[20] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[19] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[18] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[17] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[16] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[15] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[14] = hc_byte_perm (    0, w[ 0], selector);
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 15:
      w[63] = hc_byte_perm (w[47], w[48], selector);
      w[62] = hc_byte_perm (w[46], w[47], selector);
      w[61] = hc_byte_perm (w[45], w[46], selector);
      w[60] = hc_byte_perm (w[44], w[45], selector);
      w[59] = hc_byte_perm (w[43], w[44], selector);
      w[58] = hc_byte_perm (w[42], w[43], selector);
      w[57] = hc_byte_perm (w[41], w[42], selector);
      w[56] = hc_byte_perm (w[40], w[41], selector);
      w[55] = hc_byte_perm (w[39], w[40], selector);
      w[54] = hc_byte_perm (w[38], w[39], selector);
      w[53] = hc_byte_perm (w[37], w[38], selector);
      w[52] = hc_byte_perm (w[36], w[37], selector);
      w[51] = hc_byte_perm (w[35], w[36], selector);
      w[50] = hc_byte_perm (w[34], w[35], selector);
      w[49] = hc_byte_perm (w[33], w[34], selector);
      w[48] = hc_byte_perm (w[32], w[33], selector);
      w[47] = hc_byte_perm (w[31], w[32], selector);
      w[46] = hc_byte_perm (w[30], w[31], selector);
      w[45] = hc_byte_perm (w[29], w[30], selector);
      w[44] = hc_byte_perm (w[28], w[29], selector);
      w[43] = hc_byte_perm (w[27], w[28], selector);
      w[42] = hc_byte_perm (w[26], w[27], selector);
      w[41] = hc_byte_perm (w[25], w[26], selector);
      w[40] = hc_byte_perm (w[24], w[25], selector);
      w[39] = hc_byte_perm (w[23], w[24], selector);
      w[38] = hc_byte_perm (w[22], w[23], selector);
      w[37] = hc_byte_perm (w[21], w[22], selector);
      w[36] = hc_byte_perm (w[20], w[21], selector);
      w[35] = hc_byte_perm (w[19], w[20], selector);
      w[34] = hc_byte_perm (w[18], w[19], selector);
      w[33] = hc_byte_perm (w[17], w[18], selector);
      w[32] = hc_byte_perm (w[16], w[17], selector);
      w[31] = hc_byte_perm (w[15], w[16], selector);
      w[30] = hc_byte_perm (w[14], w[15], selector);
      w[29] = hc_byte_perm (w[13], w[14], selector);
      w[28] = hc_byte_perm (w[12], w[13], selector);
      w[27] = hc_byte_perm (w[11], w[12], selector);
      w[26] = hc_byte_perm (w[10], w[11], selector);
      w[25] = hc_byte_perm (w[ 9], w[10], selector);
      w[24] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[23] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[22] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[21] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[20] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[19] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[18] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[17] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[16] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[15] = hc_byte_perm (    0, w[ 0], selector);
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 16:
      w[63] = hc_byte_perm (w[46], w[47], selector);
      w[62] = hc_byte_perm (w[45], w[46], selector);
      w[61] = hc_byte_perm (w[44], w[45], selector);
      w[60] = hc_byte_perm (w[43], w[44], selector);
      w[59] = hc_byte_perm (w[42], w[43], selector);
      w[58] = hc_byte_perm (w[41], w[42], selector);
      w[57] = hc_byte_perm (w[40], w[41], selector);
      w[56] = hc_byte_perm (w[39], w[40], selector);
      w[55] = hc_byte_perm (w[38], w[39], selector);
      w[54] = hc_byte_perm (w[37], w[38], selector);
      w[53] = hc_byte_perm (w[36], w[37], selector);
      w[52] = hc_byte_perm (w[35], w[36], selector);
      w[51] = hc_byte_perm (w[34], w[35], selector);
      w[50] = hc_byte_perm (w[33], w[34], selector);
      w[49] = hc_byte_perm (w[32], w[33], selector);
      w[48] = hc_byte_perm (w[31], w[32], selector);
      w[47] = hc_byte_perm (w[30], w[31], selector);
      w[46] = hc_byte_perm (w[29], w[30], selector);
      w[45] = hc_byte_perm (w[28], w[29], selector);
      w[44] = hc_byte_perm (w[27], w[28], selector);
      w[43] = hc_byte_perm (w[26], w[27], selector);
      w[42] = hc_byte_perm (w[25], w[26], selector);
      w[41] = hc_byte_perm (w[24], w[25], selector);
      w[40] = hc_byte_perm (w[23], w[24], selector);
      w[39] = hc_byte_perm (w[22], w[23], selector);
      w[38] = hc_byte_perm (w[21], w[22], selector);
      w[37] = hc_byte_perm (w[20], w[21], selector);
      w[36] = hc_byte_perm (w[19], w[20], selector);
      w[35] = hc_byte_perm (w[18], w[19], selector);
      w[34] = hc_byte_perm (w[17], w[18], selector);
      w[33] = hc_byte_perm (w[16], w[17], selector);
      w[32] = hc_byte_perm (w[15], w[16], selector);
      w[31] = hc_byte_perm (w[14], w[15], selector);
      w[30] = hc_byte_perm (w[13], w[14], selector);
      w[29] = hc_byte_perm (w[12], w[13], selector);
      w[28] = hc_byte_perm (w[11], w[12], selector);
      w[27] = hc_byte_perm (w[10], w[11], selector);
      w[26] = hc_byte_perm (w[ 9], w[10], selector);
      w[25] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[24] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[23] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[22] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[21] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[20] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[19] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[18] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[17] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[16] = hc_byte_perm (    0, w[ 0], selector);
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 17:
      w[63] = hc_byte_perm (w[45], w[46], selector);
      w[62] = hc_byte_perm (w[44], w[45], selector);
      w[61] = hc_byte_perm (w[43], w[44], selector);
      w[60] = hc_byte_perm (w[42], w[43], selector);
      w[59] = hc_byte_perm (w[41], w[42], selector);
      w[58] = hc_byte_perm (w[40], w[41], selector);
      w[57] = hc_byte_perm (w[39], w[40], selector);
      w[56] = hc_byte_perm (w[38], w[39], selector);
      w[55] = hc_byte_perm (w[37], w[38], selector);
      w[54] = hc_byte_perm (w[36], w[37], selector);
      w[53] = hc_byte_perm (w[35], w[36], selector);
      w[52] = hc_byte_perm (w[34], w[35], selector);
      w[51] = hc_byte_perm (w[33], w[34], selector);
      w[50] = hc_byte_perm (w[32], w[33], selector);
      w[49] = hc_byte_perm (w[31], w[32], selector);
      w[48] = hc_byte_perm (w[30], w[31], selector);
      w[47] = hc_byte_perm (w[29], w[30], selector);
      w[46] = hc_byte_perm (w[28], w[29], selector);
      w[45] = hc_byte_perm (w[27], w[28], selector);
      w[44] = hc_byte_perm (w[26], w[27], selector);
      w[43] = hc_byte_perm (w[25], w[26], selector);
      w[42] = hc_byte_perm (w[24], w[25], selector);
      w[41] = hc_byte_perm (w[23], w[24], selector);
      w[40] = hc_byte_perm (w[22], w[23], selector);
      w[39] = hc_byte_perm (w[21], w[22], selector);
      w[38] = hc_byte_perm (w[20], w[21], selector);
      w[37] = hc_byte_perm (w[19], w[20], selector);
      w[36] = hc_byte_perm (w[18], w[19], selector);
      w[35] = hc_byte_perm (w[17], w[18], selector);
      w[34] = hc_byte_perm (w[16], w[17], selector);
      w[33] = hc_byte_perm (w[15], w[16], selector);
      w[32] = hc_byte_perm (w[14], w[15], selector);
      w[31] = hc_byte_perm (w[13], w[14], selector);
      w[30] = hc_byte_perm (w[12], w[13], selector);
      w[29] = hc_byte_perm (w[11], w[12], selector);
      w[28] = hc_byte_perm (w[10], w[11], selector);
      w[27] = hc_byte_perm (w[ 9], w[10], selector);
      w[26] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[25] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[24] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[23] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[22] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[21] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[20] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[19] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[18] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[17] = hc_byte_perm (    0, w[ 0], selector);
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 18:
      w[63] = hc_byte_perm (w[44], w[45], selector);
      w[62] = hc_byte_perm (w[43], w[44], selector);
      w[61] = hc_byte_perm (w[42], w[43], selector);
      w[60] = hc_byte_perm (w[41], w[42], selector);
      w[59] = hc_byte_perm (w[40], w[41], selector);
      w[58] = hc_byte_perm (w[39], w[40], selector);
      w[57] = hc_byte_perm (w[38], w[39], selector);
      w[56] = hc_byte_perm (w[37], w[38], selector);
      w[55] = hc_byte_perm (w[36], w[37], selector);
      w[54] = hc_byte_perm (w[35], w[36], selector);
      w[53] = hc_byte_perm (w[34], w[35], selector);
      w[52] = hc_byte_perm (w[33], w[34], selector);
      w[51] = hc_byte_perm (w[32], w[33], selector);
      w[50] = hc_byte_perm (w[31], w[32], selector);
      w[49] = hc_byte_perm (w[30], w[31], selector);
      w[48] = hc_byte_perm (w[29], w[30], selector);
      w[47] = hc_byte_perm (w[28], w[29], selector);
      w[46] = hc_byte_perm (w[27], w[28], selector);
      w[45] = hc_byte_perm (w[26], w[27], selector);
      w[44] = hc_byte_perm (w[25], w[26], selector);
      w[43] = hc_byte_perm (w[24], w[25], selector);
      w[42] = hc_byte_perm (w[23], w[24], selector);
      w[41] = hc_byte_perm (w[22], w[23], selector);
      w[40] = hc_byte_perm (w[21], w[22], selector);
      w[39] = hc_byte_perm (w[20], w[21], selector);
      w[38] = hc_byte_perm (w[19], w[20], selector);
      w[37] = hc_byte_perm (w[18], w[19], selector);
      w[36] = hc_byte_perm (w[17], w[18], selector);
      w[35] = hc_byte_perm (w[16], w[17], selector);
      w[34] = hc_byte_perm (w[15], w[16], selector);
      w[33] = hc_byte_perm (w[14], w[15], selector);
      w[32] = hc_byte_perm (w[13], w[14], selector);
      w[31] = hc_byte_perm (w[12], w[13], selector);
      w[30] = hc_byte_perm (w[11], w[12], selector);
      w[29] = hc_byte_perm (w[10], w[11], selector);
      w[28] = hc_byte_perm (w[ 9], w[10], selector);
      w[27] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[26] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[25] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[24] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[23] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[22] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[21] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[20] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[19] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[18] = hc_byte_perm (    0, w[ 0], selector);
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 19:
      w[63] = hc_byte_perm (w[43], w[44], selector);
      w[62] = hc_byte_perm (w[42], w[43], selector);
      w[61] = hc_byte_perm (w[41], w[42], selector);
      w[60] = hc_byte_perm (w[40], w[41], selector);
      w[59] = hc_byte_perm (w[39], w[40], selector);
      w[58] = hc_byte_perm (w[38], w[39], selector);
      w[57] = hc_byte_perm (w[37], w[38], selector);
      w[56] = hc_byte_perm (w[36], w[37], selector);
      w[55] = hc_byte_perm (w[35], w[36], selector);
      w[54] = hc_byte_perm (w[34], w[35], selector);
      w[53] = hc_byte_perm (w[33], w[34], selector);
      w[52] = hc_byte_perm (w[32], w[33], selector);
      w[51] = hc_byte_perm (w[31], w[32], selector);
      w[50] = hc_byte_perm (w[30], w[31], selector);
      w[49] = hc_byte_perm (w[29], w[30], selector);
      w[48] = hc_byte_perm (w[28], w[29], selector);
      w[47] = hc_byte_perm (w[27], w[28], selector);
      w[46] = hc_byte_perm (w[26], w[27], selector);
      w[45] = hc_byte_perm (w[25], w[26], selector);
      w[44] = hc_byte_perm (w[24], w[25], selector);
      w[43] = hc_byte_perm (w[23], w[24], selector);
      w[42] = hc_byte_perm (w[22], w[23], selector);
      w[41] = hc_byte_perm (w[21], w[22], selector);
      w[40] = hc_byte_perm (w[20], w[21], selector);
      w[39] = hc_byte_perm (w[19], w[20], selector);
      w[38] = hc_byte_perm (w[18], w[19], selector);
      w[37] = hc_byte_perm (w[17], w[18], selector);
      w[36] = hc_byte_perm (w[16], w[17], selector);
      w[35] = hc_byte_perm (w[15], w[16], selector);
      w[34] = hc_byte_perm (w[14], w[15], selector);
      w[33] = hc_byte_perm (w[13], w[14], selector);
      w[32] = hc_byte_perm (w[12], w[13], selector);
      w[31] = hc_byte_perm (w[11], w[12], selector);
      w[30] = hc_byte_perm (w[10], w[11], selector);
      w[29] = hc_byte_perm (w[ 9], w[10], selector);
      w[28] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[27] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[26] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[25] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[24] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[23] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[22] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[21] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[20] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[19] = hc_byte_perm (    0, w[ 0], selector);
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 20:
      w[63] = hc_byte_perm (w[42], w[43], selector);
      w[62] = hc_byte_perm (w[41], w[42], selector);
      w[61] = hc_byte_perm (w[40], w[41], selector);
      w[60] = hc_byte_perm (w[39], w[40], selector);
      w[59] = hc_byte_perm (w[38], w[39], selector);
      w[58] = hc_byte_perm (w[37], w[38], selector);
      w[57] = hc_byte_perm (w[36], w[37], selector);
      w[56] = hc_byte_perm (w[35], w[36], selector);
      w[55] = hc_byte_perm (w[34], w[35], selector);
      w[54] = hc_byte_perm (w[33], w[34], selector);
      w[53] = hc_byte_perm (w[32], w[33], selector);
      w[52] = hc_byte_perm (w[31], w[32], selector);
      w[51] = hc_byte_perm (w[30], w[31], selector);
      w[50] = hc_byte_perm (w[29], w[30], selector);
      w[49] = hc_byte_perm (w[28], w[29], selector);
      w[48] = hc_byte_perm (w[27], w[28], selector);
      w[47] = hc_byte_perm (w[26], w[27], selector);
      w[46] = hc_byte_perm (w[25], w[26], selector);
      w[45] = hc_byte_perm (w[24], w[25], selector);
      w[44] = hc_byte_perm (w[23], w[24], selector);
      w[43] = hc_byte_perm (w[22], w[23], selector);
      w[42] = hc_byte_perm (w[21], w[22], selector);
      w[41] = hc_byte_perm (w[20], w[21], selector);
      w[40] = hc_byte_perm (w[19], w[20], selector);
      w[39] = hc_byte_perm (w[18], w[19], selector);
      w[38] = hc_byte_perm (w[17], w[18], selector);
      w[37] = hc_byte_perm (w[16], w[17], selector);
      w[36] = hc_byte_perm (w[15], w[16], selector);
      w[35] = hc_byte_perm (w[14], w[15], selector);
      w[34] = hc_byte_perm (w[13], w[14], selector);
      w[33] = hc_byte_perm (w[12], w[13], selector);
      w[32] = hc_byte_perm (w[11], w[12], selector);
      w[31] = hc_byte_perm (w[10], w[11], selector);
      w[30] = hc_byte_perm (w[ 9], w[10], selector);
      w[29] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[28] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[27] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[26] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[25] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[24] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[23] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[22] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[21] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[20] = hc_byte_perm (    0, w[ 0], selector);
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 21:
      w[63] = hc_byte_perm (w[41], w[42], selector);
      w[62] = hc_byte_perm (w[40], w[41], selector);
      w[61] = hc_byte_perm (w[39], w[40], selector);
      w[60] = hc_byte_perm (w[38], w[39], selector);
      w[59] = hc_byte_perm (w[37], w[38], selector);
      w[58] = hc_byte_perm (w[36], w[37], selector);
      w[57] = hc_byte_perm (w[35], w[36], selector);
      w[56] = hc_byte_perm (w[34], w[35], selector);
      w[55] = hc_byte_perm (w[33], w[34], selector);
      w[54] = hc_byte_perm (w[32], w[33], selector);
      w[53] = hc_byte_perm (w[31], w[32], selector);
      w[52] = hc_byte_perm (w[30], w[31], selector);
      w[51] = hc_byte_perm (w[29], w[30], selector);
      w[50] = hc_byte_perm (w[28], w[29], selector);
      w[49] = hc_byte_perm (w[27], w[28], selector);
      w[48] = hc_byte_perm (w[26], w[27], selector);
      w[47] = hc_byte_perm (w[25], w[26], selector);
      w[46] = hc_byte_perm (w[24], w[25], selector);
      w[45] = hc_byte_perm (w[23], w[24], selector);
      w[44] = hc_byte_perm (w[22], w[23], selector);
      w[43] = hc_byte_perm (w[21], w[22], selector);
      w[42] = hc_byte_perm (w[20], w[21], selector);
      w[41] = hc_byte_perm (w[19], w[20], selector);
      w[40] = hc_byte_perm (w[18], w[19], selector);
      w[39] = hc_byte_perm (w[17], w[18], selector);
      w[38] = hc_byte_perm (w[16], w[17], selector);
      w[37] = hc_byte_perm (w[15], w[16], selector);
      w[36] = hc_byte_perm (w[14], w[15], selector);
      w[35] = hc_byte_perm (w[13], w[14], selector);
      w[34] = hc_byte_perm (w[12], w[13], selector);
      w[33] = hc_byte_perm (w[11], w[12], selector);
      w[32] = hc_byte_perm (w[10], w[11], selector);
      w[31] = hc_byte_perm (w[ 9], w[10], selector);
      w[30] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[29] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[28] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[27] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[26] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[25] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[24] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[23] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[22] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[21] = hc_byte_perm (    0, w[ 0], selector);
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 22:
      w[63] = hc_byte_perm (w[40], w[41], selector);
      w[62] = hc_byte_perm (w[39], w[40], selector);
      w[61] = hc_byte_perm (w[38], w[39], selector);
      w[60] = hc_byte_perm (w[37], w[38], selector);
      w[59] = hc_byte_perm (w[36], w[37], selector);
      w[58] = hc_byte_perm (w[35], w[36], selector);
      w[57] = hc_byte_perm (w[34], w[35], selector);
      w[56] = hc_byte_perm (w[33], w[34], selector);
      w[55] = hc_byte_perm (w[32], w[33], selector);
      w[54] = hc_byte_perm (w[31], w[32], selector);
      w[53] = hc_byte_perm (w[30], w[31], selector);
      w[52] = hc_byte_perm (w[29], w[30], selector);
      w[51] = hc_byte_perm (w[28], w[29], selector);
      w[50] = hc_byte_perm (w[27], w[28], selector);
      w[49] = hc_byte_perm (w[26], w[27], selector);
      w[48] = hc_byte_perm (w[25], w[26], selector);
      w[47] = hc_byte_perm (w[24], w[25], selector);
      w[46] = hc_byte_perm (w[23], w[24], selector);
      w[45] = hc_byte_perm (w[22], w[23], selector);
      w[44] = hc_byte_perm (w[21], w[22], selector);
      w[43] = hc_byte_perm (w[20], w[21], selector);
      w[42] = hc_byte_perm (w[19], w[20], selector);
      w[41] = hc_byte_perm (w[18], w[19], selector);
      w[40] = hc_byte_perm (w[17], w[18], selector);
      w[39] = hc_byte_perm (w[16], w[17], selector);
      w[38] = hc_byte_perm (w[15], w[16], selector);
      w[37] = hc_byte_perm (w[14], w[15], selector);
      w[36] = hc_byte_perm (w[13], w[14], selector);
      w[35] = hc_byte_perm (w[12], w[13], selector);
      w[34] = hc_byte_perm (w[11], w[12], selector);
      w[33] = hc_byte_perm (w[10], w[11], selector);
      w[32] = hc_byte_perm (w[ 9], w[10], selector);
      w[31] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[30] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[29] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[28] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[27] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[26] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[25] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[24] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[23] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[22] = hc_byte_perm (    0, w[ 0], selector);
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 23:
      w[63] = hc_byte_perm (w[39], w[40], selector);
      w[62] = hc_byte_perm (w[38], w[39], selector);
      w[61] = hc_byte_perm (w[37], w[38], selector);
      w[60] = hc_byte_perm (w[36], w[37], selector);
      w[59] = hc_byte_perm (w[35], w[36], selector);
      w[58] = hc_byte_perm (w[34], w[35], selector);
      w[57] = hc_byte_perm (w[33], w[34], selector);
      w[56] = hc_byte_perm (w[32], w[33], selector);
      w[55] = hc_byte_perm (w[31], w[32], selector);
      w[54] = hc_byte_perm (w[30], w[31], selector);
      w[53] = hc_byte_perm (w[29], w[30], selector);
      w[52] = hc_byte_perm (w[28], w[29], selector);
      w[51] = hc_byte_perm (w[27], w[28], selector);
      w[50] = hc_byte_perm (w[26], w[27], selector);
      w[49] = hc_byte_perm (w[25], w[26], selector);
      w[48] = hc_byte_perm (w[24], w[25], selector);
      w[47] = hc_byte_perm (w[23], w[24], selector);
      w[46] = hc_byte_perm (w[22], w[23], selector);
      w[45] = hc_byte_perm (w[21], w[22], selector);
      w[44] = hc_byte_perm (w[20], w[21], selector);
      w[43] = hc_byte_perm (w[19], w[20], selector);
      w[42] = hc_byte_perm (w[18], w[19], selector);
      w[41] = hc_byte_perm (w[17], w[18], selector);
      w[40] = hc_byte_perm (w[16], w[17], selector);
      w[39] = hc_byte_perm (w[15], w[16], selector);
      w[38] = hc_byte_perm (w[14], w[15], selector);
      w[37] = hc_byte_perm (w[13], w[14], selector);
      w[36] = hc_byte_perm (w[12], w[13], selector);
      w[35] = hc_byte_perm (w[11], w[12], selector);
      w[34] = hc_byte_perm (w[10], w[11], selector);
      w[33] = hc_byte_perm (w[ 9], w[10], selector);
      w[32] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[31] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[30] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[29] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[28] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[27] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[26] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[25] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[24] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[23] = hc_byte_perm (    0, w[ 0], selector);
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 24:
      w[63] = hc_byte_perm (w[38], w[39], selector);
      w[62] = hc_byte_perm (w[37], w[38], selector);
      w[61] = hc_byte_perm (w[36], w[37], selector);
      w[60] = hc_byte_perm (w[35], w[36], selector);
      w[59] = hc_byte_perm (w[34], w[35], selector);
      w[58] = hc_byte_perm (w[33], w[34], selector);
      w[57] = hc_byte_perm (w[32], w[33], selector);
      w[56] = hc_byte_perm (w[31], w[32], selector);
      w[55] = hc_byte_perm (w[30], w[31], selector);
      w[54] = hc_byte_perm (w[29], w[30], selector);
      w[53] = hc_byte_perm (w[28], w[29], selector);
      w[52] = hc_byte_perm (w[27], w[28], selector);
      w[51] = hc_byte_perm (w[26], w[27], selector);
      w[50] = hc_byte_perm (w[25], w[26], selector);
      w[49] = hc_byte_perm (w[24], w[25], selector);
      w[48] = hc_byte_perm (w[23], w[24], selector);
      w[47] = hc_byte_perm (w[22], w[23], selector);
      w[46] = hc_byte_perm (w[21], w[22], selector);
      w[45] = hc_byte_perm (w[20], w[21], selector);
      w[44] = hc_byte_perm (w[19], w[20], selector);
      w[43] = hc_byte_perm (w[18], w[19], selector);
      w[42] = hc_byte_perm (w[17], w[18], selector);
      w[41] = hc_byte_perm (w[16], w[17], selector);
      w[40] = hc_byte_perm (w[15], w[16], selector);
      w[39] = hc_byte_perm (w[14], w[15], selector);
      w[38] = hc_byte_perm (w[13], w[14], selector);
      w[37] = hc_byte_perm (w[12], w[13], selector);
      w[36] = hc_byte_perm (w[11], w[12], selector);
      w[35] = hc_byte_perm (w[10], w[11], selector);
      w[34] = hc_byte_perm (w[ 9], w[10], selector);
      w[33] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[32] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[31] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[30] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[29] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[28] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[27] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[26] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[25] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[24] = hc_byte_perm (    0, w[ 0], selector);
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 25:
      w[63] = hc_byte_perm (w[37], w[38], selector);
      w[62] = hc_byte_perm (w[36], w[37], selector);
      w[61] = hc_byte_perm (w[35], w[36], selector);
      w[60] = hc_byte_perm (w[34], w[35], selector);
      w[59] = hc_byte_perm (w[33], w[34], selector);
      w[58] = hc_byte_perm (w[32], w[33], selector);
      w[57] = hc_byte_perm (w[31], w[32], selector);
      w[56] = hc_byte_perm (w[30], w[31], selector);
      w[55] = hc_byte_perm (w[29], w[30], selector);
      w[54] = hc_byte_perm (w[28], w[29], selector);
      w[53] = hc_byte_perm (w[27], w[28], selector);
      w[52] = hc_byte_perm (w[26], w[27], selector);
      w[51] = hc_byte_perm (w[25], w[26], selector);
      w[50] = hc_byte_perm (w[24], w[25], selector);
      w[49] = hc_byte_perm (w[23], w[24], selector);
      w[48] = hc_byte_perm (w[22], w[23], selector);
      w[47] = hc_byte_perm (w[21], w[22], selector);
      w[46] = hc_byte_perm (w[20], w[21], selector);
      w[45] = hc_byte_perm (w[19], w[20], selector);
      w[44] = hc_byte_perm (w[18], w[19], selector);
      w[43] = hc_byte_perm (w[17], w[18], selector);
      w[42] = hc_byte_perm (w[16], w[17], selector);
      w[41] = hc_byte_perm (w[15], w[16], selector);
      w[40] = hc_byte_perm (w[14], w[15], selector);
      w[39] = hc_byte_perm (w[13], w[14], selector);
      w[38] = hc_byte_perm (w[12], w[13], selector);
      w[37] = hc_byte_perm (w[11], w[12], selector);
      w[36] = hc_byte_perm (w[10], w[11], selector);
      w[35] = hc_byte_perm (w[ 9], w[10], selector);
      w[34] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[33] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[32] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[31] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[30] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[29] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[28] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[27] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[26] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[25] = hc_byte_perm (    0, w[ 0], selector);
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 26:
      w[63] = hc_byte_perm (w[36], w[37], selector);
      w[62] = hc_byte_perm (w[35], w[36], selector);
      w[61] = hc_byte_perm (w[34], w[35], selector);
      w[60] = hc_byte_perm (w[33], w[34], selector);
      w[59] = hc_byte_perm (w[32], w[33], selector);
      w[58] = hc_byte_perm (w[31], w[32], selector);
      w[57] = hc_byte_perm (w[30], w[31], selector);
      w[56] = hc_byte_perm (w[29], w[30], selector);
      w[55] = hc_byte_perm (w[28], w[29], selector);
      w[54] = hc_byte_perm (w[27], w[28], selector);
      w[53] = hc_byte_perm (w[26], w[27], selector);
      w[52] = hc_byte_perm (w[25], w[26], selector);
      w[51] = hc_byte_perm (w[24], w[25], selector);
      w[50] = hc_byte_perm (w[23], w[24], selector);
      w[49] = hc_byte_perm (w[22], w[23], selector);
      w[48] = hc_byte_perm (w[21], w[22], selector);
      w[47] = hc_byte_perm (w[20], w[21], selector);
      w[46] = hc_byte_perm (w[19], w[20], selector);
      w[45] = hc_byte_perm (w[18], w[19], selector);
      w[44] = hc_byte_perm (w[17], w[18], selector);
      w[43] = hc_byte_perm (w[16], w[17], selector);
      w[42] = hc_byte_perm (w[15], w[16], selector);
      w[41] = hc_byte_perm (w[14], w[15], selector);
      w[40] = hc_byte_perm (w[13], w[14], selector);
      w[39] = hc_byte_perm (w[12], w[13], selector);
      w[38] = hc_byte_perm (w[11], w[12], selector);
      w[37] = hc_byte_perm (w[10], w[11], selector);
      w[36] = hc_byte_perm (w[ 9], w[10], selector);
      w[35] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[34] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[33] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[32] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[31] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[30] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[29] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[28] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[27] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[26] = hc_byte_perm (    0, w[ 0], selector);
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 27:
      w[63] = hc_byte_perm (w[35], w[36], selector);
      w[62] = hc_byte_perm (w[34], w[35], selector);
      w[61] = hc_byte_perm (w[33], w[34], selector);
      w[60] = hc_byte_perm (w[32], w[33], selector);
      w[59] = hc_byte_perm (w[31], w[32], selector);
      w[58] = hc_byte_perm (w[30], w[31], selector);
      w[57] = hc_byte_perm (w[29], w[30], selector);
      w[56] = hc_byte_perm (w[28], w[29], selector);
      w[55] = hc_byte_perm (w[27], w[28], selector);
      w[54] = hc_byte_perm (w[26], w[27], selector);
      w[53] = hc_byte_perm (w[25], w[26], selector);
      w[52] = hc_byte_perm (w[24], w[25], selector);
      w[51] = hc_byte_perm (w[23], w[24], selector);
      w[50] = hc_byte_perm (w[22], w[23], selector);
      w[49] = hc_byte_perm (w[21], w[22], selector);
      w[48] = hc_byte_perm (w[20], w[21], selector);
      w[47] = hc_byte_perm (w[19], w[20], selector);
      w[46] = hc_byte_perm (w[18], w[19], selector);
      w[45] = hc_byte_perm (w[17], w[18], selector);
      w[44] = hc_byte_perm (w[16], w[17], selector);
      w[43] = hc_byte_perm (w[15], w[16], selector);
      w[42] = hc_byte_perm (w[14], w[15], selector);
      w[41] = hc_byte_perm (w[13], w[14], selector);
      w[40] = hc_byte_perm (w[12], w[13], selector);
      w[39] = hc_byte_perm (w[11], w[12], selector);
      w[38] = hc_byte_perm (w[10], w[11], selector);
      w[37] = hc_byte_perm (w[ 9], w[10], selector);
      w[36] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[35] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[34] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[33] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[32] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[31] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[30] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[29] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[28] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[27] = hc_byte_perm (    0, w[ 0], selector);
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 28:
      w[63] = hc_byte_perm (w[34], w[35], selector);
      w[62] = hc_byte_perm (w[33], w[34], selector);
      w[61] = hc_byte_perm (w[32], w[33], selector);
      w[60] = hc_byte_perm (w[31], w[32], selector);
      w[59] = hc_byte_perm (w[30], w[31], selector);
      w[58] = hc_byte_perm (w[29], w[30], selector);
      w[57] = hc_byte_perm (w[28], w[29], selector);
      w[56] = hc_byte_perm (w[27], w[28], selector);
      w[55] = hc_byte_perm (w[26], w[27], selector);
      w[54] = hc_byte_perm (w[25], w[26], selector);
      w[53] = hc_byte_perm (w[24], w[25], selector);
      w[52] = hc_byte_perm (w[23], w[24], selector);
      w[51] = hc_byte_perm (w[22], w[23], selector);
      w[50] = hc_byte_perm (w[21], w[22], selector);
      w[49] = hc_byte_perm (w[20], w[21], selector);
      w[48] = hc_byte_perm (w[19], w[20], selector);
      w[47] = hc_byte_perm (w[18], w[19], selector);
      w[46] = hc_byte_perm (w[17], w[18], selector);
      w[45] = hc_byte_perm (w[16], w[17], selector);
      w[44] = hc_byte_perm (w[15], w[16], selector);
      w[43] = hc_byte_perm (w[14], w[15], selector);
      w[42] = hc_byte_perm (w[13], w[14], selector);
      w[41] = hc_byte_perm (w[12], w[13], selector);
      w[40] = hc_byte_perm (w[11], w[12], selector);
      w[39] = hc_byte_perm (w[10], w[11], selector);
      w[38] = hc_byte_perm (w[ 9], w[10], selector);
      w[37] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[36] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[35] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[34] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[33] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[32] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[31] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[30] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[29] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[28] = hc_byte_perm (    0, w[ 0], selector);
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 29:
      w[63] = hc_byte_perm (w[33], w[34], selector);
      w[62] = hc_byte_perm (w[32], w[33], selector);
      w[61] = hc_byte_perm (w[31], w[32], selector);
      w[60] = hc_byte_perm (w[30], w[31], selector);
      w[59] = hc_byte_perm (w[29], w[30], selector);
      w[58] = hc_byte_perm (w[28], w[29], selector);
      w[57] = hc_byte_perm (w[27], w[28], selector);
      w[56] = hc_byte_perm (w[26], w[27], selector);
      w[55] = hc_byte_perm (w[25], w[26], selector);
      w[54] = hc_byte_perm (w[24], w[25], selector);
      w[53] = hc_byte_perm (w[23], w[24], selector);
      w[52] = hc_byte_perm (w[22], w[23], selector);
      w[51] = hc_byte_perm (w[21], w[22], selector);
      w[50] = hc_byte_perm (w[20], w[21], selector);
      w[49] = hc_byte_perm (w[19], w[20], selector);
      w[48] = hc_byte_perm (w[18], w[19], selector);
      w[47] = hc_byte_perm (w[17], w[18], selector);
      w[46] = hc_byte_perm (w[16], w[17], selector);
      w[45] = hc_byte_perm (w[15], w[16], selector);
      w[44] = hc_byte_perm (w[14], w[15], selector);
      w[43] = hc_byte_perm (w[13], w[14], selector);
      w[42] = hc_byte_perm (w[12], w[13], selector);
      w[41] = hc_byte_perm (w[11], w[12], selector);
      w[40] = hc_byte_perm (w[10], w[11], selector);
      w[39] = hc_byte_perm (w[ 9], w[10], selector);
      w[38] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[37] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[36] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[35] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[34] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[33] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[32] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[31] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[30] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[29] = hc_byte_perm (    0, w[ 0], selector);
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 30:
      w[63] = hc_byte_perm (w[32], w[33], selector);
      w[62] = hc_byte_perm (w[31], w[32], selector);
      w[61] = hc_byte_perm (w[30], w[31], selector);
      w[60] = hc_byte_perm (w[29], w[30], selector);
      w[59] = hc_byte_perm (w[28], w[29], selector);
      w[58] = hc_byte_perm (w[27], w[28], selector);
      w[57] = hc_byte_perm (w[26], w[27], selector);
      w[56] = hc_byte_perm (w[25], w[26], selector);
      w[55] = hc_byte_perm (w[24], w[25], selector);
      w[54] = hc_byte_perm (w[23], w[24], selector);
      w[53] = hc_byte_perm (w[22], w[23], selector);
      w[52] = hc_byte_perm (w[21], w[22], selector);
      w[51] = hc_byte_perm (w[20], w[21], selector);
      w[50] = hc_byte_perm (w[19], w[20], selector);
      w[49] = hc_byte_perm (w[18], w[19], selector);
      w[48] = hc_byte_perm (w[17], w[18], selector);
      w[47] = hc_byte_perm (w[16], w[17], selector);
      w[46] = hc_byte_perm (w[15], w[16], selector);
      w[45] = hc_byte_perm (w[14], w[15], selector);
      w[44] = hc_byte_perm (w[13], w[14], selector);
      w[43] = hc_byte_perm (w[12], w[13], selector);
      w[42] = hc_byte_perm (w[11], w[12], selector);
      w[41] = hc_byte_perm (w[10], w[11], selector);
      w[40] = hc_byte_perm (w[ 9], w[10], selector);
      w[39] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[38] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[37] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[36] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[35] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[34] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[33] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[32] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[31] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[30] = hc_byte_perm (    0, w[ 0], selector);
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 31:
      w[63] = hc_byte_perm (w[31], w[32], selector);
      w[62] = hc_byte_perm (w[30], w[31], selector);
      w[61] = hc_byte_perm (w[29], w[30], selector);
      w[60] = hc_byte_perm (w[28], w[29], selector);
      w[59] = hc_byte_perm (w[27], w[28], selector);
      w[58] = hc_byte_perm (w[26], w[27], selector);
      w[57] = hc_byte_perm (w[25], w[26], selector);
      w[56] = hc_byte_perm (w[24], w[25], selector);
      w[55] = hc_byte_perm (w[23], w[24], selector);
      w[54] = hc_byte_perm (w[22], w[23], selector);
      w[53] = hc_byte_perm (w[21], w[22], selector);
      w[52] = hc_byte_perm (w[20], w[21], selector);
      w[51] = hc_byte_perm (w[19], w[20], selector);
      w[50] = hc_byte_perm (w[18], w[19], selector);
      w[49] = hc_byte_perm (w[17], w[18], selector);
      w[48] = hc_byte_perm (w[16], w[17], selector);
      w[47] = hc_byte_perm (w[15], w[16], selector);
      w[46] = hc_byte_perm (w[14], w[15], selector);
      w[45] = hc_byte_perm (w[13], w[14], selector);
      w[44] = hc_byte_perm (w[12], w[13], selector);
      w[43] = hc_byte_perm (w[11], w[12], selector);
      w[42] = hc_byte_perm (w[10], w[11], selector);
      w[41] = hc_byte_perm (w[ 9], w[10], selector);
      w[40] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[39] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[38] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[37] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[36] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[35] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[34] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[33] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[32] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[31] = hc_byte_perm (    0, w[ 0], selector);
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 32:
      w[63] = hc_byte_perm (w[30], w[31], selector);
      w[62] = hc_byte_perm (w[29], w[30], selector);
      w[61] = hc_byte_perm (w[28], w[29], selector);
      w[60] = hc_byte_perm (w[27], w[28], selector);
      w[59] = hc_byte_perm (w[26], w[27], selector);
      w[58] = hc_byte_perm (w[25], w[26], selector);
      w[57] = hc_byte_perm (w[24], w[25], selector);
      w[56] = hc_byte_perm (w[23], w[24], selector);
      w[55] = hc_byte_perm (w[22], w[23], selector);
      w[54] = hc_byte_perm (w[21], w[22], selector);
      w[53] = hc_byte_perm (w[20], w[21], selector);
      w[52] = hc_byte_perm (w[19], w[20], selector);
      w[51] = hc_byte_perm (w[18], w[19], selector);
      w[50] = hc_byte_perm (w[17], w[18], selector);
      w[49] = hc_byte_perm (w[16], w[17], selector);
      w[48] = hc_byte_perm (w[15], w[16], selector);
      w[47] = hc_byte_perm (w[14], w[15], selector);
      w[46] = hc_byte_perm (w[13], w[14], selector);
      w[45] = hc_byte_perm (w[12], w[13], selector);
      w[44] = hc_byte_perm (w[11], w[12], selector);
      w[43] = hc_byte_perm (w[10], w[11], selector);
      w[42] = hc_byte_perm (w[ 9], w[10], selector);
      w[41] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[40] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[39] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[38] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[37] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[36] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[35] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[34] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[33] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[32] = hc_byte_perm (    0, w[ 0], selector);
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 33:
      w[63] = hc_byte_perm (w[29], w[30], selector);
      w[62] = hc_byte_perm (w[28], w[29], selector);
      w[61] = hc_byte_perm (w[27], w[28], selector);
      w[60] = hc_byte_perm (w[26], w[27], selector);
      w[59] = hc_byte_perm (w[25], w[26], selector);
      w[58] = hc_byte_perm (w[24], w[25], selector);
      w[57] = hc_byte_perm (w[23], w[24], selector);
      w[56] = hc_byte_perm (w[22], w[23], selector);
      w[55] = hc_byte_perm (w[21], w[22], selector);
      w[54] = hc_byte_perm (w[20], w[21], selector);
      w[53] = hc_byte_perm (w[19], w[20], selector);
      w[52] = hc_byte_perm (w[18], w[19], selector);
      w[51] = hc_byte_perm (w[17], w[18], selector);
      w[50] = hc_byte_perm (w[16], w[17], selector);
      w[49] = hc_byte_perm (w[15], w[16], selector);
      w[48] = hc_byte_perm (w[14], w[15], selector);
      w[47] = hc_byte_perm (w[13], w[14], selector);
      w[46] = hc_byte_perm (w[12], w[13], selector);
      w[45] = hc_byte_perm (w[11], w[12], selector);
      w[44] = hc_byte_perm (w[10], w[11], selector);
      w[43] = hc_byte_perm (w[ 9], w[10], selector);
      w[42] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[41] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[40] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[39] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[38] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[37] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[36] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[35] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[34] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[33] = hc_byte_perm (    0, w[ 0], selector);
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 34:
      w[63] = hc_byte_perm (w[28], w[29], selector);
      w[62] = hc_byte_perm (w[27], w[28], selector);
      w[61] = hc_byte_perm (w[26], w[27], selector);
      w[60] = hc_byte_perm (w[25], w[26], selector);
      w[59] = hc_byte_perm (w[24], w[25], selector);
      w[58] = hc_byte_perm (w[23], w[24], selector);
      w[57] = hc_byte_perm (w[22], w[23], selector);
      w[56] = hc_byte_perm (w[21], w[22], selector);
      w[55] = hc_byte_perm (w[20], w[21], selector);
      w[54] = hc_byte_perm (w[19], w[20], selector);
      w[53] = hc_byte_perm (w[18], w[19], selector);
      w[52] = hc_byte_perm (w[17], w[18], selector);
      w[51] = hc_byte_perm (w[16], w[17], selector);
      w[50] = hc_byte_perm (w[15], w[16], selector);
      w[49] = hc_byte_perm (w[14], w[15], selector);
      w[48] = hc_byte_perm (w[13], w[14], selector);
      w[47] = hc_byte_perm (w[12], w[13], selector);
      w[46] = hc_byte_perm (w[11], w[12], selector);
      w[45] = hc_byte_perm (w[10], w[11], selector);
      w[44] = hc_byte_perm (w[ 9], w[10], selector);
      w[43] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[42] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[41] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[40] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[39] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[38] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[37] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[36] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[35] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[34] = hc_byte_perm (    0, w[ 0], selector);
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 35:
      w[63] = hc_byte_perm (w[27], w[28], selector);
      w[62] = hc_byte_perm (w[26], w[27], selector);
      w[61] = hc_byte_perm (w[25], w[26], selector);
      w[60] = hc_byte_perm (w[24], w[25], selector);
      w[59] = hc_byte_perm (w[23], w[24], selector);
      w[58] = hc_byte_perm (w[22], w[23], selector);
      w[57] = hc_byte_perm (w[21], w[22], selector);
      w[56] = hc_byte_perm (w[20], w[21], selector);
      w[55] = hc_byte_perm (w[19], w[20], selector);
      w[54] = hc_byte_perm (w[18], w[19], selector);
      w[53] = hc_byte_perm (w[17], w[18], selector);
      w[52] = hc_byte_perm (w[16], w[17], selector);
      w[51] = hc_byte_perm (w[15], w[16], selector);
      w[50] = hc_byte_perm (w[14], w[15], selector);
      w[49] = hc_byte_perm (w[13], w[14], selector);
      w[48] = hc_byte_perm (w[12], w[13], selector);
      w[47] = hc_byte_perm (w[11], w[12], selector);
      w[46] = hc_byte_perm (w[10], w[11], selector);
      w[45] = hc_byte_perm (w[ 9], w[10], selector);
      w[44] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[43] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[42] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[41] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[40] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[39] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[38] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[37] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[36] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[35] = hc_byte_perm (    0, w[ 0], selector);
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 36:
      w[63] = hc_byte_perm (w[26], w[27], selector);
      w[62] = hc_byte_perm (w[25], w[26], selector);
      w[61] = hc_byte_perm (w[24], w[25], selector);
      w[60] = hc_byte_perm (w[23], w[24], selector);
      w[59] = hc_byte_perm (w[22], w[23], selector);
      w[58] = hc_byte_perm (w[21], w[22], selector);
      w[57] = hc_byte_perm (w[20], w[21], selector);
      w[56] = hc_byte_perm (w[19], w[20], selector);
      w[55] = hc_byte_perm (w[18], w[19], selector);
      w[54] = hc_byte_perm (w[17], w[18], selector);
      w[53] = hc_byte_perm (w[16], w[17], selector);
      w[52] = hc_byte_perm (w[15], w[16], selector);
      w[51] = hc_byte_perm (w[14], w[15], selector);
      w[50] = hc_byte_perm (w[13], w[14], selector);
      w[49] = hc_byte_perm (w[12], w[13], selector);
      w[48] = hc_byte_perm (w[11], w[12], selector);
      w[47] = hc_byte_perm (w[10], w[11], selector);
      w[46] = hc_byte_perm (w[ 9], w[10], selector);
      w[45] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[44] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[43] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[42] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[41] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[40] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[39] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[38] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[37] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[36] = hc_byte_perm (    0, w[ 0], selector);
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 37:
      w[63] = hc_byte_perm (w[25], w[26], selector);
      w[62] = hc_byte_perm (w[24], w[25], selector);
      w[61] = hc_byte_perm (w[23], w[24], selector);
      w[60] = hc_byte_perm (w[22], w[23], selector);
      w[59] = hc_byte_perm (w[21], w[22], selector);
      w[58] = hc_byte_perm (w[20], w[21], selector);
      w[57] = hc_byte_perm (w[19], w[20], selector);
      w[56] = hc_byte_perm (w[18], w[19], selector);
      w[55] = hc_byte_perm (w[17], w[18], selector);
      w[54] = hc_byte_perm (w[16], w[17], selector);
      w[53] = hc_byte_perm (w[15], w[16], selector);
      w[52] = hc_byte_perm (w[14], w[15], selector);
      w[51] = hc_byte_perm (w[13], w[14], selector);
      w[50] = hc_byte_perm (w[12], w[13], selector);
      w[49] = hc_byte_perm (w[11], w[12], selector);
      w[48] = hc_byte_perm (w[10], w[11], selector);
      w[47] = hc_byte_perm (w[ 9], w[10], selector);
      w[46] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[45] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[44] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[43] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[42] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[41] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[40] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[39] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[38] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[37] = hc_byte_perm (    0, w[ 0], selector);
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 38:
      w[63] = hc_byte_perm (w[24], w[25], selector);
      w[62] = hc_byte_perm (w[23], w[24], selector);
      w[61] = hc_byte_perm (w[22], w[23], selector);
      w[60] = hc_byte_perm (w[21], w[22], selector);
      w[59] = hc_byte_perm (w[20], w[21], selector);
      w[58] = hc_byte_perm (w[19], w[20], selector);
      w[57] = hc_byte_perm (w[18], w[19], selector);
      w[56] = hc_byte_perm (w[17], w[18], selector);
      w[55] = hc_byte_perm (w[16], w[17], selector);
      w[54] = hc_byte_perm (w[15], w[16], selector);
      w[53] = hc_byte_perm (w[14], w[15], selector);
      w[52] = hc_byte_perm (w[13], w[14], selector);
      w[51] = hc_byte_perm (w[12], w[13], selector);
      w[50] = hc_byte_perm (w[11], w[12], selector);
      w[49] = hc_byte_perm (w[10], w[11], selector);
      w[48] = hc_byte_perm (w[ 9], w[10], selector);
      w[47] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[46] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[45] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[44] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[43] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[42] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[41] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[40] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[39] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[38] = hc_byte_perm (    0, w[ 0], selector);
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 39:
      w[63] = hc_byte_perm (w[23], w[24], selector);
      w[62] = hc_byte_perm (w[22], w[23], selector);
      w[61] = hc_byte_perm (w[21], w[22], selector);
      w[60] = hc_byte_perm (w[20], w[21], selector);
      w[59] = hc_byte_perm (w[19], w[20], selector);
      w[58] = hc_byte_perm (w[18], w[19], selector);
      w[57] = hc_byte_perm (w[17], w[18], selector);
      w[56] = hc_byte_perm (w[16], w[17], selector);
      w[55] = hc_byte_perm (w[15], w[16], selector);
      w[54] = hc_byte_perm (w[14], w[15], selector);
      w[53] = hc_byte_perm (w[13], w[14], selector);
      w[52] = hc_byte_perm (w[12], w[13], selector);
      w[51] = hc_byte_perm (w[11], w[12], selector);
      w[50] = hc_byte_perm (w[10], w[11], selector);
      w[49] = hc_byte_perm (w[ 9], w[10], selector);
      w[48] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[47] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[46] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[45] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[44] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[43] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[42] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[41] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[40] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[39] = hc_byte_perm (    0, w[ 0], selector);
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 40:
      w[63] = hc_byte_perm (w[22], w[23], selector);
      w[62] = hc_byte_perm (w[21], w[22], selector);
      w[61] = hc_byte_perm (w[20], w[21], selector);
      w[60] = hc_byte_perm (w[19], w[20], selector);
      w[59] = hc_byte_perm (w[18], w[19], selector);
      w[58] = hc_byte_perm (w[17], w[18], selector);
      w[57] = hc_byte_perm (w[16], w[17], selector);
      w[56] = hc_byte_perm (w[15], w[16], selector);
      w[55] = hc_byte_perm (w[14], w[15], selector);
      w[54] = hc_byte_perm (w[13], w[14], selector);
      w[53] = hc_byte_perm (w[12], w[13], selector);
      w[52] = hc_byte_perm (w[11], w[12], selector);
      w[51] = hc_byte_perm (w[10], w[11], selector);
      w[50] = hc_byte_perm (w[ 9], w[10], selector);
      w[49] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[48] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[47] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[46] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[45] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[44] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[43] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[42] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[41] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[40] = hc_byte_perm (    0, w[ 0], selector);
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 41:
      w[63] = hc_byte_perm (w[21], w[22], selector);
      w[62] = hc_byte_perm (w[20], w[21], selector);
      w[61] = hc_byte_perm (w[19], w[20], selector);
      w[60] = hc_byte_perm (w[18], w[19], selector);
      w[59] = hc_byte_perm (w[17], w[18], selector);
      w[58] = hc_byte_perm (w[16], w[17], selector);
      w[57] = hc_byte_perm (w[15], w[16], selector);
      w[56] = hc_byte_perm (w[14], w[15], selector);
      w[55] = hc_byte_perm (w[13], w[14], selector);
      w[54] = hc_byte_perm (w[12], w[13], selector);
      w[53] = hc_byte_perm (w[11], w[12], selector);
      w[52] = hc_byte_perm (w[10], w[11], selector);
      w[51] = hc_byte_perm (w[ 9], w[10], selector);
      w[50] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[49] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[48] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[47] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[46] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[45] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[44] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[43] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[42] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[41] = hc_byte_perm (    0, w[ 0], selector);
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 42:
      w[63] = hc_byte_perm (w[20], w[21], selector);
      w[62] = hc_byte_perm (w[19], w[20], selector);
      w[61] = hc_byte_perm (w[18], w[19], selector);
      w[60] = hc_byte_perm (w[17], w[18], selector);
      w[59] = hc_byte_perm (w[16], w[17], selector);
      w[58] = hc_byte_perm (w[15], w[16], selector);
      w[57] = hc_byte_perm (w[14], w[15], selector);
      w[56] = hc_byte_perm (w[13], w[14], selector);
      w[55] = hc_byte_perm (w[12], w[13], selector);
      w[54] = hc_byte_perm (w[11], w[12], selector);
      w[53] = hc_byte_perm (w[10], w[11], selector);
      w[52] = hc_byte_perm (w[ 9], w[10], selector);
      w[51] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[50] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[49] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[48] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[47] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[46] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[45] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[44] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[43] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[42] = hc_byte_perm (    0, w[ 0], selector);
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 43:
      w[63] = hc_byte_perm (w[19], w[20], selector);
      w[62] = hc_byte_perm (w[18], w[19], selector);
      w[61] = hc_byte_perm (w[17], w[18], selector);
      w[60] = hc_byte_perm (w[16], w[17], selector);
      w[59] = hc_byte_perm (w[15], w[16], selector);
      w[58] = hc_byte_perm (w[14], w[15], selector);
      w[57] = hc_byte_perm (w[13], w[14], selector);
      w[56] = hc_byte_perm (w[12], w[13], selector);
      w[55] = hc_byte_perm (w[11], w[12], selector);
      w[54] = hc_byte_perm (w[10], w[11], selector);
      w[53] = hc_byte_perm (w[ 9], w[10], selector);
      w[52] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[51] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[50] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[49] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[48] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[47] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[46] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[45] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[44] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[43] = hc_byte_perm (    0, w[ 0], selector);
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 44:
      w[63] = hc_byte_perm (w[18], w[19], selector);
      w[62] = hc_byte_perm (w[17], w[18], selector);
      w[61] = hc_byte_perm (w[16], w[17], selector);
      w[60] = hc_byte_perm (w[15], w[16], selector);
      w[59] = hc_byte_perm (w[14], w[15], selector);
      w[58] = hc_byte_perm (w[13], w[14], selector);
      w[57] = hc_byte_perm (w[12], w[13], selector);
      w[56] = hc_byte_perm (w[11], w[12], selector);
      w[55] = hc_byte_perm (w[10], w[11], selector);
      w[54] = hc_byte_perm (w[ 9], w[10], selector);
      w[53] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[52] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[51] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[50] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[49] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[48] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[47] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[46] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[45] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[44] = hc_byte_perm (    0, w[ 0], selector);
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 45:
      w[63] = hc_byte_perm (w[17], w[18], selector);
      w[62] = hc_byte_perm (w[16], w[17], selector);
      w[61] = hc_byte_perm (w[15], w[16], selector);
      w[60] = hc_byte_perm (w[14], w[15], selector);
      w[59] = hc_byte_perm (w[13], w[14], selector);
      w[58] = hc_byte_perm (w[12], w[13], selector);
      w[57] = hc_byte_perm (w[11], w[12], selector);
      w[56] = hc_byte_perm (w[10], w[11], selector);
      w[55] = hc_byte_perm (w[ 9], w[10], selector);
      w[54] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[53] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[52] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[51] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[50] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[49] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[48] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[47] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[46] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[45] = hc_byte_perm (    0, w[ 0], selector);
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 46:
      w[63] = hc_byte_perm (w[16], w[17], selector);
      w[62] = hc_byte_perm (w[15], w[16], selector);
      w[61] = hc_byte_perm (w[14], w[15], selector);
      w[60] = hc_byte_perm (w[13], w[14], selector);
      w[59] = hc_byte_perm (w[12], w[13], selector);
      w[58] = hc_byte_perm (w[11], w[12], selector);
      w[57] = hc_byte_perm (w[10], w[11], selector);
      w[56] = hc_byte_perm (w[ 9], w[10], selector);
      w[55] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[54] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[53] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[52] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[51] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[50] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[49] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[48] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[47] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[46] = hc_byte_perm (    0, w[ 0], selector);
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 47:
      w[63] = hc_byte_perm (w[15], w[16], selector);
      w[62] = hc_byte_perm (w[14], w[15], selector);
      w[61] = hc_byte_perm (w[13], w[14], selector);
      w[60] = hc_byte_perm (w[12], w[13], selector);
      w[59] = hc_byte_perm (w[11], w[12], selector);
      w[58] = hc_byte_perm (w[10], w[11], selector);
      w[57] = hc_byte_perm (w[ 9], w[10], selector);
      w[56] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[55] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[54] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[53] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[52] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[51] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[50] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[49] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[48] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[47] = hc_byte_perm (    0, w[ 0], selector);
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 48:
      w[63] = hc_byte_perm (w[14], w[15], selector);
      w[62] = hc_byte_perm (w[13], w[14], selector);
      w[61] = hc_byte_perm (w[12], w[13], selector);
      w[60] = hc_byte_perm (w[11], w[12], selector);
      w[59] = hc_byte_perm (w[10], w[11], selector);
      w[58] = hc_byte_perm (w[ 9], w[10], selector);
      w[57] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[56] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[55] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[54] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[53] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[52] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[51] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[50] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[49] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[48] = hc_byte_perm (    0, w[ 0], selector);
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 49:
      w[63] = hc_byte_perm (w[13], w[14], selector);
      w[62] = hc_byte_perm (w[12], w[13], selector);
      w[61] = hc_byte_perm (w[11], w[12], selector);
      w[60] = hc_byte_perm (w[10], w[11], selector);
      w[59] = hc_byte_perm (w[ 9], w[10], selector);
      w[58] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[57] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[56] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[55] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[54] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[53] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[52] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[51] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[50] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[49] = hc_byte_perm (    0, w[ 0], selector);
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 50:
      w[63] = hc_byte_perm (w[12], w[13], selector);
      w[62] = hc_byte_perm (w[11], w[12], selector);
      w[61] = hc_byte_perm (w[10], w[11], selector);
      w[60] = hc_byte_perm (w[ 9], w[10], selector);
      w[59] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[58] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[57] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[56] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[55] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[54] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[53] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[52] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[51] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[50] = hc_byte_perm (    0, w[ 0], selector);
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 51:
      w[63] = hc_byte_perm (w[11], w[12], selector);
      w[62] = hc_byte_perm (w[10], w[11], selector);
      w[61] = hc_byte_perm (w[ 9], w[10], selector);
      w[60] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[59] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[58] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[57] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[56] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[55] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[54] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[53] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[52] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[51] = hc_byte_perm (    0, w[ 0], selector);
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 52:
      w[63] = hc_byte_perm (w[10], w[11], selector);
      w[62] = hc_byte_perm (w[ 9], w[10], selector);
      w[61] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[60] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[59] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[58] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[57] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[56] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[55] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[54] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[53] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[52] = hc_byte_perm (    0, w[ 0], selector);
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 53:
      w[63] = hc_byte_perm (w[ 9], w[10], selector);
      w[62] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[61] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[60] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[59] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[58] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[57] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[56] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[55] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[54] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[53] = hc_byte_perm (    0, w[ 0], selector);
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 54:
      w[63] = hc_byte_perm (w[ 8], w[ 9], selector);
      w[62] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[61] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[60] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[59] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[58] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[57] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[56] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[55] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[54] = hc_byte_perm (    0, w[ 0], selector);
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 55:
      w[63] = hc_byte_perm (w[ 7], w[ 8], selector);
      w[62] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[61] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[60] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[59] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[58] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[57] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[56] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[55] = hc_byte_perm (    0, w[ 0], selector);
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 56:
      w[63] = hc_byte_perm (w[ 6], w[ 7], selector);
      w[62] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[61] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[60] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[59] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[58] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[57] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[56] = hc_byte_perm (    0, w[ 0], selector);
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 57:
      w[63] = hc_byte_perm (w[ 5], w[ 6], selector);
      w[62] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[61] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[60] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[59] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[58] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[57] = hc_byte_perm (    0, w[ 0], selector);
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 58:
      w[63] = hc_byte_perm (w[ 4], w[ 5], selector);
      w[62] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[61] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[60] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[59] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[58] = hc_byte_perm (    0, w[ 0], selector);
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 59:
      w[63] = hc_byte_perm (w[ 3], w[ 4], selector);
      w[62] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[61] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[60] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[59] = hc_byte_perm (    0, w[ 0], selector);
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 60:
      w[63] = hc_byte_perm (w[ 2], w[ 3], selector);
      w[62] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[61] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[60] = hc_byte_perm (    0, w[ 0], selector);
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 61:
      w[63] = hc_byte_perm (w[ 1], w[ 2], selector);
      w[62] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[61] = hc_byte_perm (    0, w[ 0], selector);
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 62:
      w[63] = hc_byte_perm (w[ 0], w[ 1], selector);
      w[62] = hc_byte_perm (    0, w[ 0], selector);
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 63:
      w[63] = hc_byte_perm (    0, w[ 0], selector);
      w[62] = 0;
      w[61] = 0;
      w[60] = 0;
      w[59] = 0;
      w[58] = 0;
      w[57] = 0;
      w[56] = 0;
      w[55] = 0;
      w[54] = 0;
      w[53] = 0;
      w[52] = 0;
      w[51] = 0;
      w[50] = 0;
      w[49] = 0;
      w[48] = 0;
      w[47] = 0;
      w[46] = 0;
      w[45] = 0;
      w[44] = 0;
      w[43] = 0;
      w[42] = 0;
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_1x64_be (u32x *w, const u32 offset)
{
  const int offset_switch = offset / 4;

  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
  switch (offset_switch)
  {
    case  0:
      w[63] = hc_bytealign_be (w[62], w[63], offset);
      w[62] = hc_bytealign_be (w[61], w[62], offset);
      w[61] = hc_bytealign_be (w[60], w[61], offset);
      w[60] = hc_bytealign_be (w[59], w[60], offset);
      w[59] = hc_bytealign_be (w[58], w[59], offset);
      w[58] = hc_bytealign_be (w[57], w[58], offset);
      w[57] = hc_bytealign_be (w[56], w[57], offset);
      w[56] = hc_bytealign_be (w[55], w[56], offset);
      w[55] = hc_bytealign_be (w[54], w[55], offset);
      w[54] = hc_bytealign_be (w[53], w[54], offset);
      w[53] = hc_bytealign_be (w[52], w[53], offset);
      w[52] = hc_bytealign_be (w[51], w[52], offset);
      w[51] = hc_bytealign_be (w[50], w[51], offset);
      w[50] = hc_bytealign_be (w[49], w[50], offset);
      w[49] = hc_bytealign_be (w[48], w[49], offset);
      w[48] = hc_bytealign_be (w[47], w[48], offset);
      w[47] = hc_bytealign_be (w[46], w[47], offset);
      w[46] = hc_bytealign_be (w[45], w[46], offset);
      w[45] = hc_bytealign_be (w[44], w[45], offset);
      w[44] = hc_bytealign_be (w[43], w[44], offset);
      w[43] = hc_bytealign_be (w[42], w[43], offset);
      w[42] = hc_bytealign_be (w[41], w[42], offset);
      w[41] = hc_bytealign_be (w[40], w[41], offset);
      w[40] = hc_bytealign_be (w[39], w[40], offset);
      w[39] = hc_bytealign_be (w[38], w[39], offset);
      w[38] = hc_bytealign_be (w[37], w[38], offset);
      w[37] = hc_bytealign_be (w[36], w[37], offset);
      w[36] = hc_bytealign_be (w[35], w[36], offset);
      w[35] = hc_bytealign_be (w[34], w[35], offset);
      w[34] = hc_bytealign_be (w[33], w[34], offset);
      w[33] = hc_bytealign_be (w[32], w[33], offset);
      w[32] = hc_bytealign_be (w[31], w[32], offset);
      w[31] = hc_bytealign_be (w[30], w[31], offset);
      w[30] = hc_bytealign_be (w[29], w[30], offset);
      w[29] = hc_bytealign_be (w[28], w[29], offset);
      w[28] = hc_bytealign_be (w[27], w[28], offset);
      w[27] = hc_bytealign_be (w[26], w[27], offset);
      w[26] = hc_bytealign_be (w[25], w[26], offset);
      w[25] = hc_bytealign_be (w[24], w[25], offset);
      w[24] = hc_bytealign_be (w[23], w[24], offset);
      w[23] = hc_bytealign_be (w[22], w[23], offset);
      w[22] = hc_bytealign_be (w[21], w[22], offset);
      w[21] = hc_bytealign_be (w[20], w[21], offset);
      w[20] = hc_bytealign_be (w[19], w[20], offset);
      w[19] = hc_bytealign_be (w[18], w[19], offset);
      w[18] = hc_bytealign_be (w[17], w[18], offset);
      w[17] = hc_bytealign_be (w[16], w[17], offset);
      w[16] = hc_bytealign_be (w[15], w[16], offset);
      w[15] = hc_bytealign_be (w[14], w[15], offset);
      w[14] = hc_bytealign_be (w[13], w[14], offset);
      w[13] = hc_bytealign_be (w[12], w[13], offset);
      w[12] = hc_bytealign_be (w[11], w[12], offset);
      w[11] = hc_bytealign_be (w[10], w[11], offset);
      w[10] = hc_bytealign_be (w[ 9], w[10], offset);
      w[ 9] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[ 8] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[ 7] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[ 6] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[ 5] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[ 4] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[ 3] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[ 2] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[ 1] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[ 0] = hc_bytealign_be (    0, w[ 0], offset);

      break;

    case  1:
      w[63] = hc_bytealign_be (w[61], w[62], offset);
      w[62] = hc_bytealign_be (w[60], w[61], offset);
      w[61] = hc_bytealign_be (w[59], w[60], offset);
      w[60] = hc_bytealign_be (w[58], w[59], offset);
      w[59] = hc_bytealign_be (w[57], w[58], offset);
      w[58] = hc_bytealign_be (w[56], w[57], offset);
      w[57] = hc_bytealign_be (w[55], w[56], offset);
      w[56] = hc_bytealign_be (w[54], w[55], offset);
      w[55] = hc_bytealign_be (w[53], w[54], offset);
      w[54] = hc_bytealign_be (w[52], w[53], offset);
      w[53] = hc_bytealign_be (w[51], w[52], offset);
      w[52] = hc_bytealign_be (w[50], w[51], offset);
      w[51] = hc_bytealign_be (w[49], w[50], offset);
      w[50] = hc_bytealign_be (w[48], w[49], offset);
      w[49] = hc_bytealign_be (w[47], w[48], offset);
      w[48] = hc_bytealign_be (w[46], w[47], offset);
      w[47] = hc_bytealign_be (w[45], w[46], offset);
      w[46] = hc_bytealign_be (w[44], w[45], offset);
      w[45] = hc_bytealign_be (w[43], w[44], offset);
      w[44] = hc_bytealign_be (w[42], w[43], offset);
      w[43] = hc_bytealign_be (w[41], w[42], offset);
      w[42] = hc_bytealign_be (w[40], w[41], offset);
      w[41] = hc_bytealign_be (w[39], w[40], offset);
      w[40] = hc_bytealign_be (w[38], w[39], offset);
      w[39] = hc_bytealign_be (w[37], w[38], offset);
      w[38] = hc_bytealign_be (w[36], w[37], offset);
      w[37] = hc_bytealign_be (w[35], w[36], offset);
      w[36] = hc_bytealign_be (w[34], w[35], offset);
      w[35] = hc_bytealign_be (w[33], w[34], offset);
      w[34] = hc_bytealign_be (w[32], w[33], offset);
      w[33] = hc_bytealign_be (w[31], w[32], offset);
      w[32] = hc_bytealign_be (w[30], w[31], offset);
      w[31] = hc_bytealign_be (w[29], w[30], offset);
      w[30] = hc_bytealign_be (w[28], w[29], offset);
      w[29] = hc_bytealign_be (w[27], w[28], offset);
      w[28] = hc_bytealign_be (w[26], w[27], offset);
      w[27] = hc_bytealign_be (w[25], w[26], offset);
      w[26] = hc_bytealign_be (w[24], w[25], offset);
      w[25] = hc_bytealign_be (w[23], w[24], offset);
      w[24] = hc_bytealign_be (w[22], w[23], offset);
      w[23] = hc_bytealign_be (w[21], w[22], offset);
      w[22] = hc_bytealign_be (w[20], w[21], offset);
      w[21] = hc_bytealign_be (w[19], w[20], offset);
      w[20] = hc_bytealign_be (w[18], w[19], offset);
      w[19] = hc_bytealign_be (w[17], w[18], offset);
      w[18] = hc_bytealign_be (w[16], w[17], offset);
      w[17] = hc_bytealign_be (w[15], w[16], offset);
      w[16] = hc_bytealign_be (w[14], w[15], offset);
      w[15] = hc_bytealign_be (w[13], w[14], offset);
      w[14] = hc_bytealign_be (w[12], w[13], offset);
      w[13] = hc_bytealign_be (w[11], w[12], offset);
      w[12] = hc_bytealign_be (w[10], w[11], offset);
      w[11] = hc_bytealign_be (w[ 9], w[10], offset);
      w[10] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[ 9] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[ 8] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[ 7] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[ 6] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[ 5] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[ 4] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[ 3] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[ 2] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[ 1] = hc_bytealign_be (    0, w[ 0], offset);
      w[ 0] = 0;

      break;

    case  2:
      w[63] = hc_bytealign_be (w[60], w[61], offset);
      w[62] = hc_bytealign_be (w[59], w[60], offset);
      w[61] = hc_bytealign_be (w[58], w[59], offset);
      w[60] = hc_bytealign_be (w[57], w[58], offset);
      w[59] = hc_bytealign_be (w[56], w[57], offset);
      w[58] = hc_bytealign_be (w[55], w[56], offset);
      w[57] = hc_bytealign_be (w[54], w[55], offset);
      w[56] = hc_bytealign_be (w[53], w[54], offset);
      w[55] = hc_bytealign_be (w[52], w[53], offset);
      w[54] = hc_bytealign_be (w[51], w[52], offset);
      w[53] = hc_bytealign_be (w[50], w[51], offset);
      w[52] = hc_bytealign_be (w[49], w[50], offset);
      w[51] = hc_bytealign_be (w[48], w[49], offset);
      w[50] = hc_bytealign_be (w[47], w[48], offset);
      w[49] = hc_bytealign_be (w[46], w[47], offset);
      w[48] = hc_bytealign_be (w[45], w[46], offset);
      w[47] = hc_bytealign_be (w[44], w[45], offset);
      w[46] = hc_bytealign_be (w[43], w[44], offset);
      w[45] = hc_bytealign_be (w[42], w[43], offset);
      w[44] = hc_bytealign_be (w[41], w[42], offset);
      w[43] = hc_bytealign_be (w[40], w[41], offset);
      w[42] = hc_bytealign_be (w[39], w[40], offset);
      w[41] = hc_bytealign_be (w[38], w[39], offset);
      w[40] = hc_bytealign_be (w[37], w[38], offset);
      w[39] = hc_bytealign_be (w[36], w[37], offset);
      w[38] = hc_bytealign_be (w[35], w[36], offset);
      w[37] = hc_bytealign_be (w[34], w[35], offset);
      w[36] = hc_bytealign_be (w[33], w[34], offset);
      w[35] = hc_bytealign_be (w[32], w[33], offset);
      w[34] = hc_bytealign_be (w[31], w[32], offset);
      w[33] = hc_bytealign_be (w[30], w[31], offset);
      w[32] = hc_bytealign_be (w[29], w[30], offset);
      w[31] = hc_bytealign_be (w[28], w[29], offset);
      w[30] = hc_bytealign_be (w[27], w[28], offset);
      w[29] = hc_bytealign_be (w[26], w[27], offset);
      w[28] = hc_bytealign_be (w[25], w[26], offset);
      w[27] = hc_bytealign_be (w[24], w[25], offset);
      w[26] = hc_bytealign_be (w[23], w[24], offset);
      w[25] = hc_bytealign_be (w[22], w[23], offset);
      w[24] = hc_bytealign_be (w[21], w[22], offset);
      w[23] = hc_bytealign_be (w[20], w[21], offset);
      w[22] = hc_bytealign_be (w[19], w[20], offset);
      w[21] = hc_bytealign_be (w[18], w[19], offset);
      w[20] = hc_bytealign_be (w[17], w[18], offset);
      w[19] = hc_bytealign_be (w[16], w[17], offset);
      w[18] = hc_bytealign_be (w[15], w[16], offset);
      w[17] = hc_bytealign_be (w[14], w[15], offset);
      w[16] = hc_bytealign_be (w[13], w[14], offset);
      w[15] = hc_bytealign_be (w[12], w[13], offset);
      w[14] = hc_bytealign_be (w[11], w[12], offset);
      w[13] = hc_bytealign_be (w[10], w[11], offset);
      w[12] = hc_bytealign_be (w[ 9], w[10], offset);
      w[11] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[10] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[ 9] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[ 8] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[ 7] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[ 6] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[ 5] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[ 4] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[ 3] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[ 2] = hc_bytealign_be (    0, w[ 0], offset);
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  3:
      w[63] = hc_bytealign_be (w[59], w[60], offset);
      w[62] = hc_bytealign_be (w[58], w[59], offset);
      w[61] = hc_bytealign_be (w[57], w[58], offset);
      w[60] = hc_bytealign_be (w[56], w[57], offset);
      w[59] = hc_bytealign_be (w[55], w[56], offset);
      w[58] = hc_bytealign_be (w[54], w[55], offset);
      w[57] = hc_bytealign_be (w[53], w[54], offset);
      w[56] = hc_bytealign_be (w[52], w[53], offset);
      w[55] = hc_bytealign_be (w[51], w[52], offset);
      w[54] = hc_bytealign_be (w[50], w[51], offset);
      w[53] = hc_bytealign_be (w[49], w[50], offset);
      w[52] = hc_bytealign_be (w[48], w[49], offset);
      w[51] = hc_bytealign_be (w[47], w[48], offset);
      w[50] = hc_bytealign_be (w[46], w[47], offset);
      w[49] = hc_bytealign_be (w[45], w[46], offset);
      w[48] = hc_bytealign_be (w[44], w[45], offset);
      w[47] = hc_bytealign_be (w[43], w[44], offset);
      w[46] = hc_bytealign_be (w[42], w[43], offset);
      w[45] = hc_bytealign_be (w[41], w[42], offset);
      w[44] = hc_bytealign_be (w[40], w[41], offset);
      w[43] = hc_bytealign_be (w[39], w[40], offset);
      w[42] = hc_bytealign_be (w[38], w[39], offset);
      w[41] = hc_bytealign_be (w[37], w[38], offset);
      w[40] = hc_bytealign_be (w[36], w[37], offset);
      w[39] = hc_bytealign_be (w[35], w[36], offset);
      w[38] = hc_bytealign_be (w[34], w[35], offset);
      w[37] = hc_bytealign_be (w[33], w[34], offset);
      w[36] = hc_bytealign_be (w[32], w[33], offset);
      w[35] = hc_bytealign_be (w[31], w[32], offset);
      w[34] = hc_bytealign_be (w[30], w[31], offset);
      w[33] = hc_bytealign_be (w[29], w[30], offset);
      w[32] = hc_bytealign_be (w[28], w[29], offset);
      w[31] = hc_bytealign_be (w[27], w[28], offset);
      w[30] = hc_bytealign_be (w[26], w[27], offset);
      w[29] = hc_bytealign_be (w[25], w[26], offset);
      w[28] = hc_bytealign_be (w[24], w[25], offset);
      w[27] = hc_bytealign_be (w[23], w[24], offset);
      w[26] = hc_bytealign_be (w[22], w[23], offset);
      w[25] = hc_bytealign_be (w[21], w[22], offset);
      w[24] = hc_bytealign_be (w[20], w[21], offset);
      w[23] = hc_bytealign_be (w[19], w[20], offset);
      w[22] = hc_bytealign_be (w[18], w[19], offset);
      w[21] = hc_bytealign_be (w[17], w[18], offset);
      w[20] = hc_bytealign_be (w[16], w[17], offset);
      w[19] = hc_bytealign_be (w[15], w[16], offset);
      w[18] = hc_bytealign_be (w[14], w[15], offset);
      w[17] = hc_bytealign_be (w[13], w[14], offset);
      w[16] = hc_bytealign_be (w[12], w[13], offset);
      w[15] = hc_bytealign_be (w[11], w[12], offset);
      w[14] = hc_bytealign_be (w[10], w[11], offset);
      w[13] = hc_bytealign_be (w[ 9], w[10], offset);
      w[12] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[11] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[10] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[ 9] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[ 8] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[ 7] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[ 6] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[ 5] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[ 4] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[ 3] = hc_bytealign_be (    0, w[ 0], offset);
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  4:
      w[63] = hc_bytealign_be (w[58], w[59], offset);
      w[62] = hc_bytealign_be (w[57], w[58], offset);
      w[61] = hc_bytealign_be (w[56], w[57], offset);
      w[60] = hc_bytealign_be (w[55], w[56], offset);
      w[59] = hc_bytealign_be (w[54], w[55], offset);
      w[58] = hc_bytealign_be (w[53], w[54], offset);
      w[57] = hc_bytealign_be (w[52], w[53], offset);
      w[56] = hc_bytealign_be (w[51], w[52], offset);
      w[55] = hc_bytealign_be (w[50], w[51], offset);
      w[54] = hc_bytealign_be (w[49], w[50], offset);
      w[53] = hc_bytealign_be (w[48], w[49], offset);
      w[52] = hc_bytealign_be (w[47], w[48], offset);
      w[51] = hc_bytealign_be (w[46], w[47], offset);
      w[50] = hc_bytealign_be (w[45], w[46], offset);
      w[49] = hc_bytealign_be (w[44], w[45], offset);
      w[48] = hc_bytealign_be (w[43], w[44], offset);
      w[47] = hc_bytealign_be (w[42], w[43], offset);
      w[46] = hc_bytealign_be (w[41], w[42], offset);
      w[45] = hc_bytealign_be (w[40], w[41], offset);
      w[44] = hc_bytealign_be (w[39], w[40], offset);
      w[43] = hc_bytealign_be (w[38], w[39], offset);
      w[42] = hc_bytealign_be (w[37], w[38], offset);
      w[41] = hc_bytealign_be (w[36], w[37], offset);
      w[40] = hc_bytealign_be (w[35], w[36], offset);
      w[39] = hc_bytealign_be (w[34], w[35], offset);
      w[38] = hc_bytealign_be (w[33], w[34], offset);
      w[37] = hc_bytealign_be (w[32], w[33], offset);
      w[36] = hc_bytealign_be (w[31], w[32], offset);
      w[35] = hc_bytealign_be (w[30], w[31], offset);
      w[34] = hc_bytealign_be (w[29], w[30], offset);
      w[33] = hc_bytealign_be (w[28], w[29], offset);
      w[32] = hc_bytealign_be (w[27], w[28], offset);
      w[31] = hc_bytealign_be (w[26], w[27], offset);
      w[30] = hc_bytealign_be (w[25], w[26], offset);
      w[29] = hc_bytealign_be (w[24], w[25], offset);
      w[28] = hc_bytealign_be (w[23], w[24], offset);
      w[27] = hc_bytealign_be (w[22], w[23], offset);
      w[26] = hc_bytealign_be (w[21], w[22], offset);
      w[25] = hc_bytealign_be (w[20], w[21], offset);
      w[24] = hc_bytealign_be (w[19], w[20], offset);
      w[23] = hc_bytealign_be (w[18], w[19], offset);
      w[22] = hc_bytealign_be (w[17], w[18], offset);
      w[21] = hc_bytealign_be (w[16], w[17], offset);
      w[20] = hc_bytealign_be (w[15], w[16], offset);
      w[19] = hc_bytealign_be (w[14], w[15], offset);
      w[18] = hc_bytealign_be (w[13], w[14], offset);
      w[17] = hc_bytealign_be (w[12], w[13], offset);
      w[16] = hc_bytealign_be (w[11], w[12], offset);
      w[15] = hc_bytealign_be (w[10], w[11], offset);
      w[14] = hc_bytealign_be (w[ 9], w[10], offset);
      w[13] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[12] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[11] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[10] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[ 9] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[ 8] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[ 7] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[ 6] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[ 5] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[ 4] = hc_bytealign_be (    0, w[ 0], offset);
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  5:
      w[63] = hc_bytealign_be (w[57], w[58], offset);
      w[62] = hc_bytealign_be (w[56], w[57], offset);
      w[61] = hc_bytealign_be (w[55], w[56], offset);
      w[60] = hc_bytealign_be (w[54], w[55], offset);
      w[59] = hc_bytealign_be (w[53], w[54], offset);
      w[58] = hc_bytealign_be (w[52], w[53], offset);
      w[57] = hc_bytealign_be (w[51], w[52], offset);
      w[56] = hc_bytealign_be (w[50], w[51], offset);
      w[55] = hc_bytealign_be (w[49], w[50], offset);
      w[54] = hc_bytealign_be (w[48], w[49], offset);
      w[53] = hc_bytealign_be (w[47], w[48], offset);
      w[52] = hc_bytealign_be (w[46], w[47], offset);
      w[51] = hc_bytealign_be (w[45], w[46], offset);
      w[50] = hc_bytealign_be (w[44], w[45], offset);
      w[49] = hc_bytealign_be (w[43], w[44], offset);
      w[48] = hc_bytealign_be (w[42], w[43], offset);
      w[47] = hc_bytealign_be (w[41], w[42], offset);
      w[46] = hc_bytealign_be (w[40], w[41], offset);
      w[45] = hc_bytealign_be (w[39], w[40], offset);
      w[44] = hc_bytealign_be (w[38], w[39], offset);
      w[43] = hc_bytealign_be (w[37], w[38], offset);
      w[42] = hc_bytealign_be (w[36], w[37], offset);
      w[41] = hc_bytealign_be (w[35], w[36], offset);
      w[40] = hc_bytealign_be (w[34], w[35], offset);
      w[39] = hc_bytealign_be (w[33], w[34], offset);
      w[38] = hc_bytealign_be (w[32], w[33], offset);
      w[37] = hc_bytealign_be (w[31], w[32], offset);
      w[36] = hc_bytealign_be (w[30], w[31], offset);
      w[35] = hc_bytealign_be (w[29], w[30], offset);
      w[34] = hc_bytealign_be (w[28], w[29], offset);
      w[33] = hc_bytealign_be (w[27], w[28], offset);
      w[32] = hc_bytealign_be (w[26], w[27], offset);
      w[31] = hc_bytealign_be (w[25], w[26], offset);
      w[30] = hc_bytealign_be (w[24], w[25], offset);
      w[29] = hc_bytealign_be (w[23], w[24], offset);
      w[28] = hc_bytealign_be (w[22], w[23], offset);
      w[27] = hc_bytealign_be (w[21], w[22], offset);
      w[26] = hc_bytealign_be (w[20], w[21], offset);
      w[25] = hc_bytealign_be (w[19], w[20], offset);
      w[24] = hc_bytealign_be (w[18], w[19], offset);
      w[23] = hc_bytealign_be (w[17], w[18], offset);
      w[22] = hc_bytealign_be (w[16], w[17], offset);
      w[21] = hc_bytealign_be (w[15], w[16], offset);
      w[20] = hc_bytealign_be (w[14], w[15], offset);
      w[19] = hc_bytealign_be (w[13], w[14], offset);
      w[18] = hc_bytealign_be (w[12], w[13], offset);
      w[17] = hc_bytealign_be (w[11], w[12], offset);
      w[16] = hc_bytealign_be (w[10], w[11], offset);
      w[15] = hc_bytealign_be (w[ 9], w[10], offset);
      w[14] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[13] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[12] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[11] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[10] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[ 9] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[ 8] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[ 7] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[ 6] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[ 5] = hc_bytealign_be (    0, w[ 0], offset);
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  6:
      w[63] = hc_bytealign_be (w[56], w[57], offset);
      w[62] = hc_bytealign_be (w[55], w[56], offset);
      w[61] = hc_bytealign_be (w[54], w[55], offset);
      w[60] = hc_bytealign_be (w[53], w[54], offset);
      w[59] = hc_bytealign_be (w[52], w[53], offset);
      w[58] = hc_bytealign_be (w[51], w[52], offset);
      w[57] = hc_bytealign_be (w[50], w[51], offset);
      w[56] = hc_bytealign_be (w[49], w[50], offset);
      w[55] = hc_bytealign_be (w[48], w[49], offset);
      w[54] = hc_bytealign_be (w[47], w[48], offset);
      w[53] = hc_bytealign_be (w[46], w[47], offset);
      w[52] = hc_bytealign_be (w[45], w[46], offset);
      w[51] = hc_bytealign_be (w[44], w[45], offset);
      w[50] = hc_bytealign_be (w[43], w[44], offset);
      w[49] = hc_bytealign_be (w[42], w[43], offset);
      w[48] = hc_bytealign_be (w[41], w[42], offset);
      w[47] = hc_bytealign_be (w[40], w[41], offset);
      w[46] = hc_bytealign_be (w[39], w[40], offset);
      w[45] = hc_bytealign_be (w[38], w[39], offset);
      w[44] = hc_bytealign_be (w[37], w[38], offset);
      w[43] = hc_bytealign_be (w[36], w[37], offset);
      w[42] = hc_bytealign_be (w[35], w[36], offset);
      w[41] = hc_bytealign_be (w[34], w[35], offset);
      w[40] = hc_bytealign_be (w[33], w[34], offset);
      w[39] = hc_bytealign_be (w[32], w[33], offset);
      w[38] = hc_bytealign_be (w[31], w[32], offset);
      w[37] = hc_bytealign_be (w[30], w[31], offset);
      w[36] = hc_bytealign_be (w[29], w[30], offset);
      w[35] = hc_bytealign_be (w[28], w[29], offset);
      w[34] = hc_bytealign_be (w[27], w[28], offset);
      w[33] = hc_bytealign_be (w[26], w[27], offset);
      w[32] = hc_bytealign_be (w[25], w[26], offset);
      w[31] = hc_bytealign_be (w[24], w[25], offset);
      w[30] = hc_bytealign_be (w[23], w[24], offset);
      w[29] = hc_bytealign_be (w[22], w[23], offset);
      w[28] = hc_bytealign_be (w[21], w[22], offset);
      w[27] = hc_bytealign_be (w[20], w[21], offset);
      w[26] = hc_bytealign_be (w[19], w[20], offset);
      w[25] = hc_bytealign_be (w[18], w[19], offset);
      w[24] = hc_bytealign_be (w[17], w[18], offset);
      w[23] = hc_bytealign_be (w[16], w[17], offset);
      w[22] = hc_bytealign_be (w[15], w[16], offset);
      w[21] = hc_bytealign_be (w[14], w[15], offset);
      w[20] = hc_bytealign_be (w[13], w[14], offset);
      w[19] = hc_bytealign_be (w[12], w[13], offset);
      w[18] = hc_bytealign_be (w[11], w[12], offset);
      w[17] = hc_bytealign_be (w[10], w[11], offset);
      w[16] = hc_bytealign_be (w[ 9], w[10], offset);
      w[15] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[14] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[13] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[12] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[11] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[10] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[ 9] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[ 8] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[ 7] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[ 6] = hc_bytealign_be (    0, w[ 0], offset);
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  7:
      w[63] = hc_bytealign_be (w[55], w[56], offset);
      w[62] = hc_bytealign_be (w[54], w[55], offset);
      w[61] = hc_bytealign_be (w[53], w[54], offset);
      w[60] = hc_bytealign_be (w[52], w[53], offset);
      w[59] = hc_bytealign_be (w[51], w[52], offset);
      w[58] = hc_bytealign_be (w[50], w[51], offset);
      w[57] = hc_bytealign_be (w[49], w[50], offset);
      w[56] = hc_bytealign_be (w[48], w[49], offset);
      w[55] = hc_bytealign_be (w[47], w[48], offset);
      w[54] = hc_bytealign_be (w[46], w[47], offset);
      w[53] = hc_bytealign_be (w[45], w[46], offset);
      w[52] = hc_bytealign_be (w[44], w[45], offset);
      w[51] = hc_bytealign_be (w[43], w[44], offset);
      w[50] = hc_bytealign_be (w[42], w[43], offset);
      w[49] = hc_bytealign_be (w[41], w[42], offset);
      w[48] = hc_bytealign_be (w[40], w[41], offset);
      w[47] = hc_bytealign_be (w[39], w[40], offset);
      w[46] = hc_bytealign_be (w[38], w[39], offset);
      w[45] = hc_bytealign_be (w[37], w[38], offset);
      w[44] = hc_bytealign_be (w[36], w[37], offset);
      w[43] = hc_bytealign_be (w[35], w[36], offset);
      w[42] = hc_bytealign_be (w[34], w[35], offset);
      w[41] = hc_bytealign_be (w[33], w[34], offset);
      w[40] = hc_bytealign_be (w[32], w[33], offset);
      w[39] = hc_bytealign_be (w[31], w[32], offset);
      w[38] = hc_bytealign_be (w[30], w[31], offset);
      w[37] = hc_bytealign_be (w[29], w[30], offset);
      w[36] = hc_bytealign_be (w[28], w[29], offset);
      w[35] = hc_bytealign_be (w[27], w[28], offset);
      w[34] = hc_bytealign_be (w[26], w[27], offset);
      w[33] = hc_bytealign_be (w[25], w[26], offset);
      w[32] = hc_bytealign_be (w[24], w[25], offset);
      w[31] = hc_bytealign_be (w[23], w[24], offset);
      w[30] = hc_bytealign_be (w[22], w[23], offset);
      w[29] = hc_bytealign_be (w[21], w[22], offset);
      w[28] = hc_bytealign_be (w[20], w[21], offset);
      w[27] = hc_bytealign_be (w[19], w[20], offset);
      w[26] = hc_bytealign_be (w[18], w[19], offset);
      w[25] = hc_bytealign_be (w[17], w[18], offset);
      w[24] = hc_bytealign_be (w[16], w[17], offset);
      w[23] = hc_bytealign_be (w[15], w[16], offset);
      w[22] = hc_bytealign_be (w[14], w[15], offset);
      w[21] = hc_bytealign_be (w[13], w[14], offset);
      w[20] = hc_bytealign_be (w[12], w[13], offset);
      w[19] = hc_bytealign_be (w[11], w[12], offset);
      w[18] = hc_bytealign_be (w[10], w[11], offset);
      w[17] = hc_bytealign_be (w[ 9], w[10], offset);
      w[16] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[15] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[14] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[13] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[12] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[11] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[10] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[ 9] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[ 8] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[ 7] = hc_bytealign_be (    0, w[ 0], offset);
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  8:
      w[63] = hc_bytealign_be (w[54], w[55], offset);
      w[62] = hc_bytealign_be (w[53], w[54], offset);
      w[61] = hc_bytealign_be (w[52], w[53], offset);
      w[60] = hc_bytealign_be (w[51], w[52], offset);
      w[59] = hc_bytealign_be (w[50], w[51], offset);
      w[58] = hc_bytealign_be (w[49], w[50], offset);
      w[57] = hc_bytealign_be (w[48], w[49], offset);
      w[56] = hc_bytealign_be (w[47], w[48], offset);
      w[55] = hc_bytealign_be (w[46], w[47], offset);
      w[54] = hc_bytealign_be (w[45], w[46], offset);
      w[53] = hc_bytealign_be (w[44], w[45], offset);
      w[52] = hc_bytealign_be (w[43], w[44], offset);
      w[51] = hc_bytealign_be (w[42], w[43], offset);
      w[50] = hc_bytealign_be (w[41], w[42], offset);
      w[49] = hc_bytealign_be (w[40], w[41], offset);
      w[48] = hc_bytealign_be (w[39], w[40], offset);
      w[47] = hc_bytealign_be (w[38], w[39], offset);
      w[46] = hc_bytealign_be (w[37], w[38], offset);
      w[45] = hc_bytealign_be (w[36], w[37], offset);
      w[44] = hc_bytealign_be (w[35], w[36], offset);
      w[43] = hc_bytealign_be (w[34], w[35], offset);
      w[42] = hc_bytealign_be (w[33], w[34], offset);
      w[41] = hc_bytealign_be (w[32], w[33], offset);
      w[40] = hc_bytealign_be (w[31], w[32], offset);
      w[39] = hc_bytealign_be (w[30], w[31], offset);
      w[38] = hc_bytealign_be (w[29], w[30], offset);
      w[37] = hc_bytealign_be (w[28], w[29], offset);
      w[36] = hc_bytealign_be (w[27], w[28], offset);
      w[35] = hc_bytealign_be (w[26], w[27], offset);
      w[34] = hc_bytealign_be (w[25], w[26], offset);
      w[33] = hc_bytealign_be (w[24], w[25], offset);
      w[32] = hc_bytealign_be (w[23], w[24], offset);
      w[31] = hc_bytealign_be (w[22], w[23], offset);
      w[30] = hc_bytealign_be (w[21], w[22], offset);
      w[29] = hc_bytealign_be (w[20], w[21], offset);
      w[28] = hc_bytealign_be (w[19], w[20], offset);
      w[27] = hc_bytealign_be (w[18], w[19], offset);
      w[26] = hc_bytealign_be (w[17], w[18], offset);
      w[25] = hc_bytealign_be (w[16], w[17], offset);
      w[24] = hc_bytealign_be (w[15], w[16], offset);
      w[23] = hc_bytealign_be (w[14], w[15], offset);
      w[22] = hc_bytealign_be (w[13], w[14], offset);
      w[21] = hc_bytealign_be (w[12], w[13], offset);
      w[20] = hc_bytealign_be (w[11], w[12], offset);
      w[19] = hc_bytealign_be (w[10], w[11], offset);
      w[18] = hc_bytealign_be (w[ 9], w[10], offset);
      w[17] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[16] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[15] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[14] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[13] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[12] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[11] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[10] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[ 9] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[ 8] = hc_bytealign_be (    0, w[ 0], offset);
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case  9:
      w[63] = hc_bytealign_be (w[53], w[54], offset);
      w[62] = hc_bytealign_be (w[52], w[53], offset);
      w[61] = hc_bytealign_be (w[51], w[52], offset);
      w[60] = hc_bytealign_be (w[50], w[51], offset);
      w[59] = hc_bytealign_be (w[49], w[50], offset);
      w[58] = hc_bytealign_be (w[48], w[49], offset);
      w[57] = hc_bytealign_be (w[47], w[48], offset);
      w[56] = hc_bytealign_be (w[46], w[47], offset);
      w[55] = hc_bytealign_be (w[45], w[46], offset);
      w[54] = hc_bytealign_be (w[44], w[45], offset);
      w[53] = hc_bytealign_be (w[43], w[44], offset);
      w[52] = hc_bytealign_be (w[42], w[43], offset);
      w[51] = hc_bytealign_be (w[41], w[42], offset);
      w[50] = hc_bytealign_be (w[40], w[41], offset);
      w[49] = hc_bytealign_be (w[39], w[40], offset);
      w[48] = hc_bytealign_be (w[38], w[39], offset);
      w[47] = hc_bytealign_be (w[37], w[38], offset);
      w[46] = hc_bytealign_be (w[36], w[37], offset);
      w[45] = hc_bytealign_be (w[35], w[36], offset);
      w[44] = hc_bytealign_be (w[34], w[35], offset);
      w[43] = hc_bytealign_be (w[33], w[34], offset);
      w[42] = hc_bytealign_be (w[32], w[33], offset);
      w[41] = hc_bytealign_be (w[31], w[32], offset);
      w[40] = hc_bytealign_be (w[30], w[31], offset);
      w[39] = hc_bytealign_be (w[29], w[30], offset);
      w[38] = hc_bytealign_be (w[28], w[29], offset);
      w[37] = hc_bytealign_be (w[27], w[28], offset);
      w[36] = hc_bytealign_be (w[26], w[27], offset);
      w[35] = hc_bytealign_be (w[25], w[26], offset);
      w[34] = hc_bytealign_be (w[24], w[25], offset);
      w[33] = hc_bytealign_be (w[23], w[24], offset);
      w[32] = hc_bytealign_be (w[22], w[23], offset);
      w[31] = hc_bytealign_be (w[21], w[22], offset);
      w[30] = hc_bytealign_be (w[20], w[21], offset);
      w[29] = hc_bytealign_be (w[19], w[20], offset);
      w[28] = hc_bytealign_be (w[18], w[19], offset);
      w[27] = hc_bytealign_be (w[17], w[18], offset);
      w[26] = hc_bytealign_be (w[16], w[17], offset);
      w[25] = hc_bytealign_be (w[15], w[16], offset);
      w[24] = hc_bytealign_be (w[14], w[15], offset);
      w[23] = hc_bytealign_be (w[13], w[14], offset);
      w[22] = hc_bytealign_be (w[12], w[13], offset);
      w[21] = hc_bytealign_be (w[11], w[12], offset);
      w[20] = hc_bytealign_be (w[10], w[11], offset);
      w[19] = hc_bytealign_be (w[ 9], w[10], offset);
      w[18] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[17] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[16] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[15] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[14] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[13] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[12] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[11] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[10] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[ 9] = hc_bytealign_be (    0, w[ 0], offset);
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 10:
      w[63] = hc_bytealign_be (w[52], w[53], offset);
      w[62] = hc_bytealign_be (w[51], w[52], offset);
      w[61] = hc_bytealign_be (w[50], w[51], offset);
      w[60] = hc_bytealign_be (w[49], w[50], offset);
      w[59] = hc_bytealign_be (w[48], w[49], offset);
      w[58] = hc_bytealign_be (w[47], w[48], offset);
      w[57] = hc_bytealign_be (w[46], w[47], offset);
      w[56] = hc_bytealign_be (w[45], w[46], offset);
      w[55] = hc_bytealign_be (w[44], w[45], offset);
      w[54] = hc_bytealign_be (w[43], w[44], offset);
      w[53] = hc_bytealign_be (w[42], w[43], offset);
      w[52] = hc_bytealign_be (w[41], w[42], offset);
      w[51] = hc_bytealign_be (w[40], w[41], offset);
      w[50] = hc_bytealign_be (w[39], w[40], offset);
      w[49] = hc_bytealign_be (w[38], w[39], offset);
      w[48] = hc_bytealign_be (w[37], w[38], offset);
      w[47] = hc_bytealign_be (w[36], w[37], offset);
      w[46] = hc_bytealign_be (w[35], w[36], offset);
      w[45] = hc_bytealign_be (w[34], w[35], offset);
      w[44] = hc_bytealign_be (w[33], w[34], offset);
      w[43] = hc_bytealign_be (w[32], w[33], offset);
      w[42] = hc_bytealign_be (w[31], w[32], offset);
      w[41] = hc_bytealign_be (w[30], w[31], offset);
      w[40] = hc_bytealign_be (w[29], w[30], offset);
      w[39] = hc_bytealign_be (w[28], w[29], offset);
      w[38] = hc_bytealign_be (w[27], w[28], offset);
      w[37] = hc_bytealign_be (w[26], w[27], offset);
      w[36] = hc_bytealign_be (w[25], w[26], offset);
      w[35] = hc_bytealign_be (w[24], w[25], offset);
      w[34] = hc_bytealign_be (w[23], w[24], offset);
      w[33] = hc_bytealign_be (w[22], w[23], offset);
      w[32] = hc_bytealign_be (w[21], w[22], offset);
      w[31] = hc_bytealign_be (w[20], w[21], offset);
      w[30] = hc_bytealign_be (w[19], w[20], offset);
      w[29] = hc_bytealign_be (w[18], w[19], offset);
      w[28] = hc_bytealign_be (w[17], w[18], offset);
      w[27] = hc_bytealign_be (w[16], w[17], offset);
      w[26] = hc_bytealign_be (w[15], w[16], offset);
      w[25] = hc_bytealign_be (w[14], w[15], offset);
      w[24] = hc_bytealign_be (w[13], w[14], offset);
      w[23] = hc_bytealign_be (w[12], w[13], offset);
      w[22] = hc_bytealign_be (w[11], w[12], offset);
      w[21] = hc_bytealign_be (w[10], w[11], offset);
      w[20] = hc_bytealign_be (w[ 9], w[10], offset);
      w[19] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[18] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[17] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[16] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[15] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[14] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[13] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[12] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[11] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[10] = hc_bytealign_be (    0, w[ 0], offset);
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 11:
      w[63] = hc_bytealign_be (w[51], w[52], offset);
      w[62] = hc_bytealign_be (w[50], w[51], offset);
      w[61] = hc_bytealign_be (w[49], w[50], offset);
      w[60] = hc_bytealign_be (w[48], w[49], offset);
      w[59] = hc_bytealign_be (w[47], w[48], offset);
      w[58] = hc_bytealign_be (w[46], w[47], offset);
      w[57] = hc_bytealign_be (w[45], w[46], offset);
      w[56] = hc_bytealign_be (w[44], w[45], offset);
      w[55] = hc_bytealign_be (w[43], w[44], offset);
      w[54] = hc_bytealign_be (w[42], w[43], offset);
      w[53] = hc_bytealign_be (w[41], w[42], offset);
      w[52] = hc_bytealign_be (w[40], w[41], offset);
      w[51] = hc_bytealign_be (w[39], w[40], offset);
      w[50] = hc_bytealign_be (w[38], w[39], offset);
      w[49] = hc_bytealign_be (w[37], w[38], offset);
      w[48] = hc_bytealign_be (w[36], w[37], offset);
      w[47] = hc_bytealign_be (w[35], w[36], offset);
      w[46] = hc_bytealign_be (w[34], w[35], offset);
      w[45] = hc_bytealign_be (w[33], w[34], offset);
      w[44] = hc_bytealign_be (w[32], w[33], offset);
      w[43] = hc_bytealign_be (w[31], w[32], offset);
      w[42] = hc_bytealign_be (w[30], w[31], offset);
      w[41] = hc_bytealign_be (w[29], w[30], offset);
      w[40] = hc_bytealign_be (w[28], w[29], offset);
      w[39] = hc_bytealign_be (w[27], w[28], offset);
      w[38] = hc_bytealign_be (w[26], w[27], offset);
      w[37] = hc_bytealign_be (w[25], w[26], offset);
      w[36] = hc_bytealign_be (w[24], w[25], offset);
      w[35] = hc_bytealign_be (w[23], w[24], offset);
      w[34] = hc_bytealign_be (w[22], w[23], offset);
      w[33] = hc_bytealign_be (w[21], w[22], offset);
      w[32] = hc_bytealign_be (w[20], w[21], offset);
      w[31] = hc_bytealign_be (w[19], w[20], offset);
      w[30] = hc_bytealign_be (w[18], w[19], offset);
      w[29] = hc_bytealign_be (w[17], w[18], offset);
      w[28] = hc_bytealign_be (w[16], w[17], offset);
      w[27] = hc_bytealign_be (w[15], w[16], offset);
      w[26] = hc_bytealign_be (w[14], w[15], offset);
      w[25] = hc_bytealign_be (w[13], w[14], offset);
      w[24] = hc_bytealign_be (w[12], w[13], offset);
      w[23] = hc_bytealign_be (w[11], w[12], offset);
      w[22] = hc_bytealign_be (w[10], w[11], offset);
      w[21] = hc_bytealign_be (w[ 9], w[10], offset);
      w[20] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[19] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[18] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[17] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[16] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[15] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[14] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[13] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[12] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[11] = hc_bytealign_be (    0, w[ 0], offset);
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 12:
      w[63] = hc_bytealign_be (w[50], w[51], offset);
      w[62] = hc_bytealign_be (w[49], w[50], offset);
      w[61] = hc_bytealign_be (w[48], w[49], offset);
      w[60] = hc_bytealign_be (w[47], w[48], offset);
      w[59] = hc_bytealign_be (w[46], w[47], offset);
      w[58] = hc_bytealign_be (w[45], w[46], offset);
      w[57] = hc_bytealign_be (w[44], w[45], offset);
      w[56] = hc_bytealign_be (w[43], w[44], offset);
      w[55] = hc_bytealign_be (w[42], w[43], offset);
      w[54] = hc_bytealign_be (w[41], w[42], offset);
      w[53] = hc_bytealign_be (w[40], w[41], offset);
      w[52] = hc_bytealign_be (w[39], w[40], offset);
      w[51] = hc_bytealign_be (w[38], w[39], offset);
      w[50] = hc_bytealign_be (w[37], w[38], offset);
      w[49] = hc_bytealign_be (w[36], w[37], offset);
      w[48] = hc_bytealign_be (w[35], w[36], offset);
      w[47] = hc_bytealign_be (w[34], w[35], offset);
      w[46] = hc_bytealign_be (w[33], w[34], offset);
      w[45] = hc_bytealign_be (w[32], w[33], offset);
      w[44] = hc_bytealign_be (w[31], w[32], offset);
      w[43] = hc_bytealign_be (w[30], w[31], offset);
      w[42] = hc_bytealign_be (w[29], w[30], offset);
      w[41] = hc_bytealign_be (w[28], w[29], offset);
      w[40] = hc_bytealign_be (w[27], w[28], offset);
      w[39] = hc_bytealign_be (w[26], w[27], offset);
      w[38] = hc_bytealign_be (w[25], w[26], offset);
      w[37] = hc_bytealign_be (w[24], w[25], offset);
      w[36] = hc_bytealign_be (w[23], w[24], offset);
      w[35] = hc_bytealign_be (w[22], w[23], offset);
      w[34] = hc_bytealign_be (w[21], w[22], offset);
      w[33] = hc_bytealign_be (w[20], w[21], offset);
      w[32] = hc_bytealign_be (w[19], w[20], offset);
      w[31] = hc_bytealign_be (w[18], w[19], offset);
      w[30] = hc_bytealign_be (w[17], w[18], offset);
      w[29] = hc_bytealign_be (w[16], w[17], offset);
      w[28] = hc_bytealign_be (w[15], w[16], offset);
      w[27] = hc_bytealign_be (w[14], w[15], offset);
      w[26] = hc_bytealign_be (w[13], w[14], offset);
      w[25] = hc_bytealign_be (w[12], w[13], offset);
      w[24] = hc_bytealign_be (w[11], w[12], offset);
      w[23] = hc_bytealign_be (w[10], w[11], offset);
      w[22] = hc_bytealign_be (w[ 9], w[10], offset);
      w[21] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[20] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[19] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[18] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[17] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[16] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[15] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[14] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[13] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[12] = hc_bytealign_be (    0, w[ 0], offset);
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 13:
      w[63] = hc_bytealign_be (w[49], w[50], offset);
      w[62] = hc_bytealign_be (w[48], w[49], offset);
      w[61] = hc_bytealign_be (w[47], w[48], offset);
      w[60] = hc_bytealign_be (w[46], w[47], offset);
      w[59] = hc_bytealign_be (w[45], w[46], offset);
      w[58] = hc_bytealign_be (w[44], w[45], offset);
      w[57] = hc_bytealign_be (w[43], w[44], offset);
      w[56] = hc_bytealign_be (w[42], w[43], offset);
      w[55] = hc_bytealign_be (w[41], w[42], offset);
      w[54] = hc_bytealign_be (w[40], w[41], offset);
      w[53] = hc_bytealign_be (w[39], w[40], offset);
      w[52] = hc_bytealign_be (w[38], w[39], offset);
      w[51] = hc_bytealign_be (w[37], w[38], offset);
      w[50] = hc_bytealign_be (w[36], w[37], offset);
      w[49] = hc_bytealign_be (w[35], w[36], offset);
      w[48] = hc_bytealign_be (w[34], w[35], offset);
      w[47] = hc_bytealign_be (w[33], w[34], offset);
      w[46] = hc_bytealign_be (w[32], w[33], offset);
      w[45] = hc_bytealign_be (w[31], w[32], offset);
      w[44] = hc_bytealign_be (w[30], w[31], offset);
      w[43] = hc_bytealign_be (w[29], w[30], offset);
      w[42] = hc_bytealign_be (w[28], w[29], offset);
      w[41] = hc_bytealign_be (w[27], w[28], offset);
      w[40] = hc_bytealign_be (w[26], w[27], offset);
      w[39] = hc_bytealign_be (w[25], w[26], offset);
      w[38] = hc_bytealign_be (w[24], w[25], offset);
      w[37] = hc_bytealign_be (w[23], w[24], offset);
      w[36] = hc_bytealign_be (w[22], w[23], offset);
      w[35] = hc_bytealign_be (w[21], w[22], offset);
      w[34] = hc_bytealign_be (w[20], w[21], offset);
      w[33] = hc_bytealign_be (w[19], w[20], offset);
      w[32] = hc_bytealign_be (w[18], w[19], offset);
      w[31] = hc_bytealign_be (w[17], w[18], offset);
      w[30] = hc_bytealign_be (w[16], w[17], offset);
      w[29] = hc_bytealign_be (w[15], w[16], offset);
      w[28] = hc_bytealign_be (w[14], w[15], offset);
      w[27] = hc_bytealign_be (w[13], w[14], offset);
      w[26] = hc_bytealign_be (w[12], w[13], offset);
      w[25] = hc_bytealign_be (w[11], w[12], offset);
      w[24] = hc_bytealign_be (w[10], w[11], offset);
      w[23] = hc_bytealign_be (w[ 9], w[10], offset);
      w[22] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[21] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[20] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[19] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[18] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[17] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[16] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[15] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[14] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[13] = hc_bytealign_be (    0, w[ 0], offset);
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 14:
      w[63] = hc_bytealign_be (w[48], w[49], offset);
      w[62] = hc_bytealign_be (w[47], w[48], offset);
      w[61] = hc_bytealign_be (w[46], w[47], offset);
      w[60] = hc_bytealign_be (w[45], w[46], offset);
      w[59] = hc_bytealign_be (w[44], w[45], offset);
      w[58] = hc_bytealign_be (w[43], w[44], offset);
      w[57] = hc_bytealign_be (w[42], w[43], offset);
      w[56] = hc_bytealign_be (w[41], w[42], offset);
      w[55] = hc_bytealign_be (w[40], w[41], offset);
      w[54] = hc_bytealign_be (w[39], w[40], offset);
      w[53] = hc_bytealign_be (w[38], w[39], offset);
      w[52] = hc_bytealign_be (w[37], w[38], offset);
      w[51] = hc_bytealign_be (w[36], w[37], offset);
      w[50] = hc_bytealign_be (w[35], w[36], offset);
      w[49] = hc_bytealign_be (w[34], w[35], offset);
      w[48] = hc_bytealign_be (w[33], w[34], offset);
      w[47] = hc_bytealign_be (w[32], w[33], offset);
      w[46] = hc_bytealign_be (w[31], w[32], offset);
      w[45] = hc_bytealign_be (w[30], w[31], offset);
      w[44] = hc_bytealign_be (w[29], w[30], offset);
      w[43] = hc_bytealign_be (w[28], w[29], offset);
      w[42] = hc_bytealign_be (w[27], w[28], offset);
      w[41] = hc_bytealign_be (w[26], w[27], offset);
      w[40] = hc_bytealign_be (w[25], w[26], offset);
      w[39] = hc_bytealign_be (w[24], w[25], offset);
      w[38] = hc_bytealign_be (w[23], w[24], offset);
      w[37] = hc_bytealign_be (w[22], w[23], offset);
      w[36] = hc_bytealign_be (w[21], w[22], offset);
      w[35] = hc_bytealign_be (w[20], w[21], offset);
      w[34] = hc_bytealign_be (w[19], w[20], offset);
      w[33] = hc_bytealign_be (w[18], w[19], offset);
      w[32] = hc_bytealign_be (w[17], w[18], offset);
      w[31] = hc_bytealign_be (w[16], w[17], offset);
      w[30] = hc_bytealign_be (w[15], w[16], offset);
      w[29] = hc_bytealign_be (w[14], w[15], offset);
      w[28] = hc_bytealign_be (w[13], w[14], offset);
      w[27] = hc_bytealign_be (w[12], w[13], offset);
      w[26] = hc_bytealign_be (w[11], w[12], offset);
      w[25] = hc_bytealign_be (w[10], w[11], offset);
      w[24] = hc_bytealign_be (w[ 9], w[10], offset);
      w[23] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[22] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[21] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[20] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[19] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[18] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[17] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[16] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[15] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[14] = hc_bytealign_be (    0, w[ 0], offset);
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 15:
      w[63] = hc_bytealign_be (w[47], w[48], offset);
      w[62] = hc_bytealign_be (w[46], w[47], offset);
      w[61] = hc_bytealign_be (w[45], w[46], offset);
      w[60] = hc_bytealign_be (w[44], w[45], offset);
      w[59] = hc_bytealign_be (w[43], w[44], offset);
      w[58] = hc_bytealign_be (w[42], w[43], offset);
      w[57] = hc_bytealign_be (w[41], w[42], offset);
      w[56] = hc_bytealign_be (w[40], w[41], offset);
      w[55] = hc_bytealign_be (w[39], w[40], offset);
      w[54] = hc_bytealign_be (w[38], w[39], offset);
      w[53] = hc_bytealign_be (w[37], w[38], offset);
      w[52] = hc_bytealign_be (w[36], w[37], offset);
      w[51] = hc_bytealign_be (w[35], w[36], offset);
      w[50] = hc_bytealign_be (w[34], w[35], offset);
      w[49] = hc_bytealign_be (w[33], w[34], offset);
      w[48] = hc_bytealign_be (w[32], w[33], offset);
      w[47] = hc_bytealign_be (w[31], w[32], offset);
      w[46] = hc_bytealign_be (w[30], w[31], offset);
      w[45] = hc_bytealign_be (w[29], w[30], offset);
      w[44] = hc_bytealign_be (w[28], w[29], offset);
      w[43] = hc_bytealign_be (w[27], w[28], offset);
      w[42] = hc_bytealign_be (w[26], w[27], offset);
      w[41] = hc_bytealign_be (w[25], w[26], offset);
      w[40] = hc_bytealign_be (w[24], w[25], offset);
      w[39] = hc_bytealign_be (w[23], w[24], offset);
      w[38] = hc_bytealign_be (w[22], w[23], offset);
      w[37] = hc_bytealign_be (w[21], w[22], offset);
      w[36] = hc_bytealign_be (w[20], w[21], offset);
      w[35] = hc_bytealign_be (w[19], w[20], offset);
      w[34] = hc_bytealign_be (w[18], w[19], offset);
      w[33] = hc_bytealign_be (w[17], w[18], offset);
      w[32] = hc_bytealign_be (w[16], w[17], offset);
      w[31] = hc_bytealign_be (w[15], w[16], offset);
      w[30] = hc_bytealign_be (w[14], w[15], offset);
      w[29] = hc_bytealign_be (w[13], w[14], offset);
      w[28] = hc_bytealign_be (w[12], w[13], offset);
      w[27] = hc_bytealign_be (w[11], w[12], offset);
      w[26] = hc_bytealign_be (w[10], w[11], offset);
      w[25] = hc_bytealign_be (w[ 9], w[10], offset);
      w[24] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[23] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[22] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[21] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[20] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[19] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[18] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[17] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[16] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[15] = hc_bytealign_be (    0, w[ 0], offset);
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 16:
      w[63] = hc_bytealign_be (w[46], w[47], offset);
      w[62] = hc_bytealign_be (w[45], w[46], offset);
      w[61] = hc_bytealign_be (w[44], w[45], offset);
      w[60] = hc_bytealign_be (w[43], w[44], offset);
      w[59] = hc_bytealign_be (w[42], w[43], offset);
      w[58] = hc_bytealign_be (w[41], w[42], offset);
      w[57] = hc_bytealign_be (w[40], w[41], offset);
      w[56] = hc_bytealign_be (w[39], w[40], offset);
      w[55] = hc_bytealign_be (w[38], w[39], offset);
      w[54] = hc_bytealign_be (w[37], w[38], offset);
      w[53] = hc_bytealign_be (w[36], w[37], offset);
      w[52] = hc_bytealign_be (w[35], w[36], offset);
      w[51] = hc_bytealign_be (w[34], w[35], offset);
      w[50] = hc_bytealign_be (w[33], w[34], offset);
      w[49] = hc_bytealign_be (w[32], w[33], offset);
      w[48] = hc_bytealign_be (w[31], w[32], offset);
      w[47] = hc_bytealign_be (w[30], w[31], offset);
      w[46] = hc_bytealign_be (w[29], w[30], offset);
      w[45] = hc_bytealign_be (w[28], w[29], offset);
      w[44] = hc_bytealign_be (w[27], w[28], offset);
      w[43] = hc_bytealign_be (w[26], w[27], offset);
      w[42] = hc_bytealign_be (w[25], w[26], offset);
      w[41] = hc_bytealign_be (w[24], w[25], offset);
      w[40] = hc_bytealign_be (w[23], w[24], offset);
      w[39] = hc_bytealign_be (w[22], w[23], offset);
      w[38] = hc_bytealign_be (w[21], w[22], offset);
      w[37] = hc_bytealign_be (w[20], w[21], offset);
      w[36] = hc_bytealign_be (w[19], w[20], offset);
      w[35] = hc_bytealign_be (w[18], w[19], offset);
      w[34] = hc_bytealign_be (w[17], w[18], offset);
      w[33] = hc_bytealign_be (w[16], w[17], offset);
      w[32] = hc_bytealign_be (w[15], w[16], offset);
      w[31] = hc_bytealign_be (w[14], w[15], offset);
      w[30] = hc_bytealign_be (w[13], w[14], offset);
      w[29] = hc_bytealign_be (w[12], w[13], offset);
      w[28] = hc_bytealign_be (w[11], w[12], offset);
      w[27] = hc_bytealign_be (w[10], w[11], offset);
      w[26] = hc_bytealign_be (w[ 9], w[10], offset);
      w[25] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[24] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[23] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[22] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[21] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[20] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[19] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[18] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[17] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[16] = hc_bytealign_be (    0, w[ 0], offset);
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 17:
      w[63] = hc_bytealign_be (w[45], w[46], offset);
      w[62] = hc_bytealign_be (w[44], w[45], offset);
      w[61] = hc_bytealign_be (w[43], w[44], offset);
      w[60] = hc_bytealign_be (w[42], w[43], offset);
      w[59] = hc_bytealign_be (w[41], w[42], offset);
      w[58] = hc_bytealign_be (w[40], w[41], offset);
      w[57] = hc_bytealign_be (w[39], w[40], offset);
      w[56] = hc_bytealign_be (w[38], w[39], offset);
      w[55] = hc_bytealign_be (w[37], w[38], offset);
      w[54] = hc_bytealign_be (w[36], w[37], offset);
      w[53] = hc_bytealign_be (w[35], w[36], offset);
      w[52] = hc_bytealign_be (w[34], w[35], offset);
      w[51] = hc_bytealign_be (w[33], w[34], offset);
      w[50] = hc_bytealign_be (w[32], w[33], offset);
      w[49] = hc_bytealign_be (w[31], w[32], offset);
      w[48] = hc_bytealign_be (w[30], w[31], offset);
      w[47] = hc_bytealign_be (w[29], w[30], offset);
      w[46] = hc_bytealign_be (w[28], w[29], offset);
      w[45] = hc_bytealign_be (w[27], w[28], offset);
      w[44] = hc_bytealign_be (w[26], w[27], offset);
      w[43] = hc_bytealign_be (w[25], w[26], offset);
      w[42] = hc_bytealign_be (w[24], w[25], offset);
      w[41] = hc_bytealign_be (w[23], w[24], offset);
      w[40] = hc_bytealign_be (w[22], w[23], offset);
      w[39] = hc_bytealign_be (w[21], w[22], offset);
      w[38] = hc_bytealign_be (w[20], w[21], offset);
      w[37] = hc_bytealign_be (w[19], w[20], offset);
      w[36] = hc_bytealign_be (w[18], w[19], offset);
      w[35] = hc_bytealign_be (w[17], w[18], offset);
      w[34] = hc_bytealign_be (w[16], w[17], offset);
      w[33] = hc_bytealign_be (w[15], w[16], offset);
      w[32] = hc_bytealign_be (w[14], w[15], offset);
      w[31] = hc_bytealign_be (w[13], w[14], offset);
      w[30] = hc_bytealign_be (w[12], w[13], offset);
      w[29] = hc_bytealign_be (w[11], w[12], offset);
      w[28] = hc_bytealign_be (w[10], w[11], offset);
      w[27] = hc_bytealign_be (w[ 9], w[10], offset);
      w[26] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[25] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[24] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[23] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[22] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[21] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[20] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[19] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[18] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[17] = hc_bytealign_be (    0, w[ 0], offset);
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 18:
      w[63] = hc_bytealign_be (w[44], w[45], offset);
      w[62] = hc_bytealign_be (w[43], w[44], offset);
      w[61] = hc_bytealign_be (w[42], w[43], offset);
      w[60] = hc_bytealign_be (w[41], w[42], offset);
      w[59] = hc_bytealign_be (w[40], w[41], offset);
      w[58] = hc_bytealign_be (w[39], w[40], offset);
      w[57] = hc_bytealign_be (w[38], w[39], offset);
      w[56] = hc_bytealign_be (w[37], w[38], offset);
      w[55] = hc_bytealign_be (w[36], w[37], offset);
      w[54] = hc_bytealign_be (w[35], w[36], offset);
      w[53] = hc_bytealign_be (w[34], w[35], offset);
      w[52] = hc_bytealign_be (w[33], w[34], offset);
      w[51] = hc_bytealign_be (w[32], w[33], offset);
      w[50] = hc_bytealign_be (w[31], w[32], offset);
      w[49] = hc_bytealign_be (w[30], w[31], offset);
      w[48] = hc_bytealign_be (w[29], w[30], offset);
      w[47] = hc_bytealign_be (w[28], w[29], offset);
      w[46] = hc_bytealign_be (w[27], w[28], offset);
      w[45] = hc_bytealign_be (w[26], w[27], offset);
      w[44] = hc_bytealign_be (w[25], w[26], offset);
      w[43] = hc_bytealign_be (w[24], w[25], offset);
      w[42] = hc_bytealign_be (w[23], w[24], offset);
      w[41] = hc_bytealign_be (w[22], w[23], offset);
      w[40] = hc_bytealign_be (w[21], w[22], offset);
      w[39] = hc_bytealign_be (w[20], w[21], offset);
      w[38] = hc_bytealign_be (w[19], w[20], offset);
      w[37] = hc_bytealign_be (w[18], w[19], offset);
      w[36] = hc_bytealign_be (w[17], w[18], offset);
      w[35] = hc_bytealign_be (w[16], w[17], offset);
      w[34] = hc_bytealign_be (w[15], w[16], offset);
      w[33] = hc_bytealign_be (w[14], w[15], offset);
      w[32] = hc_bytealign_be (w[13], w[14], offset);
      w[31] = hc_bytealign_be (w[12], w[13], offset);
      w[30] = hc_bytealign_be (w[11], w[12], offset);
      w[29] = hc_bytealign_be (w[10], w[11], offset);
      w[28] = hc_bytealign_be (w[ 9], w[10], offset);
      w[27] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[26] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[25] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[24] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[23] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[22] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[21] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[20] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[19] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[18] = hc_bytealign_be (    0, w[ 0], offset);
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 19:
      w[63] = hc_bytealign_be (w[43], w[44], offset);
      w[62] = hc_bytealign_be (w[42], w[43], offset);
      w[61] = hc_bytealign_be (w[41], w[42], offset);
      w[60] = hc_bytealign_be (w[40], w[41], offset);
      w[59] = hc_bytealign_be (w[39], w[40], offset);
      w[58] = hc_bytealign_be (w[38], w[39], offset);
      w[57] = hc_bytealign_be (w[37], w[38], offset);
      w[56] = hc_bytealign_be (w[36], w[37], offset);
      w[55] = hc_bytealign_be (w[35], w[36], offset);
      w[54] = hc_bytealign_be (w[34], w[35], offset);
      w[53] = hc_bytealign_be (w[33], w[34], offset);
      w[52] = hc_bytealign_be (w[32], w[33], offset);
      w[51] = hc_bytealign_be (w[31], w[32], offset);
      w[50] = hc_bytealign_be (w[30], w[31], offset);
      w[49] = hc_bytealign_be (w[29], w[30], offset);
      w[48] = hc_bytealign_be (w[28], w[29], offset);
      w[47] = hc_bytealign_be (w[27], w[28], offset);
      w[46] = hc_bytealign_be (w[26], w[27], offset);
      w[45] = hc_bytealign_be (w[25], w[26], offset);
      w[44] = hc_bytealign_be (w[24], w[25], offset);
      w[43] = hc_bytealign_be (w[23], w[24], offset);
      w[42] = hc_bytealign_be (w[22], w[23], offset);
      w[41] = hc_bytealign_be (w[21], w[22], offset);
      w[40] = hc_bytealign_be (w[20], w[21], offset);
      w[39] = hc_bytealign_be (w[19], w[20], offset);
      w[38] = hc_bytealign_be (w[18], w[19], offset);
      w[37] = hc_bytealign_be (w[17], w[18], offset);
      w[36] = hc_bytealign_be (w[16], w[17], offset);
      w[35] = hc_bytealign_be (w[15], w[16], offset);
      w[34] = hc_bytealign_be (w[14], w[15], offset);
      w[33] = hc_bytealign_be (w[13], w[14], offset);
      w[32] = hc_bytealign_be (w[12], w[13], offset);
      w[31] = hc_bytealign_be (w[11], w[12], offset);
      w[30] = hc_bytealign_be (w[10], w[11], offset);
      w[29] = hc_bytealign_be (w[ 9], w[10], offset);
      w[28] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[27] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[26] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[25] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[24] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[23] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[22] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[21] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[20] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[19] = hc_bytealign_be (    0, w[ 0], offset);
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 20:
      w[63] = hc_bytealign_be (w[42], w[43], offset);
      w[62] = hc_bytealign_be (w[41], w[42], offset);
      w[61] = hc_bytealign_be (w[40], w[41], offset);
      w[60] = hc_bytealign_be (w[39], w[40], offset);
      w[59] = hc_bytealign_be (w[38], w[39], offset);
      w[58] = hc_bytealign_be (w[37], w[38], offset);
      w[57] = hc_bytealign_be (w[36], w[37], offset);
      w[56] = hc_bytealign_be (w[35], w[36], offset);
      w[55] = hc_bytealign_be (w[34], w[35], offset);
      w[54] = hc_bytealign_be (w[33], w[34], offset);
      w[53] = hc_bytealign_be (w[32], w[33], offset);
      w[52] = hc_bytealign_be (w[31], w[32], offset);
      w[51] = hc_bytealign_be (w[30], w[31], offset);
      w[50] = hc_bytealign_be (w[29], w[30], offset);
      w[49] = hc_bytealign_be (w[28], w[29], offset);
      w[48] = hc_bytealign_be (w[27], w[28], offset);
      w[47] = hc_bytealign_be (w[26], w[27], offset);
      w[46] = hc_bytealign_be (w[25], w[26], offset);
      w[45] = hc_bytealign_be (w[24], w[25], offset);
      w[44] = hc_bytealign_be (w[23], w[24], offset);
      w[43] = hc_bytealign_be (w[22], w[23], offset);
      w[42] = hc_bytealign_be (w[21], w[22], offset);
      w[41] = hc_bytealign_be (w[20], w[21], offset);
      w[40] = hc_bytealign_be (w[19], w[20], offset);
      w[39] = hc_bytealign_be (w[18], w[19], offset);
      w[38] = hc_bytealign_be (w[17], w[18], offset);
      w[37] = hc_bytealign_be (w[16], w[17], offset);
      w[36] = hc_bytealign_be (w[15], w[16], offset);
      w[35] = hc_bytealign_be (w[14], w[15], offset);
      w[34] = hc_bytealign_be (w[13], w[14], offset);
      w[33] = hc_bytealign_be (w[12], w[13], offset);
      w[32] = hc_bytealign_be (w[11], w[12], offset);
      w[31] = hc_bytealign_be (w[10], w[11], offset);
      w[30] = hc_bytealign_be (w[ 9], w[10], offset);
      w[29] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[28] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[27] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[26] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[25] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[24] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[23] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[22] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[21] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[20] = hc_bytealign_be (    0, w[ 0], offset);
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 21:
      w[63] = hc_bytealign_be (w[41], w[42], offset);
      w[62] = hc_bytealign_be (w[40], w[41], offset);
      w[61] = hc_bytealign_be (w[39], w[40], offset);
      w[60] = hc_bytealign_be (w[38], w[39], offset);
      w[59] = hc_bytealign_be (w[37], w[38], offset);
      w[58] = hc_bytealign_be (w[36], w[37], offset);
      w[57] = hc_bytealign_be (w[35], w[36], offset);
      w[56] = hc_bytealign_be (w[34], w[35], offset);
      w[55] = hc_bytealign_be (w[33], w[34], offset);
      w[54] = hc_bytealign_be (w[32], w[33], offset);
      w[53] = hc_bytealign_be (w[31], w[32], offset);
      w[52] = hc_bytealign_be (w[30], w[31], offset);
      w[51] = hc_bytealign_be (w[29], w[30], offset);
      w[50] = hc_bytealign_be (w[28], w[29], offset);
      w[49] = hc_bytealign_be (w[27], w[28], offset);
      w[48] = hc_bytealign_be (w[26], w[27], offset);
      w[47] = hc_bytealign_be (w[25], w[26], offset);
      w[46] = hc_bytealign_be (w[24], w[25], offset);
      w[45] = hc_bytealign_be (w[23], w[24], offset);
      w[44] = hc_bytealign_be (w[22], w[23], offset);
      w[43] = hc_bytealign_be (w[21], w[22], offset);
      w[42] = hc_bytealign_be (w[20], w[21], offset);
      w[41] = hc_bytealign_be (w[19], w[20], offset);
      w[40] = hc_bytealign_be (w[18], w[19], offset);
      w[39] = hc_bytealign_be (w[17], w[18], offset);
      w[38] = hc_bytealign_be (w[16], w[17], offset);
      w[37] = hc_bytealign_be (w[15], w[16], offset);
      w[36] = hc_bytealign_be (w[14], w[15], offset);
      w[35] = hc_bytealign_be (w[13], w[14], offset);
      w[34] = hc_bytealign_be (w[12], w[13], offset);
      w[33] = hc_bytealign_be (w[11], w[12], offset);
      w[32] = hc_bytealign_be (w[10], w[11], offset);
      w[31] = hc_bytealign_be (w[ 9], w[10], offset);
      w[30] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[29] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[28] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[27] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[26] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[25] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[24] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[23] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[22] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[21] = hc_bytealign_be (    0, w[ 0], offset);
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 22:
      w[63] = hc_bytealign_be (w[40], w[41], offset);
      w[62] = hc_bytealign_be (w[39], w[40], offset);
      w[61] = hc_bytealign_be (w[38], w[39], offset);
      w[60] = hc_bytealign_be (w[37], w[38], offset);
      w[59] = hc_bytealign_be (w[36], w[37], offset);
      w[58] = hc_bytealign_be (w[35], w[36], offset);
      w[57] = hc_bytealign_be (w[34], w[35], offset);
      w[56] = hc_bytealign_be (w[33], w[34], offset);
      w[55] = hc_bytealign_be (w[32], w[33], offset);
      w[54] = hc_bytealign_be (w[31], w[32], offset);
      w[53] = hc_bytealign_be (w[30], w[31], offset);
      w[52] = hc_bytealign_be (w[29], w[30], offset);
      w[51] = hc_bytealign_be (w[28], w[29], offset);
      w[50] = hc_bytealign_be (w[27], w[28], offset);
      w[49] = hc_bytealign_be (w[26], w[27], offset);
      w[48] = hc_bytealign_be (w[25], w[26], offset);
      w[47] = hc_bytealign_be (w[24], w[25], offset);
      w[46] = hc_bytealign_be (w[23], w[24], offset);
      w[45] = hc_bytealign_be (w[22], w[23], offset);
      w[44] = hc_bytealign_be (w[21], w[22], offset);
      w[43] = hc_bytealign_be (w[20], w[21], offset);
      w[42] = hc_bytealign_be (w[19], w[20], offset);
      w[41] = hc_bytealign_be (w[18], w[19], offset);
      w[40] = hc_bytealign_be (w[17], w[18], offset);
      w[39] = hc_bytealign_be (w[16], w[17], offset);
      w[38] = hc_bytealign_be (w[15], w[16], offset);
      w[37] = hc_bytealign_be (w[14], w[15], offset);
      w[36] = hc_bytealign_be (w[13], w[14], offset);
      w[35] = hc_bytealign_be (w[12], w[13], offset);
      w[34] = hc_bytealign_be (w[11], w[12], offset);
      w[33] = hc_bytealign_be (w[10], w[11], offset);
      w[32] = hc_bytealign_be (w[ 9], w[10], offset);
      w[31] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[30] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[29] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[28] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[27] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[26] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[25] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[24] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[23] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[22] = hc_bytealign_be (    0, w[ 0], offset);
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 23:
      w[63] = hc_bytealign_be (w[39], w[40], offset);
      w[62] = hc_bytealign_be (w[38], w[39], offset);
      w[61] = hc_bytealign_be (w[37], w[38], offset);
      w[60] = hc_bytealign_be (w[36], w[37], offset);
      w[59] = hc_bytealign_be (w[35], w[36], offset);
      w[58] = hc_bytealign_be (w[34], w[35], offset);
      w[57] = hc_bytealign_be (w[33], w[34], offset);
      w[56] = hc_bytealign_be (w[32], w[33], offset);
      w[55] = hc_bytealign_be (w[31], w[32], offset);
      w[54] = hc_bytealign_be (w[30], w[31], offset);
      w[53] = hc_bytealign_be (w[29], w[30], offset);
      w[52] = hc_bytealign_be (w[28], w[29], offset);
      w[51] = hc_bytealign_be (w[27], w[28], offset);
      w[50] = hc_bytealign_be (w[26], w[27], offset);
      w[49] = hc_bytealign_be (w[25], w[26], offset);
      w[48] = hc_bytealign_be (w[24], w[25], offset);
      w[47] = hc_bytealign_be (w[23], w[24], offset);
      w[46] = hc_bytealign_be (w[22], w[23], offset);
      w[45] = hc_bytealign_be (w[21], w[22], offset);
      w[44] = hc_bytealign_be (w[20], w[21], offset);
      w[43] = hc_bytealign_be (w[19], w[20], offset);
      w[42] = hc_bytealign_be (w[18], w[19], offset);
      w[41] = hc_bytealign_be (w[17], w[18], offset);
      w[40] = hc_bytealign_be (w[16], w[17], offset);
      w[39] = hc_bytealign_be (w[15], w[16], offset);
      w[38] = hc_bytealign_be (w[14], w[15], offset);
      w[37] = hc_bytealign_be (w[13], w[14], offset);
      w[36] = hc_bytealign_be (w[12], w[13], offset);
      w[35] = hc_bytealign_be (w[11], w[12], offset);
      w[34] = hc_bytealign_be (w[10], w[11], offset);
      w[33] = hc_bytealign_be (w[ 9], w[10], offset);
      w[32] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[31] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[30] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[29] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[28] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[27] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[26] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[25] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[24] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[23] = hc_bytealign_be (    0, w[ 0], offset);
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 24:
      w[63] = hc_bytealign_be (w[38], w[39], offset);
      w[62] = hc_bytealign_be (w[37], w[38], offset);
      w[61] = hc_bytealign_be (w[36], w[37], offset);
      w[60] = hc_bytealign_be (w[35], w[36], offset);
      w[59] = hc_bytealign_be (w[34], w[35], offset);
      w[58] = hc_bytealign_be (w[33], w[34], offset);
      w[57] = hc_bytealign_be (w[32], w[33], offset);
      w[56] = hc_bytealign_be (w[31], w[32], offset);
      w[55] = hc_bytealign_be (w[30], w[31], offset);
      w[54] = hc_bytealign_be (w[29], w[30], offset);
      w[53] = hc_bytealign_be (w[28], w[29], offset);
      w[52] = hc_bytealign_be (w[27], w[28], offset);
      w[51] = hc_bytealign_be (w[26], w[27], offset);
      w[50] = hc_bytealign_be (w[25], w[26], offset);
      w[49] = hc_bytealign_be (w[24], w[25], offset);
      w[48] = hc_bytealign_be (w[23], w[24], offset);
      w[47] = hc_bytealign_be (w[22], w[23], offset);
      w[46] = hc_bytealign_be (w[21], w[22], offset);
      w[45] = hc_bytealign_be (w[20], w[21], offset);
      w[44] = hc_bytealign_be (w[19], w[20], offset);
      w[43] = hc_bytealign_be (w[18], w[19], offset);
      w[42] = hc_bytealign_be (w[17], w[18], offset);
      w[41] = hc_bytealign_be (w[16], w[17], offset);
      w[40] = hc_bytealign_be (w[15], w[16], offset);
      w[39] = hc_bytealign_be (w[14], w[15], offset);
      w[38] = hc_bytealign_be (w[13], w[14], offset);
      w[37] = hc_bytealign_be (w[12], w[13], offset);
      w[36] = hc_bytealign_be (w[11], w[12], offset);
      w[35] = hc_bytealign_be (w[10], w[11], offset);
      w[34] = hc_bytealign_be (w[ 9], w[10], offset);
      w[33] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[32] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[31] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[30] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[29] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[28] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[27] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[26] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[25] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[24] = hc_bytealign_be (    0, w[ 0], offset);
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 25:
      w[63] = hc_bytealign_be (w[37], w[38], offset);
      w[62] = hc_bytealign_be (w[36], w[37], offset);
      w[61] = hc_bytealign_be (w[35], w[36], offset);
      w[60] = hc_bytealign_be (w[34], w[35], offset);
      w[59] = hc_bytealign_be (w[33], w[34], offset);
      w[58] = hc_bytealign_be (w[32], w[33], offset);
      w[57] = hc_bytealign_be (w[31], w[32], offset);
      w[56] = hc_bytealign_be (w[30], w[31], offset);
      w[55] = hc_bytealign_be (w[29], w[30], offset);
      w[54] = hc_bytealign_be (w[28], w[29], offset);
      w[53] = hc_bytealign_be (w[27], w[28], offset);
      w[52] = hc_bytealign_be (w[26], w[27], offset);
      w[51] = hc_bytealign_be (w[25], w[26], offset);
      w[50] = hc_bytealign_be (w[24], w[25], offset);
      w[49] = hc_bytealign_be (w[23], w[24], offset);
      w[48] = hc_bytealign_be (w[22], w[23], offset);
      w[47] = hc_bytealign_be (w[21], w[22], offset);
      w[46] = hc_bytealign_be (w[20], w[21], offset);
      w[45] = hc_bytealign_be (w[19], w[20], offset);
      w[44] = hc_bytealign_be (w[18], w[19], offset);
      w[43] = hc_bytealign_be (w[17], w[18], offset);
      w[42] = hc_bytealign_be (w[16], w[17], offset);
      w[41] = hc_bytealign_be (w[15], w[16], offset);
      w[40] = hc_bytealign_be (w[14], w[15], offset);
      w[39] = hc_bytealign_be (w[13], w[14], offset);
      w[38] = hc_bytealign_be (w[12], w[13], offset);
      w[37] = hc_bytealign_be (w[11], w[12], offset);
      w[36] = hc_bytealign_be (w[10], w[11], offset);
      w[35] = hc_bytealign_be (w[ 9], w[10], offset);
      w[34] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[33] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[32] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[31] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[30] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[29] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[28] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[27] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[26] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[25] = hc_bytealign_be (    0, w[ 0], offset);
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 26:
      w[63] = hc_bytealign_be (w[36], w[37], offset);
      w[62] = hc_bytealign_be (w[35], w[36], offset);
      w[61] = hc_bytealign_be (w[34], w[35], offset);
      w[60] = hc_bytealign_be (w[33], w[34], offset);
      w[59] = hc_bytealign_be (w[32], w[33], offset);
      w[58] = hc_bytealign_be (w[31], w[32], offset);
      w[57] = hc_bytealign_be (w[30], w[31], offset);
      w[56] = hc_bytealign_be (w[29], w[30], offset);
      w[55] = hc_bytealign_be (w[28], w[29], offset);
      w[54] = hc_bytealign_be (w[27], w[28], offset);
      w[53] = hc_bytealign_be (w[26], w[27], offset);
      w[52] = hc_bytealign_be (w[25], w[26], offset);
      w[51] = hc_bytealign_be (w[24], w[25], offset);
      w[50] = hc_bytealign_be (w[23], w[24], offset);
      w[49] = hc_bytealign_be (w[22], w[23], offset);
      w[48] = hc_bytealign_be (w[21], w[22], offset);
      w[47] = hc_bytealign_be (w[20], w[21], offset);
      w[46] = hc_bytealign_be (w[19], w[20], offset);
      w[45] = hc_bytealign_be (w[18], w[19], offset);
      w[44] = hc_bytealign_be (w[17], w[18], offset);
      w[43] = hc_bytealign_be (w[16], w[17], offset);
      w[42] = hc_bytealign_be (w[15], w[16], offset);
      w[41] = hc_bytealign_be (w[14], w[15], offset);
      w[40] = hc_bytealign_be (w[13], w[14], offset);
      w[39] = hc_bytealign_be (w[12], w[13], offset);
      w[38] = hc_bytealign_be (w[11], w[12], offset);
      w[37] = hc_bytealign_be (w[10], w[11], offset);
      w[36] = hc_bytealign_be (w[ 9], w[10], offset);
      w[35] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[34] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[33] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[32] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[31] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[30] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[29] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[28] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[27] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[26] = hc_bytealign_be (    0, w[ 0], offset);
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 27:
      w[63] = hc_bytealign_be (w[35], w[36], offset);
      w[62] = hc_bytealign_be (w[34], w[35], offset);
      w[61] = hc_bytealign_be (w[33], w[34], offset);
      w[60] = hc_bytealign_be (w[32], w[33], offset);
      w[59] = hc_bytealign_be (w[31], w[32], offset);
      w[58] = hc_bytealign_be (w[30], w[31], offset);
      w[57] = hc_bytealign_be (w[29], w[30], offset);
      w[56] = hc_bytealign_be (w[28], w[29], offset);
      w[55] = hc_bytealign_be (w[27], w[28], offset);
      w[54] = hc_bytealign_be (w[26], w[27], offset);
      w[53] = hc_bytealign_be (w[25], w[26], offset);
      w[52] = hc_bytealign_be (w[24], w[25], offset);
      w[51] = hc_bytealign_be (w[23], w[24], offset);
      w[50] = hc_bytealign_be (w[22], w[23], offset);
      w[49] = hc_bytealign_be (w[21], w[22], offset);
      w[48] = hc_bytealign_be (w[20], w[21], offset);
      w[47] = hc_bytealign_be (w[19], w[20], offset);
      w[46] = hc_bytealign_be (w[18], w[19], offset);
      w[45] = hc_bytealign_be (w[17], w[18], offset);
      w[44] = hc_bytealign_be (w[16], w[17], offset);
      w[43] = hc_bytealign_be (w[15], w[16], offset);
      w[42] = hc_bytealign_be (w[14], w[15], offset);
      w[41] = hc_bytealign_be (w[13], w[14], offset);
      w[40] = hc_bytealign_be (w[12], w[13], offset);
      w[39] = hc_bytealign_be (w[11], w[12], offset);
      w[38] = hc_bytealign_be (w[10], w[11], offset);
      w[37] = hc_bytealign_be (w[ 9], w[10], offset);
      w[36] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[35] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[34] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[33] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[32] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[31] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[30] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[29] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[28] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[27] = hc_bytealign_be (    0, w[ 0], offset);
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 28:
      w[63] = hc_bytealign_be (w[34], w[35], offset);
      w[62] = hc_bytealign_be (w[33], w[34], offset);
      w[61] = hc_bytealign_be (w[32], w[33], offset);
      w[60] = hc_bytealign_be (w[31], w[32], offset);
      w[59] = hc_bytealign_be (w[30], w[31], offset);
      w[58] = hc_bytealign_be (w[29], w[30], offset);
      w[57] = hc_bytealign_be (w[28], w[29], offset);
      w[56] = hc_bytealign_be (w[27], w[28], offset);
      w[55] = hc_bytealign_be (w[26], w[27], offset);
      w[54] = hc_bytealign_be (w[25], w[26], offset);
      w[53] = hc_bytealign_be (w[24], w[25], offset);
      w[52] = hc_bytealign_be (w[23], w[24], offset);
      w[51] = hc_bytealign_be (w[22], w[23], offset);
      w[50] = hc_bytealign_be (w[21], w[22], offset);
      w[49] = hc_bytealign_be (w[20], w[21], offset);
      w[48] = hc_bytealign_be (w[19], w[20], offset);
      w[47] = hc_bytealign_be (w[18], w[19], offset);
      w[46] = hc_bytealign_be (w[17], w[18], offset);
      w[45] = hc_bytealign_be (w[16], w[17], offset);
      w[44] = hc_bytealign_be (w[15], w[16], offset);
      w[43] = hc_bytealign_be (w[14], w[15], offset);
      w[42] = hc_bytealign_be (w[13], w[14], offset);
      w[41] = hc_bytealign_be (w[12], w[13], offset);
      w[40] = hc_bytealign_be (w[11], w[12], offset);
      w[39] = hc_bytealign_be (w[10], w[11], offset);
      w[38] = hc_bytealign_be (w[ 9], w[10], offset);
      w[37] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[36] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[35] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[34] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[33] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[32] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[31] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[30] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[29] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[28] = hc_bytealign_be (    0, w[ 0], offset);
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 29:
      w[63] = hc_bytealign_be (w[33], w[34], offset);
      w[62] = hc_bytealign_be (w[32], w[33], offset);
      w[61] = hc_bytealign_be (w[31], w[32], offset);
      w[60] = hc_bytealign_be (w[30], w[31], offset);
      w[59] = hc_bytealign_be (w[29], w[30], offset);
      w[58] = hc_bytealign_be (w[28], w[29], offset);
      w[57] = hc_bytealign_be (w[27], w[28], offset);
      w[56] = hc_bytealign_be (w[26], w[27], offset);
      w[55] = hc_bytealign_be (w[25], w[26], offset);
      w[54] = hc_bytealign_be (w[24], w[25], offset);
      w[53] = hc_bytealign_be (w[23], w[24], offset);
      w[52] = hc_bytealign_be (w[22], w[23], offset);
      w[51] = hc_bytealign_be (w[21], w[22], offset);
      w[50] = hc_bytealign_be (w[20], w[21], offset);
      w[49] = hc_bytealign_be (w[19], w[20], offset);
      w[48] = hc_bytealign_be (w[18], w[19], offset);
      w[47] = hc_bytealign_be (w[17], w[18], offset);
      w[46] = hc_bytealign_be (w[16], w[17], offset);
      w[45] = hc_bytealign_be (w[15], w[16], offset);
      w[44] = hc_bytealign_be (w[14], w[15], offset);
      w[43] = hc_bytealign_be (w[13], w[14], offset);
      w[42] = hc_bytealign_be (w[12], w[13], offset);
      w[41] = hc_bytealign_be (w[11], w[12], offset);
      w[40] = hc_bytealign_be (w[10], w[11], offset);
      w[39] = hc_bytealign_be (w[ 9], w[10], offset);
      w[38] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[37] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[36] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[35] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[34] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[33] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[32] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[31] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[30] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[29] = hc_bytealign_be (    0, w[ 0], offset);
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 30:
      w[63] = hc_bytealign_be (w[32], w[33], offset);
      w[62] = hc_bytealign_be (w[31], w[32], offset);
      w[61] = hc_bytealign_be (w[30], w[31], offset);
      w[60] = hc_bytealign_be (w[29], w[30], offset);
      w[59] = hc_bytealign_be (w[28], w[29], offset);
      w[58] = hc_bytealign_be (w[27], w[28], offset);
      w[57] = hc_bytealign_be (w[26], w[27], offset);
      w[56] = hc_bytealign_be (w[25], w[26], offset);
      w[55] = hc_bytealign_be (w[24], w[25], offset);
      w[54] = hc_bytealign_be (w[23], w[24], offset);
      w[53] = hc_bytealign_be (w[22], w[23], offset);
      w[52] = hc_bytealign_be (w[21], w[22], offset);
      w[51] = hc_bytealign_be (w[20], w[21], offset);
      w[50] = hc_bytealign_be (w[19], w[20], offset);
      w[49] = hc_bytealign_be (w[18], w[19], offset);
      w[48] = hc_bytealign_be (w[17], w[18], offset);
      w[47] = hc_bytealign_be (w[16], w[17], offset);
      w[46] = hc_bytealign_be (w[15], w[16], offset);
      w[45] = hc_bytealign_be (w[14], w[15], offset);
      w[44] = hc_bytealign_be (w[13], w[14], offset);
      w[43] = hc_bytealign_be (w[12], w[13], offset);
      w[42] = hc_bytealign_be (w[11], w[12], offset);
      w[41] = hc_bytealign_be (w[10], w[11], offset);
      w[40] = hc_bytealign_be (w[ 9], w[10], offset);
      w[39] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[38] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[37] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[36] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[35] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[34] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[33] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[32] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[31] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[30] = hc_bytealign_be (    0, w[ 0], offset);
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 31:
      w[63] = hc_bytealign_be (w[31], w[32], offset);
      w[62] = hc_bytealign_be (w[30], w[31], offset);
      w[61] = hc_bytealign_be (w[29], w[30], offset);
      w[60] = hc_bytealign_be (w[28], w[29], offset);
      w[59] = hc_bytealign_be (w[27], w[28], offset);
      w[58] = hc_bytealign_be (w[26], w[27], offset);
      w[57] = hc_bytealign_be (w[25], w[26], offset);
      w[56] = hc_bytealign_be (w[24], w[25], offset);
      w[55] = hc_bytealign_be (w[23], w[24], offset);
      w[54] = hc_bytealign_be (w[22], w[23], offset);
      w[53] = hc_bytealign_be (w[21], w[22], offset);
      w[52] = hc_bytealign_be (w[20], w[21], offset);
      w[51] = hc_bytealign_be (w[19], w[20], offset);
      w[50] = hc_bytealign_be (w[18], w[19], offset);
      w[49] = hc_bytealign_be (w[17], w[18], offset);
      w[48] = hc_bytealign_be (w[16], w[17], offset);
      w[47] = hc_bytealign_be (w[15], w[16], offset);
      w[46] = hc_bytealign_be (w[14], w[15], offset);
      w[45] = hc_bytealign_be (w[13], w[14], offset);
      w[44] = hc_bytealign_be (w[12], w[13], offset);
      w[43] = hc_bytealign_be (w[11], w[12], offset);
      w[42] = hc_bytealign_be (w[10], w[11], offset);
      w[41] = hc_bytealign_be (w[ 9], w[10], offset);
      w[40] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[39] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[38] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[37] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[36] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[35] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[34] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[33] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[32] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[31] = hc_bytealign_be (    0, w[ 0], offset);
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 32:
      w[63] = hc_bytealign_be (w[30], w[31], offset);
      w[62] = hc_bytealign_be (w[29], w[30], offset);
      w[61] = hc_bytealign_be (w[28], w[29], offset);
      w[60] = hc_bytealign_be (w[27], w[28], offset);
      w[59] = hc_bytealign_be (w[26], w[27], offset);
      w[58] = hc_bytealign_be (w[25], w[26], offset);
      w[57] = hc_bytealign_be (w[24], w[25], offset);
      w[56] = hc_bytealign_be (w[23], w[24], offset);
      w[55] = hc_bytealign_be (w[22], w[23], offset);
      w[54] = hc_bytealign_be (w[21], w[22], offset);
      w[53] = hc_bytealign_be (w[20], w[21], offset);
      w[52] = hc_bytealign_be (w[19], w[20], offset);
      w[51] = hc_bytealign_be (w[18], w[19], offset);
      w[50] = hc_bytealign_be (w[17], w[18], offset);
      w[49] = hc_bytealign_be (w[16], w[17], offset);
      w[48] = hc_bytealign_be (w[15], w[16], offset);
      w[47] = hc_bytealign_be (w[14], w[15], offset);
      w[46] = hc_bytealign_be (w[13], w[14], offset);
      w[45] = hc_bytealign_be (w[12], w[13], offset);
      w[44] = hc_bytealign_be (w[11], w[12], offset);
      w[43] = hc_bytealign_be (w[10], w[11], offset);
      w[42] = hc_bytealign_be (w[ 9], w[10], offset);
      w[41] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[40] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[39] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[38] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[37] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[36] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[35] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[34] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[33] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[32] = hc_bytealign_be (    0, w[ 0], offset);
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 33:
      w[63] = hc_bytealign_be (w[29], w[30], offset);
      w[62] = hc_bytealign_be (w[28], w[29], offset);
      w[61] = hc_bytealign_be (w[27], w[28], offset);
      w[60] = hc_bytealign_be (w[26], w[27], offset);
      w[59] = hc_bytealign_be (w[25], w[26], offset);
      w[58] = hc_bytealign_be (w[24], w[25], offset);
      w[57] = hc_bytealign_be (w[23], w[24], offset);
      w[56] = hc_bytealign_be (w[22], w[23], offset);
      w[55] = hc_bytealign_be (w[21], w[22], offset);
      w[54] = hc_bytealign_be (w[20], w[21], offset);
      w[53] = hc_bytealign_be (w[19], w[20], offset);
      w[52] = hc_bytealign_be (w[18], w[19], offset);
      w[51] = hc_bytealign_be (w[17], w[18], offset);
      w[50] = hc_bytealign_be (w[16], w[17], offset);
      w[49] = hc_bytealign_be (w[15], w[16], offset);
      w[48] = hc_bytealign_be (w[14], w[15], offset);
      w[47] = hc_bytealign_be (w[13], w[14], offset);
      w[46] = hc_bytealign_be (w[12], w[13], offset);
      w[45] = hc_bytealign_be (w[11], w[12], offset);
      w[44] = hc_bytealign_be (w[10], w[11], offset);
      w[43] = hc_bytealign_be (w[ 9], w[10], offset);
      w[42] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[41] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[40] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[39] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[38] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[37] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[36] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[35] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[34] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[33] = hc_bytealign_be (    0, w[ 0], offset);
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 34:
      w[63] = hc_bytealign_be (w[28], w[29], offset);
      w[62] = hc_bytealign_be (w[27], w[28], offset);
      w[61] = hc_bytealign_be (w[26], w[27], offset);
      w[60] = hc_bytealign_be (w[25], w[26], offset);
      w[59] = hc_bytealign_be (w[24], w[25], offset);
      w[58] = hc_bytealign_be (w[23], w[24], offset);
      w[57] = hc_bytealign_be (w[22], w[23], offset);
      w[56] = hc_bytealign_be (w[21], w[22], offset);
      w[55] = hc_bytealign_be (w[20], w[21], offset);
      w[54] = hc_bytealign_be (w[19], w[20], offset);
      w[53] = hc_bytealign_be (w[18], w[19], offset);
      w[52] = hc_bytealign_be (w[17], w[18], offset);
      w[51] = hc_bytealign_be (w[16], w[17], offset);
      w[50] = hc_bytealign_be (w[15], w[16], offset);
      w[49] = hc_bytealign_be (w[14], w[15], offset);
      w[48] = hc_bytealign_be (w[13], w[14], offset);
      w[47] = hc_bytealign_be (w[12], w[13], offset);
      w[46] = hc_bytealign_be (w[11], w[12], offset);
      w[45] = hc_bytealign_be (w[10], w[11], offset);
      w[44] = hc_bytealign_be (w[ 9], w[10], offset);
      w[43] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[42] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[41] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[40] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[39] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[38] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[37] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[36] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[35] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[34] = hc_bytealign_be (    0, w[ 0], offset);
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 35:
      w[63] = hc_bytealign_be (w[27], w[28], offset);
      w[62] = hc_bytealign_be (w[26], w[27], offset);
      w[61] = hc_bytealign_be (w[25], w[26], offset);
      w[60] = hc_bytealign_be (w[24], w[25], offset);
      w[59] = hc_bytealign_be (w[23], w[24], offset);
      w[58] = hc_bytealign_be (w[22], w[23], offset);
      w[57] = hc_bytealign_be (w[21], w[22], offset);
      w[56] = hc_bytealign_be (w[20], w[21], offset);
      w[55] = hc_bytealign_be (w[19], w[20], offset);
      w[54] = hc_bytealign_be (w[18], w[19], offset);
      w[53] = hc_bytealign_be (w[17], w[18], offset);
      w[52] = hc_bytealign_be (w[16], w[17], offset);
      w[51] = hc_bytealign_be (w[15], w[16], offset);
      w[50] = hc_bytealign_be (w[14], w[15], offset);
      w[49] = hc_bytealign_be (w[13], w[14], offset);
      w[48] = hc_bytealign_be (w[12], w[13], offset);
      w[47] = hc_bytealign_be (w[11], w[12], offset);
      w[46] = hc_bytealign_be (w[10], w[11], offset);
      w[45] = hc_bytealign_be (w[ 9], w[10], offset);
      w[44] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[43] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[42] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[41] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[40] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[39] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[38] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[37] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[36] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[35] = hc_bytealign_be (    0, w[ 0], offset);
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 36:
      w[63] = hc_bytealign_be (w[26], w[27], offset);
      w[62] = hc_bytealign_be (w[25], w[26], offset);
      w[61] = hc_bytealign_be (w[24], w[25], offset);
      w[60] = hc_bytealign_be (w[23], w[24], offset);
      w[59] = hc_bytealign_be (w[22], w[23], offset);
      w[58] = hc_bytealign_be (w[21], w[22], offset);
      w[57] = hc_bytealign_be (w[20], w[21], offset);
      w[56] = hc_bytealign_be (w[19], w[20], offset);
      w[55] = hc_bytealign_be (w[18], w[19], offset);
      w[54] = hc_bytealign_be (w[17], w[18], offset);
      w[53] = hc_bytealign_be (w[16], w[17], offset);
      w[52] = hc_bytealign_be (w[15], w[16], offset);
      w[51] = hc_bytealign_be (w[14], w[15], offset);
      w[50] = hc_bytealign_be (w[13], w[14], offset);
      w[49] = hc_bytealign_be (w[12], w[13], offset);
      w[48] = hc_bytealign_be (w[11], w[12], offset);
      w[47] = hc_bytealign_be (w[10], w[11], offset);
      w[46] = hc_bytealign_be (w[ 9], w[10], offset);
      w[45] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[44] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[43] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[42] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[41] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[40] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[39] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[38] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[37] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[36] = hc_bytealign_be (    0, w[ 0], offset);
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 37:
      w[63] = hc_bytealign_be (w[25], w[26], offset);
      w[62] = hc_bytealign_be (w[24], w[25], offset);
      w[61] = hc_bytealign_be (w[23], w[24], offset);
      w[60] = hc_bytealign_be (w[22], w[23], offset);
      w[59] = hc_bytealign_be (w[21], w[22], offset);
      w[58] = hc_bytealign_be (w[20], w[21], offset);
      w[57] = hc_bytealign_be (w[19], w[20], offset);
      w[56] = hc_bytealign_be (w[18], w[19], offset);
      w[55] = hc_bytealign_be (w[17], w[18], offset);
      w[54] = hc_bytealign_be (w[16], w[17], offset);
      w[53] = hc_bytealign_be (w[15], w[16], offset);
      w[52] = hc_bytealign_be (w[14], w[15], offset);
      w[51] = hc_bytealign_be (w[13], w[14], offset);
      w[50] = hc_bytealign_be (w[12], w[13], offset);
      w[49] = hc_bytealign_be (w[11], w[12], offset);
      w[48] = hc_bytealign_be (w[10], w[11], offset);
      w[47] = hc_bytealign_be (w[ 9], w[10], offset);
      w[46] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[45] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[44] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[43] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[42] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[41] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[40] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[39] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[38] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[37] = hc_bytealign_be (    0, w[ 0], offset);
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 38:
      w[63] = hc_bytealign_be (w[24], w[25], offset);
      w[62] = hc_bytealign_be (w[23], w[24], offset);
      w[61] = hc_bytealign_be (w[22], w[23], offset);
      w[60] = hc_bytealign_be (w[21], w[22], offset);
      w[59] = hc_bytealign_be (w[20], w[21], offset);
      w[58] = hc_bytealign_be (w[19], w[20], offset);
      w[57] = hc_bytealign_be (w[18], w[19], offset);
      w[56] = hc_bytealign_be (w[17], w[18], offset);
      w[55] = hc_bytealign_be (w[16], w[17], offset);
      w[54] = hc_bytealign_be (w[15], w[16], offset);
      w[53] = hc_bytealign_be (w[14], w[15], offset);
      w[52] = hc_bytealign_be (w[13], w[14], offset);
      w[51] = hc_bytealign_be (w[12], w[13], offset);
      w[50] = hc_bytealign_be (w[11], w[12], offset);
      w[49] = hc_bytealign_be (w[10], w[11], offset);
      w[48] = hc_bytealign_be (w[ 9], w[10], offset);
      w[47] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[46] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[45] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[44] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[43] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[42] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[41] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[40] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[39] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[38] = hc_bytealign_be (    0, w[ 0], offset);
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 39:
      w[63] = hc_bytealign_be (w[23], w[24], offset);
      w[62] = hc_bytealign_be (w[22], w[23], offset);
      w[61] = hc_bytealign_be (w[21], w[22], offset);
      w[60] = hc_bytealign_be (w[20], w[21], offset);
      w[59] = hc_bytealign_be (w[19], w[20], offset);
      w[58] = hc_bytealign_be (w[18], w[19], offset);
      w[57] = hc_bytealign_be (w[17], w[18], offset);
      w[56] = hc_bytealign_be (w[16], w[17], offset);
      w[55] = hc_bytealign_be (w[15], w[16], offset);
      w[54] = hc_bytealign_be (w[14], w[15], offset);
      w[53] = hc_bytealign_be (w[13], w[14], offset);
      w[52] = hc_bytealign_be (w[12], w[13], offset);
      w[51] = hc_bytealign_be (w[11], w[12], offset);
      w[50] = hc_bytealign_be (w[10], w[11], offset);
      w[49] = hc_bytealign_be (w[ 9], w[10], offset);
      w[48] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[47] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[46] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[45] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[44] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[43] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[42] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[41] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[40] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[39] = hc_bytealign_be (    0, w[ 0], offset);
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 40:
      w[63] = hc_bytealign_be (w[22], w[23], offset);
      w[62] = hc_bytealign_be (w[21], w[22], offset);
      w[61] = hc_bytealign_be (w[20], w[21], offset);
      w[60] = hc_bytealign_be (w[19], w[20], offset);
      w[59] = hc_bytealign_be (w[18], w[19], offset);
      w[58] = hc_bytealign_be (w[17], w[18], offset);
      w[57] = hc_bytealign_be (w[16], w[17], offset);
      w[56] = hc_bytealign_be (w[15], w[16], offset);
      w[55] = hc_bytealign_be (w[14], w[15], offset);
      w[54] = hc_bytealign_be (w[13], w[14], offset);
      w[53] = hc_bytealign_be (w[12], w[13], offset);
      w[52] = hc_bytealign_be (w[11], w[12], offset);
      w[51] = hc_bytealign_be (w[10], w[11], offset);
      w[50] = hc_bytealign_be (w[ 9], w[10], offset);
      w[49] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[48] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[47] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[46] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[45] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[44] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[43] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[42] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[41] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[40] = hc_bytealign_be (    0, w[ 0], offset);
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 41:
      w[63] = hc_bytealign_be (w[21], w[22], offset);
      w[62] = hc_bytealign_be (w[20], w[21], offset);
      w[61] = hc_bytealign_be (w[19], w[20], offset);
      w[60] = hc_bytealign_be (w[18], w[19], offset);
      w[59] = hc_bytealign_be (w[17], w[18], offset);
      w[58] = hc_bytealign_be (w[16], w[17], offset);
      w[57] = hc_bytealign_be (w[15], w[16], offset);
      w[56] = hc_bytealign_be (w[14], w[15], offset);
      w[55] = hc_bytealign_be (w[13], w[14], offset);
      w[54] = hc_bytealign_be (w[12], w[13], offset);
      w[53] = hc_bytealign_be (w[11], w[12], offset);
      w[52] = hc_bytealign_be (w[10], w[11], offset);
      w[51] = hc_bytealign_be (w[ 9], w[10], offset);
      w[50] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[49] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[48] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[47] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[46] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[45] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[44] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[43] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[42] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[41] = hc_bytealign_be (    0, w[ 0], offset);
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 42:
      w[63] = hc_bytealign_be (w[20], w[21], offset);
      w[62] = hc_bytealign_be (w[19], w[20], offset);
      w[61] = hc_bytealign_be (w[18], w[19], offset);
      w[60] = hc_bytealign_be (w[17], w[18], offset);
      w[59] = hc_bytealign_be (w[16], w[17], offset);
      w[58] = hc_bytealign_be (w[15], w[16], offset);
      w[57] = hc_bytealign_be (w[14], w[15], offset);
      w[56] = hc_bytealign_be (w[13], w[14], offset);
      w[55] = hc_bytealign_be (w[12], w[13], offset);
      w[54] = hc_bytealign_be (w[11], w[12], offset);
      w[53] = hc_bytealign_be (w[10], w[11], offset);
      w[52] = hc_bytealign_be (w[ 9], w[10], offset);
      w[51] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[50] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[49] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[48] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[47] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[46] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[45] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[44] = hc_bytealign_be (w[ 1], w[ 2], offset);
      w[43] = hc_bytealign_be (w[ 0], w[ 1], offset);
      w[42] = hc_bytealign_be (    0, w[ 0], offset);
      w[41] = 0;
      w[40] = 0;
      w[39] = 0;
      w[38] = 0;
      w[37] = 0;
      w[36] = 0;
      w[35] = 0;
      w[34] = 0;
      w[33] = 0;
      w[32] = 0;
      w[31] = 0;
      w[30] = 0;
      w[29] = 0;
      w[28] = 0;
      w[27] = 0;
      w[26] = 0;
      w[25] = 0;
      w[24] = 0;
      w[23] = 0;
      w[22] = 0;
      w[21] = 0;
      w[20] = 0;
      w[19] = 0;
      w[18] = 0;
      w[17] = 0;
      w[16] = 0;
      w[15] = 0;
      w[14] = 0;
      w[13] = 0;
      w[12] = 0;
      w[11] = 0;
      w[10] = 0;
      w[ 9] = 0;
      w[ 8] = 0;
      w[ 7] = 0;
      w[ 6] = 0;
      w[ 5] = 0;
      w[ 4] = 0;
      w[ 3] = 0;
      w[ 2] = 0;
      w[ 1] = 0;
      w[ 0] = 0;

      break;

    case 43:
      w[63] = hc_bytealign_be (w[19], w[20], offset);
      w[62] = hc_bytealign_be (w[18], w[19], offset);
      w[61] = hc_bytealign_be (w[17], w[18], offset);
      w[60] = hc_bytealign_be (w[16], w[17], offset);
      w[59] = hc_bytealign_be (w[15], w[16], offset);
      w[58] = hc_bytealign_be (w[14], w[15], offset);
      w[57] = hc_bytealign_be (w[13], w[14], offset);
      w[56] = hc_bytealign_be (w[12], w[13], offset);
      w[55] = hc_bytealign_be (w[11], w[12], offset);
      w[54] = hc_bytealign_be (w[10], w[11], offset);
      w[53] = hc_bytealign_be (w[ 9], w[10], offset);
      w[52] = hc_bytealign_be (w[ 8], w[ 9], offset);
      w[51] = hc_bytealign_be (w[ 7], w[ 8], offset);
      w[50] = hc_bytealign_be (w[ 6], w[ 7], offset);
      w[49] = hc_bytealign_be (w[ 5], w[ 6], offset);
      w[48] = hc_bytealign_be (w[ 4], w[ 5], offset);
      w[47] = hc_bytealign_be (w[ 3], w[ 4], offset);
      w[46] = hc_bytealign_be (w[ 2], w[ 3], offset);
      w[45] = hc_bytealign_be (w[ 1], w[ 2], offs