libiberty: Use x86 HW optimized sha1

Nick has approved this patch (plus a small ld change to use it for --build-id=),
so I'm committing it to GCC master as well.

If anyone from ARM would be willing to implement this similarly with the
vsha1{cq,mq,pq,h,su0q,su1q}_u32 intrinsics, it could be a useful linker
speedup on those hosts as well.  The intent in sha1.c is that the
sha1_hw_process_bytes and sha1_hw_process_block functions
would be defined whenever
defined (HAVE_X86_SHA1_HW_SUPPORT) || defined (HAVE_WHATEVERELSE_SHA1_HW_SUPPORT)
but the bodies of sha1_hw_process_block and sha1_choose_process_bytes
would then have #elif defined (HAVE_WHATEVERELSE_SHA1_HW_SUPPORT) branches
for the other arch's support, and similarly for any target attributes on
sha1_hw_process_block if needed.
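
For concreteness, a minimal structural sketch of how such a variant could slot
in; HAVE_ARM_SHA1_HW_SUPPORT is a hypothetical configure macro and the
intrinsic sequence is only indicative, not a complete implementation:

#if defined (HAVE_X86_SHA1_HW_SUPPORT) || defined (HAVE_ARM_SHA1_HW_SUPPORT)
#ifdef HAVE_ARM_SHA1_HW_SUPPORT
__attribute__((__target__ ("+crypto")))
#endif
static void
sha1_hw_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
#ifdef HAVE_X86_SHA1_HW_SUPPORT
  /* x86 body as in this patch.  */
#elif defined (HAVE_ARM_SHA1_HW_SUPPORT)
  /* Hypothetical AArch64 body; e.g. the first four rounds would look
     roughly like:
       uint32x4_t abcd = vld1q_u32 ((const uint32_t *) &ctx->A);
       uint32x4_t msg0 = ...load 16 bytes, byte-swap with vrev32q_u8...;
       uint32x4_t wk0 = vaddq_u32 (msg0, vdupq_n_u32 (0x5a827999));
       uint32_t e1 = vsha1h_u32 (vgetq_lane_u32 (abcd, 0));
       abcd = vsha1cq_u32 (abcd, ctx->E, wk0);
     with vsha1su0q_u32/vsha1su1q_u32 computing the message schedule and
     vsha1pq_u32/vsha1mq_u32 handling the parity/majority rounds.  */
#endif
}
#endif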

2023-11-28  Jakub Jelinek  <jakub@redhat.com>

include/
	* sha1.h (sha1_process_bytes_fn): New typedef.
	(sha1_choose_process_bytes): Declare.
libiberty/
	* configure.ac (HAVE_X86_SHA1_HW_SUPPORT): New check.
	* sha1.c: If HAVE_X86_SHA1_HW_SUPPORT is defined, include x86intrin.h
	and cpuid.h.
	(sha1_hw_process_bytes, sha1_hw_process_block,
	sha1_choose_process_bytes): New functions.
	* config.in: Regenerated.
	* configure: Regenerated.
include/sha1.h

@@ -108,6 +108,13 @@ extern void sha1_process_block (const void *buffer, size_t len,
extern void sha1_process_bytes (const void *buffer, size_t len,
                                struct sha1_ctx *ctx);

typedef void (*sha1_process_bytes_fn) (const void *, size_t,
                                       struct sha1_ctx *);

/* Return sha1_process_bytes or some hardware optimized version thereof
   depending on current CPU.  */
extern sha1_process_bytes_fn sha1_choose_process_bytes (void);

/* Process the remaining bytes in the buffer and put result from CTX
   in first 20 bytes following RESBUF.  The result is always in little
   endian byte order, so that a byte-wise output yields to the wanted
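
A caller is meant to resolve the function pointer once and then reuse it for
all buffers.  A minimal sketch of that pattern (hash_chunk and sha1_impl are
illustrative names, not part of the patch):

#include <stddef.h>
#include "sha1.h"

static sha1_process_bytes_fn sha1_impl;

static void
hash_chunk (const void *buf, size_t len, struct sha1_ctx *ctx)
{
  /* On first use pick sha1_process_bytes or a HW-accelerated variant.  */
  if (sha1_impl == NULL)
    sha1_impl = sha1_choose_process_bytes ();
  sha1_impl (buf, len, ctx);
}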

libiberty/config.in

@@ -441,6 +441,9 @@
/* Define to 1 if `vfork' works. */
#undef HAVE_WORKING_VFORK

/* Define if you have x86 SHA1 HW acceleration support. */
#undef HAVE_X86_SHA1_HW_SUPPORT

/* Define to 1 if you have the `_doprnt' function. */
#undef HAVE__DOPRNT

libiberty/configure

@@ -7546,6 +7546,64 @@ case "${host}" in
esac

{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SHA1 HW acceleration support" >&5
$as_echo_n "checking for SHA1 HW acceleration support... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h.  */

#include <x86intrin.h>
#include <cpuid.h>

__attribute__((__target__ ("sse4.1,sha")))
void foo (__m128i *buf, unsigned int e, __m128i msg0, __m128i msg1)
{
  __m128i abcd = _mm_loadu_si128 ((const __m128i *) buf);
  __m128i e0 = _mm_set_epi32 (e, 0, 0, 0);
  abcd = _mm_shuffle_epi32 (abcd, 0x1b);
  const __m128i shuf_mask = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
  abcd = _mm_shuffle_epi8 (abcd, shuf_mask);
  e0 = _mm_sha1nexte_epu32 (e0, msg1);
  abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
  msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
  msg0 = _mm_sha1msg2_epu32 (msg0, msg1);
  msg0 = _mm_xor_si128 (msg0, msg1);
  e0 = _mm_add_epi32 (e0, msg0);
  e0 = abcd;
  _mm_storeu_si128 (buf, abcd);
  e = _mm_extract_epi32 (e0, 3);
}

int bar (void)
{
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
      && (ebx & bit_SHA) != 0
      && __get_cpuid (1, &eax, &ebx, &ecx, &edx)
      && (ecx & bit_SSE4_1) != 0)
    return 1;
  return 0;
}

int
main ()
{
bar ();
  ;
  return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: x86 SHA1" >&5
$as_echo "x86 SHA1" >&6; }

$as_echo "#define HAVE_X86_SHA1_HW_SUPPORT 1" >>confdefs.h

else
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext

libiberty/configure.ac

@@ -742,6 +742,46 @@ case "${host}" in
esac

AC_SUBST(pexecute)

AC_MSG_CHECKING([for SHA1 HW acceleration support])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <x86intrin.h>
#include <cpuid.h>

__attribute__((__target__ ("sse4.1,sha")))
void foo (__m128i *buf, unsigned int e, __m128i msg0, __m128i msg1)
{
  __m128i abcd = _mm_loadu_si128 ((const __m128i *) buf);
  __m128i e0 = _mm_set_epi32 (e, 0, 0, 0);
  abcd = _mm_shuffle_epi32 (abcd, 0x1b);
  const __m128i shuf_mask = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
  abcd = _mm_shuffle_epi8 (abcd, shuf_mask);
  e0 = _mm_sha1nexte_epu32 (e0, msg1);
  abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
  msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
  msg0 = _mm_sha1msg2_epu32 (msg0, msg1);
  msg0 = _mm_xor_si128 (msg0, msg1);
  e0 = _mm_add_epi32 (e0, msg0);
  e0 = abcd;
  _mm_storeu_si128 (buf, abcd);
  e = _mm_extract_epi32 (e0, 3);
}

int bar (void)
{
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
      && (ebx & bit_SHA) != 0
      && __get_cpuid (1, &eax, &ebx, &ecx, &edx)
      && (ecx & bit_SSE4_1) != 0)
    return 1;
  return 0;
}
]], [[bar ();]])],
  [AC_MSG_RESULT([x86 SHA1])
   AC_DEFINE(HAVE_X86_SHA1_HW_SUPPORT, 1,
             [Define if you have x86 SHA1 HW acceleration support.])],
  [AC_MSG_RESULT([no])])
libiberty_AC_FUNC_STRNCMP

# Install a library built with a cross compiler in $(tooldir) rather

libiberty/sha1.c

@@ -29,6 +29,11 @@
#include <stddef.h>
#include <string.h>

#ifdef HAVE_X86_SHA1_HW_SUPPORT
# include <x86intrin.h>
# include <cpuid.h>
#endif

#if USE_UNLOCKED_IO
# include "unlocked-io.h"
#endif
@@ -412,3 +417,303 @@ sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
      e = ctx->E += e;
    }
}

#if defined(HAVE_X86_SHA1_HW_SUPPORT)
/* HW specific version of sha1_process_bytes.  */

static void sha1_hw_process_block (const void *, size_t, struct sha1_ctx *);

static void
sha1_hw_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
  /* When we already have some bits in our internal buffer concatenate
     both inputs first.  */
  if (ctx->buflen != 0)
    {
      size_t left_over = ctx->buflen;
      size_t add = 128 - left_over > len ? len : 128 - left_over;

      memcpy (&((char *) ctx->buffer)[left_over], buffer, add);
      ctx->buflen += add;

      if (ctx->buflen > 64)
        {
          sha1_hw_process_block (ctx->buffer, ctx->buflen & ~63, ctx);

          ctx->buflen &= 63;
          /* The regions in the following copy operation cannot overlap.  */
          memcpy (ctx->buffer,
                  &((char *) ctx->buffer)[(left_over + add) & ~63],
                  ctx->buflen);
        }

      buffer = (const char *) buffer + add;
      len -= add;
    }

  /* Process available complete blocks.  */
  if (len >= 64)
    {
#if !_STRING_ARCH_unaligned
# define alignof(type) offsetof (struct { char c; type x; }, x)
# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
      if (UNALIGNED_P (buffer))
        while (len > 64)
          {
            sha1_hw_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx);
            buffer = (const char *) buffer + 64;
            len -= 64;
          }
      else
#endif
        {
          sha1_hw_process_block (buffer, len & ~63, ctx);
          buffer = (const char *) buffer + (len & ~63);
          len &= 63;
        }
    }

  /* Move remaining bytes in internal buffer.  */
  if (len > 0)
    {
      size_t left_over = ctx->buflen;

      memcpy (&((char *) ctx->buffer)[left_over], buffer, len);
      left_over += len;
      if (left_over >= 64)
        {
          sha1_hw_process_block (ctx->buffer, 64, ctx);
          left_over -= 64;
          memmove (ctx->buffer, &ctx->buffer[16], left_over);
        }
      ctx->buflen = left_over;
    }
}

/* Process LEN bytes of BUFFER, accumulating context into CTX.
   Using CPU specific intrinsics.  */

#ifdef HAVE_X86_SHA1_HW_SUPPORT
__attribute__((__target__ ("sse4.1,sha")))
#endif
static void
sha1_hw_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
#ifdef HAVE_X86_SHA1_HW_SUPPORT
  /* Implemented from
     https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html  */
  const __m128i *words = (const __m128i *) buffer;
  const __m128i *endp = (const __m128i *) ((const char *) buffer + len);
  __m128i abcd, abcd_save, e0, e0_save, e1, msg0, msg1, msg2, msg3;
  const __m128i shuf_mask
    = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
  char check[((offsetof (struct sha1_ctx, B)
               == offsetof (struct sha1_ctx, A) + sizeof (ctx->A))
              && (offsetof (struct sha1_ctx, C)
                  == offsetof (struct sha1_ctx, A) + 2 * sizeof (ctx->A))
              && (offsetof (struct sha1_ctx, D)
                  == offsetof (struct sha1_ctx, A) + 3 * sizeof (ctx->A)))
             ? 1 : -1];

  /* First increment the byte count.  RFC 1321 specifies the possible
     length of the file up to 2^64 bits.  Here we only compute the
     number of bytes.  Do a double word increment.  */
  ctx->total[0] += len;
  ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len);
  (void) &check[0];

  abcd = _mm_loadu_si128 ((const __m128i *) &ctx->A);
  e0 = _mm_set_epi32 (ctx->E, 0, 0, 0);
  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */

  while (words < endp)
    {
      abcd_save = abcd;
      e0_save = e0;

      /* 0..3 */
      msg0 = _mm_loadu_si128 (words);
      msg0 = _mm_shuffle_epi8 (msg0, shuf_mask);
      e0 = _mm_add_epi32 (e0, msg0);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);

      /* 4..7 */
      msg1 = _mm_loadu_si128 (words + 1);
      msg1 = _mm_shuffle_epi8 (msg1, shuf_mask);
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);

      /* 8..11 */
      msg2 = _mm_loadu_si128 (words + 2);
      msg2 = _mm_shuffle_epi8 (msg2, shuf_mask);
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 12..15 */
      msg3 = _mm_loadu_si128 (words + 3);
      msg3 = _mm_shuffle_epi8 (msg3, shuf_mask);
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 16..19 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 20..23 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 24..27 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 28..31 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 32..35 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 36..39 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 40..43 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 44..47 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 48..51 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 52..55 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 56..59 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 60..63 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 64..67 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 68..71 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 72..75 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);

      /* 76..79 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);

      /* Finalize.  */
      e0 = _mm_sha1nexte_epu32 (e0, e0_save);
      abcd = _mm_add_epi32 (abcd, abcd_save);

      words = words + 4;
    }

  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
  _mm_storeu_si128 ((__m128i *) &ctx->A, abcd);
  ctx->E = _mm_extract_epi32 (e0, 3);
#endif
}
#endif

/* Return sha1_process_bytes or some hardware optimized version thereof
   depending on current CPU.  */

sha1_process_bytes_fn
sha1_choose_process_bytes (void)
{
#ifdef HAVE_X86_SHA1_HW_SUPPORT
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
      && (ebx & bit_SHA) != 0
      && __get_cpuid (1, &eax, &ebx, &ecx, &edx)
      && (ecx & bit_SSE4_1) != 0)
    return sha1_hw_process_bytes;
#endif
  return sha1_process_bytes;
}
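
Both implementations must produce identical digests for the same input, so a
quick sanity check is to hash a buffer once with plain sha1_process_bytes and
once with whatever sha1_choose_process_bytes returns, then compare.  A small
test sketch, assuming libiberty's sha1_init_ctx/sha1_finish_ctx API:

#include <stdio.h>
#include <string.h>
#include "sha1.h"

int
main (void)
{
  static char buf[256 * 1024];
  unsigned char d1[20], d2[20];
  struct sha1_ctx ctx;

  memset (buf, 0x5a, sizeof buf);

  /* Generic C implementation.  */
  sha1_init_ctx (&ctx);
  sha1_process_bytes (buf, sizeof buf, &ctx);
  sha1_finish_ctx (&ctx, d1);

  /* Whatever the runtime dispatch selected for this CPU.  */
  sha1_init_ctx (&ctx);
  sha1_choose_process_bytes () (buf, sizeof buf, &ctx);
  sha1_finish_ctx (&ctx, d2);

  printf (memcmp (d1, d2, 20) == 0 ? "match\n" : "MISMATCH\n");
  return 0;
}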