deps: update base64 to 0.5.1

PR-URL: https://github.com/nodejs/node/pull/50629
Fixes: https://github.com/nodejs/node/issues/50561
Fixes: https://github.com/nodejs/node/pull/45091
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
Reviewed-By: Mohammed Keyvanzadeh <mohammadkeyvanzade94@gmail.com>
Reviewed-By: Yagiz Nizipli <yagiz@nizipli.com>
Reviewed-By: Richard Lau <rlau@redhat.com>
This commit is contained in:
Node.js GitHub Bot 2023-11-08 19:48:51 +00:00 committed by Luigi Pinca
parent 3cce03a03f
commit f45bb801b6
32 changed files with 1694 additions and 145 deletions

View File

@ -49,6 +49,7 @@
'HAVE_SSE42=1',
'HAVE_AVX=1',
'HAVE_AVX2=1',
'HAVE_AVX512=1',
],
'dependencies': [
'base64_ssse3',
@ -56,6 +57,7 @@
'base64_sse42',
'base64_avx',
'base64_avx2',
'base64_avx512',
],
}, {
'sources': [
@ -64,6 +66,7 @@
'base64/lib/arch/sse42/codec.c',
'base64/lib/arch/avx/codec.c',
'base64/lib/arch/avx2/codec.c',
'base64/lib/arch/avx512/codec.c',
],
}],
],
@ -165,6 +168,30 @@
],
},
{
'target_name': 'base64_avx512',
'type': 'static_library',
'include_dirs': [ 'base64/include', 'base64/lib' ],
'sources': [ 'base64/lib/arch/avx512/codec.c' ],
'defines': [ 'BASE64_STATIC_DEFINE', 'HAVE_AVX512=1' ],
'conditions': [
[ 'OS!="win"', {
'cflags': [ '-mavx512vl', '-mavx512vbmi' ],
'xcode_settings': {
'OTHER_CFLAGS': [ '-mavx512vl', '-mavx512vbmi' ]
},
}, {
'msvs_settings': {
'VCCLCompilerTool': {
'AdditionalOptions': [
'/arch:AVX512'
],
},
},
}],
],
},
{
'target_name': 'base64_neon32',
'type': 'static_library',

View File

@ -1,12 +1 @@
*.o
bin/base64
lib/config.h
test/benchmark
test/test_base64
# visual studio symbol db, etc.
.vs/
# build directory used by CMakePresets
out/
# private cmake presets
CMakeUserPresets.json
# Intentionally empty

View File

@ -17,7 +17,7 @@ if (POLICY CMP0127)
cmake_policy(SET CMP0127 NEW)
endif()
project(base64 LANGUAGES C VERSION 0.5.0)
project(base64 LANGUAGES C VERSION 0.5.1)
include(GNUInstallDirs)
include(CMakeDependentOption)
@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF)
add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath")
cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF)
add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath")
cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF)
add_feature_info(AVX512 BASE64_WITH_AVX512 "add AVX512 codepath")
cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF)
add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath")
@ -118,6 +120,7 @@ add_library(base64
lib/arch/sse42/codec.c
lib/arch/avx/codec.c
lib/arch/avx2/codec.c
lib/arch/avx512/codec.c
lib/arch/neon32/codec.c
lib/arch/neon64/codec.c
@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
configure_codec(SSE42 __SSSE4_2__)
configure_codec(AVX)
configure_codec(AVX2)
configure_codec(AVX512)
elseif (_TARGET_ARCH STREQUAL "arm")
set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')")

View File

@ -1,7 +1,7 @@
Copyright (c) 2005-2007, Nick Galbreath
Copyright (c) 2013-2019, Alfred Klomp
Copyright (c) 2015-2017, Wojciech Mula
Copyright (c) 2015-2018, Wojciech Muła
Copyright (c) 2016-2017, Matthieu Darbois
Copyright (c) 2013-2022, Alfred Klomp
All rights reserved.
Redistribution and use in source and binary forms, with or without

View File

@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic
OBJCOPY ?= objcopy
OBJS = \
lib/arch/avx512/codec.o \
lib/arch/avx2/codec.o \
lib/arch/generic/codec.o \
lib/arch/neon32/codec.o \
@ -16,6 +17,7 @@ OBJS = \
lib/codec_choose.o \
lib/tables/tables.o
HAVE_AVX512 = 0
HAVE_AVX2 = 0
HAVE_NEON32 = 0
HAVE_NEON64 = 0
@ -26,6 +28,9 @@ HAVE_AVX = 0
# The user should supply compiler flags for the codecs they want to build.
# Check which codecs we're going to include:
ifdef AVX512_CFLAGS
HAVE_AVX512 = 1
endif
ifdef AVX2_CFLAGS
HAVE_AVX2 = 1
endif
@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS)
$(OBJCOPY) --keep-global-symbols=lib/exports.txt $@
lib/config.h:
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@
@echo "#define HAVE_AVX512 $(HAVE_AVX512)" > $@
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" >> $@
@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
@echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@
@ -75,6 +81,7 @@ lib/config.h:
$(OBJS): lib/config.h
$(OBJS): CFLAGS += -Ilib
lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS)
lib/arch/avx2/codec.o: CFLAGS += $(AVX2_CFLAGS)
lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS)

View File

@ -3,7 +3,7 @@
[![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml)
This is an implementation of a base64 stream encoding/decoding library in C99
with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
[OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions
to encode/decode simple length-delimited strings. This library aims to be:
@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a
time, which gives a speedup of four or more times compared to the "plain"
bytewise codec.
AVX512 support is only for encoding at present, utilizing the AVX512 VL and VBMI
instructions. Decoding part reused AVX2 implementations. For CPUs later than
Cannonlake (manufactured in 2018) supports these instructions.
NEON support is hardcoded to on or off at compile time, because portable
runtime feature detection is unavailable on ARM.
@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a
[articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html).
His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64).
The AVX512 encoder is based on code from Wojciech Muła's
[base64simd](https://github.com/WojciechMula/base64simd) library.
The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl).
## Building
@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type:
make lib/libbase64.o
```
Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`,
`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`,
`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
A typical build invocation on x86 looks like this:
```sh
@ -93,6 +100,15 @@ Example:
AVX2_CFLAGS=-mavx2 make
```
### AVX512
To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`.
Example:
```sh
AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make
```
The codec will only be used if runtime feature detection shows that the target machine supports AVX2.
### SSSE3
@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way
The following constants can be used:
- `BASE64_FORCE_AVX2`
- `BASE64_FORCE_AVX512`
- `BASE64_FORCE_NEON32`
- `BASE64_FORCE_NEON64`
- `BASE64_FORCE_PLAIN`
@ -434,7 +451,7 @@ x86 processors
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884\* | 7099\* | 4917\* | 7057\* | 4799\* | 7143\* | 4902\* | 7219\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212\* | 8849\* | 5284\* | 9099\* | 5289\* | 9220\* | 4849\* | 9200\* |
| i7-4870HQ @ 2.5 GHz | 1471\* | 3066\* | 6721\* | 6962\* | 7015\* | 8267\* | 8328\* | 11576\* |
| i5-4590S @ 3.0 GHz | 3356 | 3197 | 4363 | 6104 | 4243 | 6233 | 4160 | 6344 |
| i5-4590S @ 3.0 GHz | 3356 | 3197 | 4363 | 6104 | 4243\* | 6233 | 4160\* | 6344 |
| Xeon X5570 @ 2.93 GHz | 2161 | 1508 | 3160 | 3915 | - | - | - | - |
| Pentium4 @ 3.4 GHz | 896 | 740 | - | - | - | - | - | - |
| Atom N270 | 243 | 266 | 508 | 387 | - | - | - | - |

View File

@ -1,128 +1,477 @@
#include <stddef.h> // size_t
#include <stdio.h> // fopen()
#include <string.h> // strlen()
#define _XOPEN_SOURCE // IOV_MAX
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <limits.h>
#include <sys/uio.h>
#include "../include/libbase64.h"
#define BUFSIZE 1024 * 1024
// Size of the buffer for the "raw" (not base64-encoded) data in bytes.
#define BUFFER_RAW_SIZE (1024 * 1024)
static char buf[BUFSIZE];
static char out[(BUFSIZE * 5) / 3]; // Technically 4/3 of input, but take some margin
size_t nread;
size_t nout;
// Size of the buffer for the base64-encoded data in bytes. The base64-encoded
// data is 4/3 the size of the input, with some margin to be sure.
#define BUFFER_ENC_SIZE (BUFFER_RAW_SIZE * 4 / 3 + 16)
static int
enc (FILE *fp)
// Global config structure.
struct config {
// Name by which the program was called on the command line.
const char *name;
// Name of the input file for logging purposes.
const char *file;
// Input file handle.
FILE *fp;
// Wrap width in characters, for encoding only.
size_t wrap;
// Whether to run in decode mode.
bool decode;
// Whether to just print the help text and exit.
bool print_help;
};
// Input/output buffer structure.
struct buffer {
// Runtime-allocated buffer for raw (unencoded) data.
char *raw;
// Runtime-allocated buffer for base64-encoded data.
char *enc;
};
static bool
buffer_alloc (const struct config *config, struct buffer *buf)
{
int ret = 1;
if ((buf->raw = malloc(BUFFER_RAW_SIZE)) == NULL ||
(buf->enc = malloc(BUFFER_ENC_SIZE)) == NULL) {
free(buf->raw);
fprintf(stderr, "%s: malloc: %s\n",
config->name, strerror(errno));
return false;
}
return true;
}
static void
buffer_free (struct buffer *buf)
{
free(buf->raw);
free(buf->enc);
}
static bool
writev_retry (const struct config *config, struct iovec *iov, size_t nvec)
{
// Writing nothing always succeeds.
if (nvec == 0) {
return true;
}
while (true) {
ssize_t nwrite;
// Try to write the vectors to stdout.
if ((nwrite = writev(1, iov, nvec)) < 0) {
// Retry on EINTR.
if (errno == EINTR) {
continue;
}
// Quit on other errors.
fprintf(stderr, "%s: writev: %s\n",
config->name, strerror(errno));
return false;
}
// The return value of `writev' is the number of bytes written.
// To check for success, we traverse the list and remove all
// written vectors. The call succeeded if the list is empty.
while (true) {
// Retry if this vector is not or partially written.
if (iov->iov_len > (size_t) nwrite) {
char *base = iov->iov_base;
iov->iov_base = (size_t) nwrite + base;
iov->iov_len -= (size_t) nwrite;
break;
}
// Move to the next vector.
nwrite -= iov->iov_len;
iov++;
// Return successfully if all vectors were written.
if (--nvec == 0) {
return true;
}
}
}
}
static inline bool
iov_append (const struct config *config, struct iovec *iov,
size_t *nvec, char *base, const size_t len)
{
// Add the buffer to the IO vector array.
iov[*nvec].iov_base = base;
iov[*nvec].iov_len = len;
// Increment the array index. Flush the array if it is full.
if (++(*nvec) == IOV_MAX) {
if (writev_retry(config, iov, IOV_MAX) == false) {
return false;
}
*nvec = 0;
}
return true;
}
static bool
write_stdout (const struct config *config, const char *buf, size_t len)
{
while (len > 0) {
ssize_t nwrite;
// Try to write the buffer to stdout.
if ((nwrite = write(1, buf, len)) < 0) {
// Retry on EINTR.
if (errno == EINTR) {
continue;
}
// Quit on other errors.
fprintf(stderr, "%s: write: %s\n",
config->name, strerror(errno));
return false;
}
// Update the buffer position.
buf += (size_t) nwrite;
len -= (size_t) nwrite;
}
return true;
}
static bool
write_wrapped (const struct config *config, char *buf, size_t len)
{
static size_t col = 0;
// Special case: if buf is NULL, print final trailing newline.
if (buf == NULL) {
if (config->wrap > 0 && col > 0) {
return write_stdout(config, "\n", 1);
}
return true;
}
// If no wrap width is given, write the entire buffer.
if (config->wrap == 0) {
return write_stdout(config, buf, len);
}
// Statically allocated IO vector buffer.
static struct iovec iov[IOV_MAX];
size_t nvec = 0;
while (len > 0) {
// Number of characters to fill the current line.
size_t nwrite = config->wrap - col;
// Do not write more data than is available.
if (nwrite > len) {
nwrite = len;
}
// Append the data to the IO vector array.
if (iov_append(config, iov, &nvec, buf, nwrite) == false) {
return false;
}
// Advance the buffer.
len -= nwrite;
buf += nwrite;
col += nwrite;
// If the line is full, append a newline.
if (col == config->wrap) {
if (iov_append(config, iov, &nvec, "\n", 1) == false) {
return false;
}
col = 0;
}
}
// Write the remaining vectors.
if (writev_retry(config, iov, nvec) == false) {
return false;
}
return true;
}
static bool
encode (const struct config *config, struct buffer *buf)
{
size_t nread, nout;
struct base64_state state;
// Initialize the encoder's state structure.
base64_stream_encode_init(&state, 0);
while ((nread = fread(buf, 1, BUFSIZE, fp)) > 0) {
base64_stream_encode(&state, buf, nread, out, &nout);
if (nout) {
fwrite(out, nout, 1, stdout);
}
if (feof(fp)) {
break;
}
}
if (ferror(fp)) {
fprintf(stderr, "read error\n");
ret = 0;
goto out;
}
base64_stream_encode_final(&state, out, &nout);
// Read raw data into the buffer.
while ((nread = fread(buf->raw, 1, BUFFER_RAW_SIZE, config->fp)) > 0) {
if (nout) {
fwrite(out, nout, 1, stdout);
// Encode the raw input into the encoded buffer.
base64_stream_encode(&state, buf->raw, nread, buf->enc, &nout);
// Append the encoded data to the output stream.
if (write_wrapped(config, buf->enc, nout) == false) {
return false;
}
}
out: fclose(fp);
fclose(stdout);
return ret;
// Check for stream errors.
if (ferror(config->fp)) {
fprintf(stderr, "%s: %s: read error\n",
config->name, config->file);
return false;
}
// Finalize the encoding by adding proper stream terminators.
base64_stream_encode_final(&state, buf->enc, &nout);
// Append this tail to the output stream.
if (write_wrapped(config, buf->enc, nout) == false) {
return false;
}
// Print optional trailing newline.
if (write_wrapped(config, NULL, 0) == false) {
return false;
}
return true;
}
static int
dec (FILE *fp)
decode (const struct config *config, struct buffer *buf)
{
int ret = 1;
size_t nread, nout;
struct base64_state state;
// Initialize the decoder's state structure.
base64_stream_decode_init(&state, 0);
while ((nread = fread(buf, 1, BUFSIZE, fp)) > 0) {
if (!base64_stream_decode(&state, buf, nread, out, &nout)) {
fprintf(stderr, "decoding error\n");
ret = 0;
goto out;
// Read encoded data into the buffer. Use the smallest buffer size to
// be on the safe side: the decoded output will fit the raw buffer.
while ((nread = fread(buf->enc, 1, BUFFER_RAW_SIZE, config->fp)) > 0) {
// Decode the input into the raw buffer.
if (base64_stream_decode(&state, buf->enc, nread,
buf->raw, &nout) == 0) {
fprintf(stderr, "%s: %s: decoding error\n",
config->name, config->file);
return false;
}
if (nout) {
fwrite(out, nout, 1, stdout);
// Append the raw data to the output stream.
if (write_stdout(config, buf->raw, nout) == false) {
return false;
}
if (feof(fp)) {
}
// Check for stream errors.
if (ferror(config->fp)) {
fprintf(stderr, "%s: %s: read error\n",
config->name, config->file);
return false;
}
return true;
}
static void
usage (FILE *fp, const struct config *config)
{
const char *usage =
"Usage: %s [OPTION]... [FILE]\n"
"If no FILE is given or is specified as '-', "
"read from standard input.\n"
"Options:\n"
" -d, --decode Decode a base64 stream.\n"
" -h, --help Print this help text.\n"
" -w, --wrap=COLS Wrap encoded lines at this column. "
"Default 76, 0 to disable.\n";
fprintf(fp, usage, config->name);
}
static bool
get_wrap (struct config *config, const char *str)
{
char *eptr;
// Reject empty strings.
if (*str == '\0') {
return false;
}
// Convert the input string to a signed long.
const long wrap = strtol(str, &eptr, 10);
// Reject negative numbers.
if (wrap < 0) {
return false;
}
// Reject strings containing non-digits.
if (*eptr != '\0') {
return false;
}
config->wrap = (size_t) wrap;
return true;
}
static bool
parse_opts (int argc, char **argv, struct config *config)
{
int c;
static const struct option opts[] = {
{ "decode", no_argument, NULL, 'd' },
{ "help", no_argument, NULL, 'h' },
{ "wrap", required_argument, NULL, 'w' },
{ NULL }
};
// Remember the program's name.
config->name = *argv;
// Parse command line options.
while ((c = getopt_long(argc, argv, ":dhw:", opts, NULL)) != -1) {
switch (c) {
case 'd':
config->decode = true;
break;
case 'h':
config->print_help = true;
return true;
case 'w':
if (get_wrap(config, optarg) == false) {
fprintf(stderr,
"%s: invalid wrap value '%s'\n",
config->name, optarg);
return false;
}
break;
case ':':
fprintf(stderr, "%s: missing argument for '%c'\n",
config->name, optopt);
return false;
default:
fprintf(stderr, "%s: unknown option '%c'\n",
config->name, optopt);
return false;
}
}
if (ferror(fp)) {
fprintf(stderr, "read error\n");
ret = 0;
// Return successfully if no filename was given.
if (optind >= argc) {
return true;
}
out: fclose(fp);
fclose(stdout);
return ret;
// Return unsuccessfully if more than one filename was given.
if (optind + 1 < argc) {
fprintf(stderr, "%s: too many files\n", config->name);
return false;
}
// For compatibility with GNU Coreutils base64, treat a filename of '-'
// as standard input.
if (strcmp(argv[optind], "-") == 0) {
return true;
}
// Save the name of the file.
config->file = argv[optind];
// Open the file.
if ((config->fp = fopen(config->file, "rb")) == NULL) {
fprintf(stderr, "%s: %s: %s\n",
config->name, config->file, strerror(errno));
return false;
}
return true;
}
int
main (int argc, char **argv)
{
char *file;
FILE *fp;
int decode = 0;
// Default program config.
struct config config = {
.file = "stdin",
.fp = stdin,
.wrap = 76,
.decode = false,
.print_help = false,
};
struct buffer buf;
// Parse options:
for (;;)
{
int c;
int opt_index = 0;
static struct option opt_long[] = {
{ "decode", 0, 0, 'd' },
{ 0, 0, 0, 0 }
};
if ((c = getopt_long(argc, argv, "d", opt_long, &opt_index)) == -1) {
break;
}
switch (c)
{
case 'd':
decode = 1;
break;
}
}
// No options left on command line? Read from stdin:
if (optind >= argc) {
fp = stdin;
}
// One option left on command line? Treat it as a file:
else if (optind + 1 == argc) {
file = argv[optind];
if (strcmp(file, "-") == 0) {
fp = stdin;
}
else if ((fp = fopen(file, "rb")) == NULL) {
printf("cannot open %s\n", file);
return 1;
}
}
// More than one option left on command line? Syntax error:
else {
printf("Usage: %s <file>\n", argv[0]);
// Parse options from the command line.
if (parse_opts(argc, argv, &config) == false) {
usage(stderr, &config);
return 1;
}
// Invert return codes to create shell return code:
return (decode) ? !dec(fp) : !enc(fp);
// Return early if the user just wanted the help text.
if (config.print_help) {
usage(stdout, &config);
return 0;
}
// Allocate buffers.
if (buffer_alloc(&config, &buf) == false) {
return 1;
}
// Encode or decode the input based on the user's choice.
const bool ret = config.decode
? decode(&config, &buf)
: encode(&config, &buf);
// Free the buffers.
buffer_free(&buf);
// Close the input file.
fclose(config.fp);
// Close the output stream.
fclose(stdout);
// That's all, folks.
return ret ? 0 : 1;
}

View File

@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags)
set(COMPILE_FLAGS_SSE42 "-msse4.2")
set(COMPILE_FLAGS_AVX "-mavx")
set(COMPILE_FLAGS_AVX2 "-mavx2")
set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi")
#arm
set(COMPILE_FLAGS_NEON32 "-mfpu=neon")
@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags)
set(COMPILE_FLAGS_SSE42 " ")
set(COMPILE_FLAGS_AVX "/arch:AVX")
set(COMPILE_FLAGS_AVX2 "/arch:AVX2")
set(COMPILE_FLAGS_AVX512 "/arch:AVX512")
endif()
endmacro(define_SIMD_compile_flags)

View File

@ -16,6 +16,9 @@
#cmakedefine01 BASE64_WITH_AVX2
#define HAVE_AVX2 BASE64_WITH_AVX2
#cmakedefine01 BASE64_WITH_AVX512
#define HAVE_AVX512 BASE64_WITH_AVX512
#cmakedefine01 BASE64_WITH_NEON32
#define HAVE_NEON32 BASE64_WITH_NEON32

View File

@ -53,6 +53,7 @@ extern "C" {
#define BASE64_FORCE_SSE41 (1 << 5)
#define BASE64_FORCE_SSE42 (1 << 6)
#define BASE64_FORCE_AVX (1 << 7)
#define BASE64_FORCE_AVX512 (1 << 8)
struct base64_state {
int eof;

View File

@ -11,11 +11,25 @@
#if HAVE_AVX
#include <immintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_AVX_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_AVX_USE_ASM 1
# else
# define BASE64_AVX_USE_ASM 0
# endif
#endif
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/dec_loop.c"
#include "../ssse3/enc_translate.c"
#include "../ssse3/enc_reshuffle.c"
#include "../ssse3/enc_loop.c"
#if BASE64_AVX_USE_ASM
# include "enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
# include "../ssse3/enc_loop.c"
#endif
#endif // HAVE_AVX
@ -23,7 +37,17 @@ BASE64_ENC_FUNCTION(avx)
{
#if HAVE_AVX
#include "../generic/enc_head.c"
// For supported compilers, use a hand-optimized inline assembly
// encoder. Otherwise fall back on the SSSE3 encoder, but compiled with
// AVX flags to generate better optimized AVX code.
#if BASE64_AVX_USE_ASM
enc_loop_avx(&s, &slen, &o, &olen);
#else
enc_loop_ssse3(&s, &slen, &o, &olen);
#endif
#include "../generic/enc_tail.c"
#else
BASE64_ENC_STUB

View File

@ -0,0 +1,264 @@
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.
// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round.
#define LOAD(R0, ROUND) \
"vlddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"
// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. Outputs in R0 and R1.
#define SHUF(R0, R1, R2) \
"vpshufb %[lut0], %["R0"], %["R1"] \n\t" \
"vpand %["R1"], %[msk0], %["R2"] \n\t" \
"vpand %["R1"], %[msk2], %["R1"] \n\t" \
"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
"vpmullw %["R1"], %[msk3], %["R1"] \n\t" \
"vpor %["R1"], %["R2"], %["R1"] \n\t"
// Generate a block of inline assembly that takes R0 and R1 and translates
// their contents to the base64 alphabet, using preloaded constants.
#define TRAN(R0, R1, R2) \
"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
"vpsubb %["R2"], %["R0"], %["R0"] \n\t" \
"vpshufb %["R0"], %[lut1], %["R2"] \n\t" \
"vpaddb %["R1"], %["R2"], %["R0"] \n\t"
// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round.
#define STOR(R0, ROUND) \
"vmovdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"
// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
LOAD("a", 0) \
SHUF("a", "b", "c") \
TRAN("a", "b", "c") \
STOR("a", 0) \
"add $12, %[src] \n\t" \
"add $16, %[dst] \n\t"
// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
// + indicates that a register is loaded by that step.
// | indicates that a register is in use and must not be touched.
// - indicates that a register is decommissioned by that step.
// x indicates that a register is used as a temporary by that step.
// V indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT() /* a b c d e f */ \
LOAD("a", 0) /* + */ \
SHUF("a", "d", "e") /* | + x */ \
LOAD("b", 1) /* | + | */ \
TRAN("a", "d", "e") /* | | - x */ \
LOAD("c", 2) /* V V V */
// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
LOAD(D, (ROUND + 3)) /* V V V + */ \
SHUF(B, E, F) /* | | | | + x */ \
STOR(A, (ROUND + 0)) /* - | | | | */ \
TRAN(B, E, F) /* | | | - x */ \
LOAD(E, (ROUND + 4)) /* | | | + */ \
SHUF(C, A, F) /* + | | | | x */ \
STOR(B, (ROUND + 1)) /* | - | | | */ \
TRAN(C, A, F) /* - | | | x */ \
LOAD(F, (ROUND + 5)) /* | | | + */ \
SHUF(D, A, B) /* + x | | | | */ \
STOR(C, (ROUND + 2)) /* | - | | | */ \
TRAN(D, A, B) /* - x V V V */
// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
SHUF(E, A, B) /* + x V V V */ \
STOR(D, (ROUND + 3)) /* | - | | */ \
TRAN(E, A, B) /* - x | | */ \
SHUF(F, C, D) /* + x | | */ \
STOR(E, (ROUND + 4)) /* | - | */ \
TRAN(F, C, D) /* - x | */ \
STOR(F, (ROUND + 5)) /* - */
// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
ROUND_3_A(ROUND) \
ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
ROUND_3_B(ROUND) \
ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"
static inline void
enc_loop_avx (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
// For a clearer explanation of the algorithm used by this function,
// please refer to the plain (not inline assembly) implementation. This
// function follows the same basic logic.
if (*slen < 16) {
return;
}
// Process blocks of 12 bytes at a time. Input is read in blocks of 16
// bytes, so "reserve" four bytes from the input buffer to ensure that
// we never read beyond the end of the input buffer.
size_t rounds = (*slen - 4) / 12;
*slen -= rounds * 12; // 12 bytes consumed per round
*olen += rounds * 16; // 16 bytes produced per round
// Number of times to go through the 36x loop.
size_t loops = rounds / 36;
// Number of rounds remaining after the 36x loop.
rounds %= 36;
// Lookup tables.
const __m128i lut0 = _mm_set_epi8(
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
const __m128i lut1 = _mm_setr_epi8(
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
// Temporary registers.
__m128i a, b, c, d, e, f;
__asm__ volatile (
// If there are 36 rounds or more, enter a 36x unrolled loop of
// interleaved encoding rounds. The rounds interleave memory
// operations (load/store) with data operations (table lookups,
// etc) to maximize pipeline throughput.
" test %[loops], %[loops] \n\t"
" jz 18f \n\t"
" jmp 36f \n\t"
" \n\t"
".balign 64 \n\t"
"36: " ROUND_3_INIT()
" " ROUND_3_A( 0)
" " ROUND_3_B( 3)
" " ROUND_3_A( 6)
" " ROUND_3_B( 9)
" " ROUND_3_A(12)
" " ROUND_3_B(15)
" " ROUND_3_A(18)
" " ROUND_3_B(21)
" " ROUND_3_A(24)
" " ROUND_3_B(27)
" " ROUND_3_A_LAST(30)
" add $(12 * 36), %[src] \n\t"
" add $(16 * 36), %[dst] \n\t"
" dec %[loops] \n\t"
" jnz 36b \n\t"
// Enter an 18x unrolled loop for rounds of 18 or more.
"18: cmp $18, %[rounds] \n\t"
" jl 9f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B(3)
" " ROUND_3_A(6)
" " ROUND_3_B(9)
" " ROUND_3_A_LAST(12)
" sub $18, %[rounds] \n\t"
" add $(12 * 18), %[src] \n\t"
" add $(16 * 18), %[dst] \n\t"
// Enter a 9x unrolled loop for rounds of 9 or more.
"9: cmp $9, %[rounds] \n\t"
" jl 6f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B_LAST(3)
" sub $9, %[rounds] \n\t"
" add $(12 * 9), %[src] \n\t"
" add $(16 * 9), %[dst] \n\t"
// Enter a 6x unrolled loop for rounds of 6 or more.
"6: cmp $6, %[rounds] \n\t"
" jl 55f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A_LAST(0)
" sub $6, %[rounds] \n\t"
" add $(12 * 6), %[src] \n\t"
" add $(16 * 6), %[dst] \n\t"
// Dispatch the remaining rounds 0..5.
"55: cmp $3, %[rounds] \n\t"
" jg 45f \n\t"
" je 3f \n\t"
" cmp $1, %[rounds] \n\t"
" jg 2f \n\t"
" je 1f \n\t"
" jmp 0f \n\t"
"45: cmp $4, %[rounds] \n\t"
" je 4f \n\t"
// Block of non-interlaced encoding rounds, which can each
// individually be jumped to. Rounds fall through to the next.
"5: " ROUND()
"4: " ROUND()
"3: " ROUND()
"2: " ROUND()
"1: " ROUND()
"0: \n\t"
// Outputs (modified).
: [rounds] "+r" (rounds),
[loops] "+r" (loops),
[src] "+r" (*s),
[dst] "+r" (*o),
[a] "=&x" (a),
[b] "=&x" (b),
[c] "=&x" (c),
[d] "=&x" (d),
[e] "=&x" (e),
[f] "=&x" (f)
// Inputs (not modified).
: [lut0] "x" (lut0),
[lut1] "x" (lut1),
[msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
[msk1] "x" (_mm_set1_epi32(0x04000040)),
[msk2] "x" (_mm_set1_epi32(0x003F03F0)),
[msk3] "x" (_mm_set1_epi32(0x01000010)),
[n51] "x" (_mm_set1_epi8(51)),
[n25] "x" (_mm_set1_epi8(25))
// Clobbers.
: "cc", "memory"
);
}
#pragma GCC diagnostic pop

View File

@ -11,11 +11,25 @@
#if HAVE_AVX2
#include <immintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_AVX2_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_AVX2_USE_ASM 1
# else
# define BASE64_AVX2_USE_ASM 0
# endif
#endif
#include "dec_reshuffle.c"
#include "dec_loop.c"
#include "enc_translate.c"
#include "enc_reshuffle.c"
#include "enc_loop.c"
#if BASE64_AVX2_USE_ASM
# include "enc_loop_asm.c"
#else
# include "enc_translate.c"
# include "enc_reshuffle.c"
# include "enc_loop.c"
#endif
#endif // HAVE_AVX2

View File

@ -0,0 +1,291 @@
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.
// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round and a
// constant offset.
#define LOAD(R0, ROUND, OFFSET) \
"vlddqu ("#ROUND" * 24 + "#OFFSET")(%[src]), %["R0"] \n\t"
// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. Outputs in R0 and R1.
#define SHUF(R0, R1, R2) \
"vpshufb %[lut0], %["R0"], %["R1"] \n\t" \
"vpand %["R1"], %[msk0], %["R2"] \n\t" \
"vpand %["R1"], %[msk2], %["R1"] \n\t" \
"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
"vpmullw %["R1"], %[msk3], %["R1"] \n\t" \
"vpor %["R1"], %["R2"], %["R1"] \n\t"
// Generate a block of inline assembly that takes R0 and R1 and translates
// their contents to the base64 alphabet, using preloaded constants.
#define TRAN(R0, R1, R2) \
"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
"vpsubb %["R2"], %["R0"], %["R0"] \n\t" \
"vpshufb %["R0"], %[lut1], %["R2"] \n\t" \
"vpaddb %["R1"], %["R2"], %["R0"] \n\t"
// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round.
#define STOR(R0, ROUND) \
"vmovdqu %["R0"], ("#ROUND" * 32)(%[dst]) \n\t"
// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
LOAD("a", 0, -4) \
SHUF("a", "b", "c") \
TRAN("a", "b", "c") \
STOR("a", 0) \
"add $24, %[src] \n\t" \
"add $32, %[dst] \n\t"
// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
// + indicates that a register is loaded by that step.
// | indicates that a register is in use and must not be touched.
// - indicates that a register is decommissioned by that step.
// x indicates that a register is used as a temporary by that step.
// V indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT() /* a b c d e f */ \
LOAD("a", 0, -4) /* + */ \
SHUF("a", "d", "e") /* | + x */ \
LOAD("b", 1, -4) /* | + | */ \
TRAN("a", "d", "e") /* | | - x */ \
LOAD("c", 2, -4) /* V V V */
// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
LOAD(D, (ROUND + 3), -4) /* V V V + */ \
SHUF(B, E, F) /* | | | | + x */ \
STOR(A, (ROUND + 0)) /* - | | | | */ \
TRAN(B, E, F) /* | | | - x */ \
LOAD(E, (ROUND + 4), -4) /* | | | + */ \
SHUF(C, A, F) /* + | | | | x */ \
STOR(B, (ROUND + 1)) /* | - | | | */ \
TRAN(C, A, F) /* - | | | x */ \
LOAD(F, (ROUND + 5), -4) /* | | | + */ \
SHUF(D, A, B) /* + x | | | | */ \
STOR(C, (ROUND + 2)) /* | - | | | */ \
TRAN(D, A, B) /* - x V V V */
// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
SHUF(E, A, B) /* + x V V V */ \
STOR(D, (ROUND + 3)) /* | - | | */ \
TRAN(E, A, B) /* - x | | */ \
SHUF(F, C, D) /* + x | | */ \
STOR(E, (ROUND + 4)) /* | - | */ \
TRAN(F, C, D) /* - x | */ \
STOR(F, (ROUND + 5)) /* - */
// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
ROUND_3_A(ROUND) \
ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
ROUND_3_B(ROUND) \
ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"
static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
// For a clearer explanation of the algorithm used by this function,
// please refer to the plain (not inline assembly) implementation. This
// function follows the same basic logic.
if (*slen < 32) {
return;
}
// Process blocks of 24 bytes at a time. Because blocks are loaded 32
// bytes at a time an offset of -4, ensure that there will be at least
// 4 remaining bytes after the last round, so that the final read will
// not pass beyond the bounds of the input buffer.
size_t rounds = (*slen - 4) / 24;
*slen -= rounds * 24; // 24 bytes consumed per round
*olen += rounds * 32; // 32 bytes produced per round
// Pre-decrement the number of rounds to get the number of rounds
// *after* the first round, which is handled as a special case.
rounds--;
// Number of times to go through the 36x loop.
size_t loops = rounds / 36;
// Number of rounds remaining after the 36x loop.
rounds %= 36;
// Lookup tables.
const __m256i lut0 = _mm256_set_epi8(
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5);
const __m256i lut1 = _mm256_setr_epi8(
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
// Temporary registers.
__m256i a, b, c, d, e;
// Temporary register f doubles as the shift mask for the first round.
__m256i f = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6);
__asm__ volatile (
// The first loop iteration requires special handling to ensure
// that the read, which is normally done at an offset of -4,
// does not underflow the buffer. Load the buffer at an offset
// of 0 and permute the input to achieve the same effect.
LOAD("a", 0, 0)
"vpermd %[a], %[f], %[a] \n\t"
// Perform the standard shuffling and translation steps.
SHUF("a", "b", "c")
TRAN("a", "b", "c")
// Store the result and increment the source and dest pointers.
"vmovdqu %[a], (%[dst]) \n\t"
"add $24, %[src] \n\t"
"add $32, %[dst] \n\t"
// If there are 36 rounds or more, enter a 36x unrolled loop of
// interleaved encoding rounds. The rounds interleave memory
// operations (load/store) with data operations (table lookups,
// etc) to maximize pipeline throughput.
" test %[loops], %[loops] \n\t"
" jz 18f \n\t"
" jmp 36f \n\t"
" \n\t"
".balign 64 \n\t"
"36: " ROUND_3_INIT()
" " ROUND_3_A( 0)
" " ROUND_3_B( 3)
" " ROUND_3_A( 6)
" " ROUND_3_B( 9)
" " ROUND_3_A(12)
" " ROUND_3_B(15)
" " ROUND_3_A(18)
" " ROUND_3_B(21)
" " ROUND_3_A(24)
" " ROUND_3_B(27)
" " ROUND_3_A_LAST(30)
" add $(24 * 36), %[src] \n\t"
" add $(32 * 36), %[dst] \n\t"
" dec %[loops] \n\t"
" jnz 36b \n\t"
// Enter an 18x unrolled loop for rounds of 18 or more.
"18: cmp $18, %[rounds] \n\t"
" jl 9f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B(3)
" " ROUND_3_A(6)
" " ROUND_3_B(9)
" " ROUND_3_A_LAST(12)
" sub $18, %[rounds] \n\t"
" add $(24 * 18), %[src] \n\t"
" add $(32 * 18), %[dst] \n\t"
// Enter a 9x unrolled loop for rounds of 9 or more.
"9: cmp $9, %[rounds] \n\t"
" jl 6f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B_LAST(3)
" sub $9, %[rounds] \n\t"
" add $(24 * 9), %[src] \n\t"
" add $(32 * 9), %[dst] \n\t"
// Enter a 6x unrolled loop for rounds of 6 or more.
"6: cmp $6, %[rounds] \n\t"
" jl 55f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A_LAST(0)
" sub $6, %[rounds] \n\t"
" add $(24 * 6), %[src] \n\t"
" add $(32 * 6), %[dst] \n\t"
// Dispatch the remaining rounds 0..5.
"55: cmp $3, %[rounds] \n\t"
" jg 45f \n\t"
" je 3f \n\t"
" cmp $1, %[rounds] \n\t"
" jg 2f \n\t"
" je 1f \n\t"
" jmp 0f \n\t"
"45: cmp $4, %[rounds] \n\t"
" je 4f \n\t"
// Block of non-interlaced encoding rounds, which can each
// individually be jumped to. Rounds fall through to the next.
"5: " ROUND()
"4: " ROUND()
"3: " ROUND()
"2: " ROUND()
"1: " ROUND()
"0: \n\t"
// Outputs (modified).
: [rounds] "+r" (rounds),
[loops] "+r" (loops),
[src] "+r" (*s),
[dst] "+r" (*o),
[a] "=&x" (a),
[b] "=&x" (b),
[c] "=&x" (c),
[d] "=&x" (d),
[e] "=&x" (e),
[f] "+x" (f)
// Inputs (not modified).
: [lut0] "x" (lut0),
[lut1] "x" (lut1),
[msk0] "x" (_mm256_set1_epi32(0x0FC0FC00)),
[msk1] "x" (_mm256_set1_epi32(0x04000040)),
[msk2] "x" (_mm256_set1_epi32(0x003F03F0)),
[msk3] "x" (_mm256_set1_epi32(0x01000010)),
[n51] "x" (_mm256_set1_epi8(51)),
[n25] "x" (_mm256_set1_epi8(25))
// Clobbers.
: "cc", "memory"
);
}
#pragma GCC diagnostic pop

View File

@ -0,0 +1,42 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "../../../include/libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_AVX512
#include <immintrin.h>
#include "../avx2/dec_reshuffle.c"
#include "../avx2/dec_loop.c"
#include "enc_reshuffle_translate.c"
#include "enc_loop.c"
#endif // HAVE_AVX512
BASE64_ENC_FUNCTION(avx512)
{
#if HAVE_AVX512
#include "../generic/enc_head.c"
enc_loop_avx512(&s, &slen, &o, &olen);
#include "../generic/enc_tail.c"
#else
BASE64_ENC_STUB
#endif
}
// Reuse AVX2 decoding. Not supporting AVX512 at present
BASE64_DEC_FUNCTION(avx512)
{
#if HAVE_AVX512
#include "../generic/dec_head.c"
dec_loop_avx2(&s, &slen, &o, &olen);
#include "../generic/dec_tail.c"
#else
BASE64_DEC_STUB
#endif
}

View File

@ -0,0 +1,61 @@
static inline void
enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
{
// Load input.
__m512i src = _mm512_loadu_si512((__m512i *) *s);
// Reshuffle, translate, store.
src = enc_reshuffle_translate(src);
_mm512_storeu_si512((__m512i *) *o, src);
*s += 48;
*o += 64;
}
static inline void
enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
if (*slen < 64) {
return;
}
// Process blocks of 48 bytes at a time. Because blocks are loaded 64
// bytes at a time, ensure that there will be at least 24 remaining
// bytes after the last round, so that the final read will not pass
// beyond the bounds of the input buffer.
size_t rounds = (*slen - 24) / 48;
*slen -= rounds * 48; // 48 bytes consumed per round
*olen += rounds * 64; // 64 bytes produced per round
while (rounds > 0) {
if (rounds >= 8) {
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
rounds -= 8;
continue;
}
if (rounds >= 4) {
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
rounds -= 4;
continue;
}
if (rounds >= 2) {
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
rounds -= 2;
continue;
}
enc_loop_avx512_inner(s, o);
break;
}
}

View File

@ -0,0 +1,50 @@
// AVX512 algorithm is based on permutevar and multishift. The code is based on
// https://github.com/WojciechMula/base64simd which is under BSD-2 license.
static inline __m512i
enc_reshuffle_translate (const __m512i input)
{
// 32-bit input
// [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0|
// b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
// output order [1, 2, 0, 1]
// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
// a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
0x04050304,
0x07080607,
0x0a0b090a,
0x0d0e0c0d,
0x10110f10,
0x13141213,
0x16171516,
0x191a1819,
0x1c1d1b1c,
0x1f201e1f,
0x22232122,
0x25262425,
0x28292728,
0x2b2c2a2b,
0x2e2f2d2e);
// Reorder bytes
// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
// a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);
// After multishift a single 32-bit lane has following layout
// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
// a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
// 48, 54, 36, 42, 16, 22, 4, 10
const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
__m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);
// Translate immediatedly after reshuffled.
const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);
// Translation 6-bit values to ASCII.
return _mm512_permutexvar_epi8(shuffled_in, lookup);
}

View File

@ -100,7 +100,8 @@ enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
[n63] "w" (n63)
// Clobbers.
: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
"cc", "memory"
);
}
#endif

View File

@ -160,7 +160,8 @@ enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
// Clobbers.
: "v2", "v3", "v4", "v5",
"v8", "v9", "v10", "v11",
"v12", "v13", "v14", "v15"
"v12", "v13", "v14", "v15",
"cc", "memory"
);
}

View File

@ -11,11 +11,25 @@
#if HAVE_SSE41
#include <smmintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_SSE41_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_SSE41_USE_ASM 1
# else
# define BASE64_SSE41_USE_ASM 0
# endif
#endif
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/dec_loop.c"
#include "../ssse3/enc_translate.c"
#include "../ssse3/enc_reshuffle.c"
#include "../ssse3/enc_loop.c"
#if BASE64_SSE41_USE_ASM
# include "../ssse3/enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
# include "../ssse3/enc_loop.c"
#endif
#endif // HAVE_SSE41

View File

@ -11,11 +11,25 @@
#if HAVE_SSE42
#include <nmmintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_SSE42_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_SSE42_USE_ASM 1
# else
# define BASE64_SSE42_USE_ASM 0
# endif
#endif
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/dec_loop.c"
#include "../ssse3/enc_translate.c"
#include "../ssse3/enc_reshuffle.c"
#include "../ssse3/enc_loop.c"
#if BASE64_SSE42_USE_ASM
# include "../ssse3/enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
# include "../ssse3/enc_loop.c"
#endif
#endif // HAVE_SSE42

View File

@ -11,11 +11,27 @@
#if HAVE_SSSE3
#include <tmmintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
// 32-bit CPUs with SSSE3 support, such as low-end Atoms, only have eight XMM
// registers, which is not enough to run the inline assembly.
#ifndef BASE64_SSSE3_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_SSSE3_USE_ASM 1
# else
# define BASE64_SSSE3_USE_ASM 0
# endif
#endif
#include "dec_reshuffle.c"
#include "dec_loop.c"
#include "enc_reshuffle.c"
#include "enc_translate.c"
#include "enc_loop.c"
#if BASE64_SSSE3_USE_ASM
# include "enc_loop_asm.c"
#else
# include "enc_reshuffle.c"
# include "enc_translate.c"
# include "enc_loop.c"
#endif
#endif // HAVE_SSSE3

View File

@ -0,0 +1,268 @@
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.
// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round.
#define LOAD(R0, ROUND) \
"lddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"
// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. Outputs in R0 and R1.
#define SHUF(R0, R1) \
"pshufb %[lut0], %["R0"] \n\t" \
"movdqa %["R0"], %["R1"] \n\t" \
"pand %[msk0], %["R0"] \n\t" \
"pand %[msk2], %["R1"] \n\t" \
"pmulhuw %[msk1], %["R0"] \n\t" \
"pmullw %[msk3], %["R1"] \n\t" \
"por %["R1"], %["R0"] \n\t"
// Generate a block of inline assembly that takes R0 and R1 and translates
// their contents to the base64 alphabet, using preloaded constants.
#define TRAN(R0, R1, R2) \
"movdqa %["R0"], %["R1"] \n\t" \
"movdqa %["R0"], %["R2"] \n\t" \
"psubusb %[n51], %["R1"] \n\t" \
"pcmpgtb %[n25], %["R2"] \n\t" \
"psubb %["R2"], %["R1"] \n\t" \
"movdqa %[lut1], %["R2"] \n\t" \
"pshufb %["R1"], %["R2"] \n\t" \
"paddb %["R2"], %["R0"] \n\t"
// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round.
#define STOR(R0, ROUND) \
"movdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"
// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
LOAD("a", 0) \
SHUF("a", "b") \
TRAN("a", "b", "c") \
STOR("a", 0) \
"add $12, %[src] \n\t" \
"add $16, %[dst] \n\t"
// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
// + indicates that a register is loaded by that step.
// | indicates that a register is in use and must not be touched.
// - indicates that a register is decommissioned by that step.
// x indicates that a register is used as a temporary by that step.
// V indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT() /* a b c d e f */ \
LOAD("a", 0) /* + */ \
SHUF("a", "d") /* | + */ \
LOAD("b", 1) /* | + | */ \
TRAN("a", "d", "e") /* | | - x */ \
LOAD("c", 2) /* V V V */
// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
LOAD(D, (ROUND + 3)) /* V V V + */ \
SHUF(B, E) /* | | | | + */ \
STOR(A, (ROUND + 0)) /* - | | | | */ \
TRAN(B, E, F) /* | | | - x */ \
LOAD(E, (ROUND + 4)) /* | | | + */ \
SHUF(C, A) /* + | | | | */ \
STOR(B, (ROUND + 1)) /* | - | | | */ \
TRAN(C, A, F) /* - | | | x */ \
LOAD(F, (ROUND + 5)) /* | | | + */ \
SHUF(D, A) /* + | | | | */ \
STOR(C, (ROUND + 2)) /* | - | | | */ \
TRAN(D, A, B) /* - x V V V */
// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
SHUF(E, A) /* + V V V */ \
STOR(D, (ROUND + 3)) /* | - | | */ \
TRAN(E, A, B) /* - x | | */ \
SHUF(F, C) /* + | | */ \
STOR(E, (ROUND + 4)) /* | - | */ \
TRAN(F, C, D) /* - x | */ \
STOR(F, (ROUND + 5)) /* - */
// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
ROUND_3_A(ROUND) \
ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
ROUND_3_B(ROUND) \
ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"
static inline void
enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
// For a clearer explanation of the algorithm used by this function,
// please refer to the plain (not inline assembly) implementation. This
// function follows the same basic logic.
if (*slen < 16) {
return;
}
// Process blocks of 12 bytes at a time. Input is read in blocks of 16
// bytes, so "reserve" four bytes from the input buffer to ensure that
// we never read beyond the end of the input buffer.
size_t rounds = (*slen - 4) / 12;
*slen -= rounds * 12; // 12 bytes consumed per round
*olen += rounds * 16; // 16 bytes produced per round
// Number of times to go through the 36x loop.
size_t loops = rounds / 36;
// Number of rounds remaining after the 36x loop.
rounds %= 36;
// Lookup tables.
const __m128i lut0 = _mm_set_epi8(
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
const __m128i lut1 = _mm_setr_epi8(
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
// Temporary registers.
__m128i a, b, c, d, e, f;
__asm__ volatile (
// If there are 36 rounds or more, enter a 36x unrolled loop of
// interleaved encoding rounds. The rounds interleave memory
// operations (load/store) with data operations (table lookups,
// etc) to maximize pipeline throughput.
" test %[loops], %[loops] \n\t"
" jz 18f \n\t"
" jmp 36f \n\t"
" \n\t"
".balign 64 \n\t"
"36: " ROUND_3_INIT()
" " ROUND_3_A( 0)
" " ROUND_3_B( 3)
" " ROUND_3_A( 6)
" " ROUND_3_B( 9)
" " ROUND_3_A(12)
" " ROUND_3_B(15)
" " ROUND_3_A(18)
" " ROUND_3_B(21)
" " ROUND_3_A(24)
" " ROUND_3_B(27)
" " ROUND_3_A_LAST(30)
" add $(12 * 36), %[src] \n\t"
" add $(16 * 36), %[dst] \n\t"
" dec %[loops] \n\t"
" jnz 36b \n\t"
// Enter an 18x unrolled loop for rounds of 18 or more.
"18: cmp $18, %[rounds] \n\t"
" jl 9f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B(3)
" " ROUND_3_A(6)
" " ROUND_3_B(9)
" " ROUND_3_A_LAST(12)
" sub $18, %[rounds] \n\t"
" add $(12 * 18), %[src] \n\t"
" add $(16 * 18), %[dst] \n\t"
// Enter a 9x unrolled loop for rounds of 9 or more.
"9: cmp $9, %[rounds] \n\t"
" jl 6f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B_LAST(3)
" sub $9, %[rounds] \n\t"
" add $(12 * 9), %[src] \n\t"
" add $(16 * 9), %[dst] \n\t"
// Enter a 6x unrolled loop for rounds of 6 or more.
"6: cmp $6, %[rounds] \n\t"
" jl 55f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A_LAST(0)
" sub $6, %[rounds] \n\t"
" add $(12 * 6), %[src] \n\t"
" add $(16 * 6), %[dst] \n\t"
// Dispatch the remaining rounds 0..5.
"55: cmp $3, %[rounds] \n\t"
" jg 45f \n\t"
" je 3f \n\t"
" cmp $1, %[rounds] \n\t"
" jg 2f \n\t"
" je 1f \n\t"
" jmp 0f \n\t"
"45: cmp $4, %[rounds] \n\t"
" je 4f \n\t"
// Block of non-interlaced encoding rounds, which can each
// individually be jumped to. Rounds fall through to the next.
"5: " ROUND()
"4: " ROUND()
"3: " ROUND()
"2: " ROUND()
"1: " ROUND()
"0: \n\t"
// Outputs (modified).
: [rounds] "+r" (rounds),
[loops] "+r" (loops),
[src] "+r" (*s),
[dst] "+r" (*o),
[a] "=&x" (a),
[b] "=&x" (b),
[c] "=&x" (c),
[d] "=&x" (d),
[e] "=&x" (e),
[f] "=&x" (f)
// Inputs (not modified).
: [lut0] "x" (lut0),
[lut1] "x" (lut1),
[msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
[msk1] "x" (_mm_set1_epi32(0x04000040)),
[msk2] "x" (_mm_set1_epi32(0x003F03F0)),
[msk3] "x" (_mm_set1_epi32(0x01000010)),
[n51] "x" (_mm_set1_epi8(51)),
[n25] "x" (_mm_set1_epi8(25))
// Clobbers.
: "cc", "memory"
);
}
#pragma GCC diagnostic pop

View File

@ -2,6 +2,7 @@
#include <stdint.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include "../include/libbase64.h"
#include "codecs.h"
@ -10,7 +11,7 @@
#if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
#define BASE64_X86
#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2)
#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512)
#define BASE64_X86_SIMD
#endif
#endif
@ -31,7 +32,7 @@
__cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
#else
#include <cpuid.h>
#if HAVE_AVX2 || HAVE_AVX
#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
static inline uint64_t _xgetbv (uint32_t index)
{
@ -45,6 +46,12 @@
#endif
#endif
#ifndef bit_AVX512vl
#define bit_AVX512vl (1 << 31)
#endif
#ifndef bit_AVX512vbmi
#define bit_AVX512vbmi (1 << 1)
#endif
#ifndef bit_AVX2
#define bit_AVX2 (1 << 5)
#endif
@ -75,6 +82,7 @@
BASE64_ENC_FUNCTION(arch); \
BASE64_DEC_FUNCTION(arch); \
BASE64_CODEC_FUNCS(avx512)
BASE64_CODEC_FUNCS(avx2)
BASE64_CODEC_FUNCS(neon32)
BASE64_CODEC_FUNCS(neon64)
@ -91,9 +99,10 @@ codec_choose_forced (struct codec *codec, int flags)
// always allow it, even if the codec is a no-op.
// For testing purposes.
if (!(flags & 0xFF)) {
if (!(flags & 0xFFFF)) {
return false;
}
if (flags & BASE64_FORCE_AVX2) {
codec->enc = base64_stream_encode_avx2;
codec->dec = base64_stream_decode_avx2;
@ -134,6 +143,11 @@ codec_choose_forced (struct codec *codec, int flags)
codec->dec = base64_stream_decode_avx;
return true;
}
if (flags & BASE64_FORCE_AVX512) {
codec->enc = base64_stream_encode_avx512;
codec->dec = base64_stream_decode_avx512;
return true;
}
return false;
}
@ -178,8 +192,8 @@ codec_choose_x86 (struct codec *codec)
max_level = __get_cpuid_max(0, NULL);
#endif
#if HAVE_AVX2 || HAVE_AVX
// Check for AVX/AVX2 support:
#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
// Check for AVX/AVX2/AVX512 support:
// Checking for AVX requires 3 things:
// 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions
// (allowing saving YMM registers on context switch)
@ -194,7 +208,17 @@ codec_choose_x86 (struct codec *codec)
if (ecx & bit_XSAVE_XRSTORE) {
uint64_t xcr_mask;
xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) {
if ((xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) == _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { // check multiple bits at once
#if HAVE_AVX512
if (max_level >= 7) {
__cpuid_count(7, 0, eax, ebx, ecx, edx);
if ((ebx & bit_AVX512vl) && (ecx & bit_AVX512vbmi)) {
codec->enc = base64_stream_encode_avx512;
codec->dec = base64_stream_decode_avx512;
return true;
}
}
#endif
#if HAVE_AVX2
if (max_level >= 7) {
__cpuid_count(7, 0, eax, ebx, ecx, edx);

View File

@ -68,7 +68,7 @@ void
base64_stream_decode_init (struct base64_state *state, int flags)
{
// If any of the codec flags are set, redo choice:
if (codec.dec == NULL || flags & 0xFF) {
if (codec.dec == NULL || flags & 0xFFFF) {
codec_choose(&codec, flags);
}
state->eof = 0;

View File

@ -11,12 +11,15 @@ else
BENCH_LDFLAGS=-lrt
endif
.PHONY: clean test
.PHONY: clean test valgrind
test: clean test_base64 benchmark
./test_base64
./benchmark
valgrind: clean test_base64
valgrind --error-exitcode=2 ./test_base64
test_base64: test_base64.c codec_supported.o ../lib/libbase64.o
$(CC) $(CFLAGS) -o $@ $^

37
deps/base64/base64/test/ci/analysis.sh vendored Executable file
View File

@ -0,0 +1,37 @@
#!/bin/bash
set -ve
MACHINE=$(uname -m)
export CC=gcc
uname -a
clang --version # make analyse
${CC} --version # make -C test valgrind
for USE_ASSEMBLY in 0 1; do
if [ "${MACHINE}" == "x86_64" ]; then
export SSSE3_CFLAGS="-mssse3 -DBASE64_SSSE3_USE_ASM=${USE_ASSEMBLY}"
export SSE41_CFLAGS="-msse4.1 -DBASE64_SSE41_USE_ASM=${USE_ASSEMBLY}"
export SSE42_CFLAGS="-msse4.2 -DBASE64_SSE42_USE_ASM=${USE_ASSEMBLY}"
export AVX_CFLAGS="-mavx -DBASE64_AVX_USE_ASM=${USE_ASSEMBLY}"
export AVX2_CFLAGS="-mavx2 -DBASE64_AVX2_USE_ASM=${USE_ASSEMBLY}"
# Temporarily disable AVX512; it is not available in CI yet.
# export AVX512_CFLAGS="-mavx512vl -mavx512vbmi"
elif [ "${MACHINE}" == "aarch64" ]; then
export NEON64_CFLAGS="-march=armv8-a"
elif [ "${MACHINE}" == "armv7l" ]; then
export NEON32_CFLAGS="-march=armv7-a -mfloat-abi=hard -mfpu=neon"
fi
if [ ${USE_ASSEMBLY} -eq 0 ]; then
echo "::group::analyze"
make analyze
echo "::endgroup::"
fi
echo "::group::valgrind (USE_ASSEMBLY=${USE_ASSEMBLY})"
make clean
make
make -C test valgrind
echo "::endgroup::"
done

View File

@ -7,9 +7,11 @@ if [ "${MACHINE}" == "x86_64" ]; then
export SSE41_CFLAGS=-msse4.1
export SSE42_CFLAGS=-msse4.2
export AVX_CFLAGS=-mavx
# no AVX2 on GHA macOS
# no AVX2 or AVX512 on GHA macOS
if [ "$(uname -s)" != "Darwin" ]; then
export AVX2_CFLAGS=-mavx2
# Temporarily disable AVX512; it is not available in CI yet.
# export AVX512_CFLAGS="-mavx512vl -mavx512vbmi"
fi
elif [ "${MACHINE}" == "aarch64" ]; then
export NEON64_CFLAGS="-march=armv8-a"

View File

@ -11,6 +11,7 @@ static char *_codecs[] =
, "SSE41"
, "SSE42"
, "AVX"
, "AVX512"
, NULL
} ;

View File

@ -1,6 +1,7 @@
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "../include/libbase64.h"
#include "codec_supported.h"
#include "moby_dick.h"
@ -92,7 +93,7 @@ assert_roundtrip (int flags, const char *src)
}
static int
test_char_table (int flags)
test_char_table (int flags, bool use_malloc)
{
bool fail = false;
char chr[256];
@ -107,8 +108,24 @@ test_char_table (int flags)
for (int i = 0; i < 256; i++) {
size_t chrlen = 256 - i;
char* src = &chr[i];
if (use_malloc) {
src = malloc(chrlen); /* malloc/copy this so valgrind can find out-of-bound access */
if (src == NULL) {
printf(
"FAIL: encoding @ %d: allocation of %lu bytes failed\n",
i, (unsigned long)chrlen
);
fail = true;
continue;
}
memcpy(src, &chr[i], chrlen);
}
base64_encode(&chr[i], chrlen, enc, &enclen, BASE64_FORCE_PLAIN);
base64_encode(src, chrlen, enc, &enclen, flags);
if (use_malloc) {
free(src);
}
if (!base64_decode(enc, enclen, dec, &declen, flags)) {
printf("FAIL: decoding @ %d: decoding error\n", i);
@ -198,6 +215,11 @@ test_streaming (int flags)
while (base64_stream_decode(&state, &ref[inpos], (inpos + bs > reflen) ? reflen - inpos : bs, &enc[enclen], &partlen)) {
enclen += partlen;
inpos += bs;
// Has the entire buffer been consumed?
if (inpos >= 400) {
break;
}
}
if (enclen != 256) {
printf("FAIL: stream decoding gave incorrect size: "
@ -336,7 +358,8 @@ test_one_codec (const char *codec, int flags)
fail |= assert_roundtrip(flags, vec[i].out);
}
fail |= test_char_table(flags);
fail |= test_char_table(flags, false); /* test with unaligned input buffer */
fail |= test_char_table(flags, true); /* test for out-of-bound input read */
fail |= test_streaming(flags);
fail |= test_invalid_dec_input(flags);

View File

@ -10,7 +10,7 @@ This a list of all the dependencies:
* [acorn 8.11.2][]
* [ada 2.7.2][]
* [base64 0.5.0][]
* [base64 0.5.1][]
* [brotli 1.0.9][]
* [c-ares 1.20.1][]
* [cjs-module-lexer 1.2.2][]
@ -155,7 +155,7 @@ an abstract syntax tree walker for the ESTree format.
The [ada](https://github.com/ada-url/ada) dependency is a
fast and spec-compliant URL parser written in C++.
### base64 0.5.0
### base64 0.5.1
The [base64](https://github.com/aklomp/base64) dependency is a base64
stream encoding/decoding library in C99 with SIMD and OpenMP acceleration.
@ -320,7 +320,7 @@ performance improvements not currently available in standard zlib.
[acorn 8.11.2]: #acorn-8112
[ada 2.7.2]: #ada-272
[base64 0.5.0]: #base64-050
[base64 0.5.1]: #base64-051
[brotli 1.0.9]: #brotli-109
[c-ares 1.20.1]: #c-ares-1201
[cjs-module-lexer 1.2.2]: #cjs-module-lexer-122

View File

@ -2,5 +2,5 @@
// Refer to tools/dep_updaters/update-base64.sh
#ifndef SRC_BASE64_VERSION_H_
#define SRC_BASE64_VERSION_H_
#define BASE64_VERSION "0.5.0"
#define BASE64_VERSION "0.5.1"
#endif // SRC_BASE64_VERSION_H_