Crypt-Yescrypt
view release on metacpan or search on metacpan
src/yescrypt-opt.c view on Meta::CPAN
/*-
* Copyright 2009 Colin Percival
* Copyright 2012-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
/*
* AVX and especially XOP speed up Salsa20 a lot, but this mostly matters for
* classic scrypt and for YESCRYPT_WORM (which use 8 rounds of Salsa20 per
* sub-block), and much less so for YESCRYPT_RW (which uses 2 rounds of Salsa20
* per block except during pwxform S-box initialization).
*/
/*
 * Build-time diagnostics: report which SIMD instruction set the compiler has
 * enabled for this translation unit.  These are #warning directives, not
 * #error, so they are informational only.
 *
 * Exactly one branch of the chain fires: the first whose condition holds.
 * The tests are therefore ordered from the most capable instruction set
 * (XOP) down to plain SSE2, then x86 without SSE2, then everything else.
 * NOTE(review): presumably a compiler targeting XOP also defines __AVX__ and
 * __SSE2__, which is why the order must not be changed - confirm before
 * reordering.
 */
#ifdef __XOP__
#warning "Note: XOP is enabled. That's great."
#elif defined(__AVX__)
#warning "Note: AVX is enabled, which is great for classic scrypt and YESCRYPT_WORM, but is sometimes slightly slower than plain SSE2 for YESCRYPT_RW"
#elif defined(__SSE2__)
#warning "Note: AVX and XOP are not enabled, which is great for YESCRYPT_RW, but they would substantially improve performance at classic scrypt and YESCRYPT_WORM"
#elif defined(__x86_64__) || defined(__i386__)
/* An x86 target (32- or 64-bit) built without SSE2 support. */
#warning "SSE2 not enabled. Expect poor performance."
#else
/* Neither __x86_64__ nor __i386__ is defined: not an x86 build. */
#warning "Note: building generic code for non-x86. That's OK."
#endif
/*
* The SSE4 code version has fewer instructions than the generic SSE2 version,
* but all of the instructions are SIMD, thereby wasting the scalar execution
* units. Thus, the generic SSE2 version below actually runs faster on some
* CPUs due to its balanced mix of SIMD and scalar instructions.
*/
/*
 * Keep USE_SSE4_FOR_32BIT undefined, even if it was defined externally (for
 * example on the compiler command line).
 * NOTE(review): presumably this macro selects the SSE4 code version over the
 * generic SSE2 one further down the file - its uses are not visible in this
 * excerpt, confirm there.
 */
#undef USE_SSE4_FOR_32BIT
/*
 * Pull in the SIMD intrinsics header matching the enabled instruction set:
 *   - SSE2:           <emmintrin.h>, plus <x86intrin.h> when XOP is enabled
 *   - SSE only:       <xmmintrin.h>
 *   - neither:        no intrinsics header is included
 */
#ifdef __SSE2__
/*
 * GCC before 4.9 would by default unnecessarily use store/load (without
 * SSE4.1) or (V)PEXTR (with SSE4.1 or AVX) instead of simply (V)MOV.
 * This was tracked as GCC bug 54349.
 * "-mtune=corei7" works around this, but is only supported for GCC 4.6+.
 * We use inline asm for pre-4.6 GCC, further down this file.
 */
/*
 * The condition below matches GCC 4.6, 4.7 and 4.8 only.  Clang and ICC are
 * excluded explicitly (presumably because they also define __GNUC__).
 * The pragma is placed before the intrinsics headers are included.
 */
#if __GNUC__ == 4 && __GNUC_MINOR__ >= 6 && __GNUC_MINOR__ < 9 && \
!defined(__clang__) && !defined(__ICC)
#pragma GCC target ("tune=corei7")
#endif
#include <emmintrin.h>
#ifdef __XOP__
/* Only needed when XOP is enabled. */
#include <x86intrin.h>
#endif
#elif defined(__SSE__)
/* SSE is available but SSE2 is not. */
#include <xmmintrin.h>
#endif
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "insecure_memzero.h"
#include "sha256.h"
#include "sysendian.h"
#define YESCRYPT_INTERNAL
#include "yescrypt.h"
( run in 0.994 second using v1.01-cache-2.11-cpan-39bf76dae61 )