check in v3.8.0 source

2023-08-31 00:49:24 -07:00
parent 3ef498f9e6
commit 316795abde
1218 changed files with 562506 additions and 0 deletions

crypto/rc4/rc4-elf-x86_64.S Normal file

@@ -0,0 +1,625 @@
#include "x86_arch.h"
.text
.hidden OPENSSL_ia32cap_P
.globl RC4
.type RC4,@function
.align 16
RC4:
endbr64
orq %rsi,%rsi
jne .Lentry
retq
.Lentry:
pushq %rbx
pushq %r12
pushq %r13
.Lprologue:
movq %rsi,%r11
movq %rdx,%r12
movq %rcx,%r13
xorq %r10,%r10
xorq %rcx,%rcx
leaq 8(%rdi),%rdi
movb -8(%rdi),%r10b
movb -4(%rdi),%cl
cmpl $-1,256(%rdi)
je .LRC4_CHAR
movl OPENSSL_ia32cap_P(%rip),%r8d
xorq %rbx,%rbx
incb %r10b
subq %r10,%rbx
subq %r12,%r13
movl (%rdi,%r10,4),%eax
testq $-16,%r11
jz .Lloop1
btl $IA32CAP_BIT0_INTEL,%r8d
jc .Lintel
andq $7,%rbx
leaq 1(%r10),%rsi
jz .Loop8
subq %rbx,%r11
.Loop8_warmup:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %rbx
jnz .Loop8_warmup
leaq 1(%r10),%rsi
jmp .Loop8
.align 16
.Loop8:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 0(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,0(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 4(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,4(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 8(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,8(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 12(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,12(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 16(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,16(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 20(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,20(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 24(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,24(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb $8,%sil
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl -4(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,28(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb $8,%r10b
rorq $8,%r8
subq $8,%r11
xorq (%r12),%r8
movq %r8,(%r13,%r12,1)
leaq 8(%r12),%r12
testq $-8,%r11
jnz .Loop8
cmpq $0,%r11
jne .Lloop1
jmp .Lexit
.align 16
.Lintel:
testq $-32,%r11
jz .Lloop1
andq $15,%rbx
jz .Loop16_is_hot
subq %rbx,%r11
.Loop16_warmup:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %rbx
jnz .Loop16_warmup
movq %rcx,%rbx
xorq %rcx,%rcx
movb %bl,%cl
.Loop16_is_hot:
leaq (%rdi,%r10,4),%rsi
addb %al,%cl
movl (%rdi,%rcx,4),%edx
pxor %xmm0,%xmm0
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 4(%rsi),%ebx
movzbl %al,%eax
movl %edx,0(%rsi)
addb %bl,%cl
pinsrw $0,(%rdi,%rax,4),%xmm0
jmp .Loop16_enter
.align 16
.Loop16:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
pxor %xmm0,%xmm2
psllq $8,%xmm1
pxor %xmm0,%xmm0
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 4(%rsi),%ebx
movzbl %al,%eax
movl %edx,0(%rsi)
pxor %xmm1,%xmm2
addb %bl,%cl
pinsrw $0,(%rdi,%rax,4),%xmm0
movdqu %xmm2,(%r13,%r12,1)
leaq 16(%r12),%r12
.Loop16_enter:
movl (%rdi,%rcx,4),%edx
pxor %xmm1,%xmm1
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 8(%rsi),%eax
movzbl %bl,%ebx
movl %edx,4(%rsi)
addb %al,%cl
pinsrw $0,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 12(%rsi),%ebx
movzbl %al,%eax
movl %edx,8(%rsi)
addb %bl,%cl
pinsrw $1,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 16(%rsi),%eax
movzbl %bl,%ebx
movl %edx,12(%rsi)
addb %al,%cl
pinsrw $1,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 20(%rsi),%ebx
movzbl %al,%eax
movl %edx,16(%rsi)
addb %bl,%cl
pinsrw $2,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 24(%rsi),%eax
movzbl %bl,%ebx
movl %edx,20(%rsi)
addb %al,%cl
pinsrw $2,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 28(%rsi),%ebx
movzbl %al,%eax
movl %edx,24(%rsi)
addb %bl,%cl
pinsrw $3,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 32(%rsi),%eax
movzbl %bl,%ebx
movl %edx,28(%rsi)
addb %al,%cl
pinsrw $3,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 36(%rsi),%ebx
movzbl %al,%eax
movl %edx,32(%rsi)
addb %bl,%cl
pinsrw $4,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 40(%rsi),%eax
movzbl %bl,%ebx
movl %edx,36(%rsi)
addb %al,%cl
pinsrw $4,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 44(%rsi),%ebx
movzbl %al,%eax
movl %edx,40(%rsi)
addb %bl,%cl
pinsrw $5,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 48(%rsi),%eax
movzbl %bl,%ebx
movl %edx,44(%rsi)
addb %al,%cl
pinsrw $5,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 52(%rsi),%ebx
movzbl %al,%eax
movl %edx,48(%rsi)
addb %bl,%cl
pinsrw $6,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 56(%rsi),%eax
movzbl %bl,%ebx
movl %edx,52(%rsi)
addb %al,%cl
pinsrw $6,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 60(%rsi),%ebx
movzbl %al,%eax
movl %edx,56(%rsi)
addb %bl,%cl
pinsrw $7,(%rdi,%rax,4),%xmm0
addb $16,%r10b
movdqu (%r12),%xmm2
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movzbl %bl,%ebx
movl %edx,60(%rsi)
leaq (%rdi,%r10,4),%rsi
pinsrw $7,(%rdi,%rbx,4),%xmm1
movl (%rsi),%eax
movq %rcx,%rbx
xorq %rcx,%rcx
subq $16,%r11
movb %bl,%cl
testq $-16,%r11
jnz .Loop16
psllq $8,%xmm1
pxor %xmm0,%xmm2
pxor %xmm1,%xmm2
movdqu %xmm2,(%r13,%r12,1)
leaq 16(%r12),%r12
cmpq $0,%r11
jne .Lloop1
jmp .Lexit
.align 16
.Lloop1:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %r11
jnz .Lloop1
jmp .Lexit
.align 16
.LRC4_CHAR:
addb $1,%r10b
movzbl (%rdi,%r10,1),%eax
testq $-8,%r11
jz .Lcloop1
jmp .Lcloop8
.align 16
.Lcloop8:
movl (%r12),%r8d
movl 4(%r12),%r9d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov0
movq %rax,%rbx
.Lcmov0:
addb %al,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov1
movq %rbx,%rax
.Lcmov1:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov2
movq %rax,%rbx
.Lcmov2:
addb %al,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov3
movq %rbx,%rax
.Lcmov3:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov4
movq %rax,%rbx
.Lcmov4:
addb %al,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov5
movq %rbx,%rax
.Lcmov5:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov6
movq %rax,%rbx
.Lcmov6:
addb %al,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov7
movq %rbx,%rax
.Lcmov7:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
leaq -8(%r11),%r11
movl %r8d,(%r13)
leaq 8(%r12),%r12
movl %r9d,4(%r13)
leaq 8(%r13),%r13
testq $-8,%r11
jnz .Lcloop8
cmpq $0,%r11
jne .Lcloop1
jmp .Lexit
.align 16
.Lcloop1:
addb %al,%cl
movzbl %cl,%ecx
movzbl (%rdi,%rcx,1),%edx
movb %al,(%rdi,%rcx,1)
movb %dl,(%rdi,%r10,1)
addb %al,%dl
addb $1,%r10b
movzbl %dl,%edx
movzbl %r10b,%r10d
movzbl (%rdi,%rdx,1),%edx
movzbl (%rdi,%r10,1),%eax
xorb (%r12),%dl
leaq 1(%r12),%r12
movb %dl,(%r13)
leaq 1(%r13),%r13
subq $1,%r11
jnz .Lcloop1
jmp .Lexit
.align 16
.Lexit:
subb $1,%r10b
movl %r10d,-8(%rdi)
movl %ecx,-4(%rdi)
movq (%rsp),%r13
movq 8(%rsp),%r12
movq 16(%rsp),%rbx
addq $24,%rsp
.Lepilogue:
retq
.size RC4,.-RC4
.globl RC4_set_key
.type RC4_set_key,@function
.align 16
RC4_set_key:
endbr64
leaq 8(%rdi),%rdi
leaq (%rdx,%rsi,1),%rdx
negq %rsi
movq %rsi,%rcx
xorl %eax,%eax
xorq %r9,%r9
xorq %r10,%r10
xorq %r11,%r11
movl OPENSSL_ia32cap_P(%rip),%r8d
btl $IA32CAP_BIT0_INTELP4,%r8d
jc .Lc1stloop
jmp .Lw1stloop
.align 16
.Lw1stloop:
movl %eax,(%rdi,%rax,4)
addb $1,%al
jnc .Lw1stloop
xorq %r9,%r9
xorq %r8,%r8
.align 16
.Lw2ndloop:
movl (%rdi,%r9,4),%r10d
addb (%rdx,%rsi,1),%r8b
addb %r10b,%r8b
addq $1,%rsi
movl (%rdi,%r8,4),%r11d
cmovzq %rcx,%rsi
movl %r10d,(%rdi,%r8,4)
movl %r11d,(%rdi,%r9,4)
addb $1,%r9b
jnc .Lw2ndloop
jmp .Lexit_key
.align 16
.Lc1stloop:
movb %al,(%rdi,%rax,1)
addb $1,%al
jnc .Lc1stloop
xorq %r9,%r9
xorq %r8,%r8
.align 16
.Lc2ndloop:
movb (%rdi,%r9,1),%r10b
addb (%rdx,%rsi,1),%r8b
addb %r10b,%r8b
addq $1,%rsi
movb (%rdi,%r8,1),%r11b
jnz .Lcnowrap
movq %rcx,%rsi
.Lcnowrap:
movb %r10b,(%rdi,%r8,1)
movb %r11b,(%rdi,%r9,1)
addb $1,%r9b
jnc .Lc2ndloop
movl $-1,256(%rdi)
.align 16
.Lexit_key:
xorl %eax,%eax
movl %eax,-8(%rdi)
movl %eax,-4(%rdi)
retq
.size RC4_set_key,.-RC4_set_key
.globl RC4_options
.type RC4_options,@function
.align 16
RC4_options:
endbr64
leaq .Lopts(%rip),%rax
movl OPENSSL_ia32cap_P(%rip),%edx
btl $IA32CAP_BIT0_INTELP4,%edx
jc .L8xchar
btl $IA32CAP_BIT0_INTEL,%edx
jnc .Ldone
addq $25,%rax
retq
.L8xchar:
addq $12,%rax
.Ldone:
retq
.section .rodata
.align 64
.Lopts:
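/* RC4_options returns .Lopts+0 = "rc4(8x,int)", +12 = "rc4(8x,char)", or +25 = "rc4(16x,int)" (NUL-terminated) */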
.byte 114,99,52,40,56,120,44,105,110,116,41,0
.byte 114,99,52,40,56,120,44,99,104,97,114,41,0
.byte 114,99,52,40,49,54,120,44,105,110,116,41,0
.align 64
.text
.size RC4_options,.-RC4_options
#if defined(HAVE_GNU_STACK)
.section .note.GNU-stack,"",%progbits
#endif


@@ -0,0 +1,622 @@
#include "x86_arch.h"
.text
.private_extern _OPENSSL_ia32cap_P
.globl _RC4
.p2align 4
_RC4:
orq %rsi,%rsi
jne L$entry
retq
L$entry:
pushq %rbx
pushq %r12
pushq %r13
L$prologue:
movq %rsi,%r11
movq %rdx,%r12
movq %rcx,%r13
xorq %r10,%r10
xorq %rcx,%rcx
leaq 8(%rdi),%rdi
movb -8(%rdi),%r10b
movb -4(%rdi),%cl
cmpl $-1,256(%rdi)
je L$RC4_CHAR
movl _OPENSSL_ia32cap_P(%rip),%r8d
xorq %rbx,%rbx
incb %r10b
subq %r10,%rbx
subq %r12,%r13
movl (%rdi,%r10,4),%eax
testq $-16,%r11
jz L$loop1
btl $IA32CAP_BIT0_INTEL,%r8d
jc L$intel
andq $7,%rbx
leaq 1(%r10),%rsi
jz L$oop8
subq %rbx,%r11
L$oop8_warmup:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %rbx
jnz L$oop8_warmup
leaq 1(%r10),%rsi
jmp L$oop8
.p2align 4
L$oop8:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 0(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,0(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 4(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,4(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 8(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,8(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 12(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,12(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 16(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,16(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 20(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,20(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 24(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,24(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb $8,%sil
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl -4(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,28(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb $8,%r10b
rorq $8,%r8
subq $8,%r11
xorq (%r12),%r8
movq %r8,(%r13,%r12,1)
leaq 8(%r12),%r12
testq $-8,%r11
jnz L$oop8
cmpq $0,%r11
jne L$loop1
jmp L$exit
.p2align 4
L$intel:
testq $-32,%r11
jz L$loop1
andq $15,%rbx
jz L$oop16_is_hot
subq %rbx,%r11
L$oop16_warmup:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %rbx
jnz L$oop16_warmup
movq %rcx,%rbx
xorq %rcx,%rcx
movb %bl,%cl
L$oop16_is_hot:
leaq (%rdi,%r10,4),%rsi
addb %al,%cl
movl (%rdi,%rcx,4),%edx
pxor %xmm0,%xmm0
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 4(%rsi),%ebx
movzbl %al,%eax
movl %edx,0(%rsi)
addb %bl,%cl
pinsrw $0,(%rdi,%rax,4),%xmm0
jmp L$oop16_enter
.p2align 4
L$oop16:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
pxor %xmm0,%xmm2
psllq $8,%xmm1
pxor %xmm0,%xmm0
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 4(%rsi),%ebx
movzbl %al,%eax
movl %edx,0(%rsi)
pxor %xmm1,%xmm2
addb %bl,%cl
pinsrw $0,(%rdi,%rax,4),%xmm0
movdqu %xmm2,(%r13,%r12,1)
leaq 16(%r12),%r12
L$oop16_enter:
movl (%rdi,%rcx,4),%edx
pxor %xmm1,%xmm1
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 8(%rsi),%eax
movzbl %bl,%ebx
movl %edx,4(%rsi)
addb %al,%cl
pinsrw $0,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 12(%rsi),%ebx
movzbl %al,%eax
movl %edx,8(%rsi)
addb %bl,%cl
pinsrw $1,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 16(%rsi),%eax
movzbl %bl,%ebx
movl %edx,12(%rsi)
addb %al,%cl
pinsrw $1,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 20(%rsi),%ebx
movzbl %al,%eax
movl %edx,16(%rsi)
addb %bl,%cl
pinsrw $2,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 24(%rsi),%eax
movzbl %bl,%ebx
movl %edx,20(%rsi)
addb %al,%cl
pinsrw $2,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 28(%rsi),%ebx
movzbl %al,%eax
movl %edx,24(%rsi)
addb %bl,%cl
pinsrw $3,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 32(%rsi),%eax
movzbl %bl,%ebx
movl %edx,28(%rsi)
addb %al,%cl
pinsrw $3,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 36(%rsi),%ebx
movzbl %al,%eax
movl %edx,32(%rsi)
addb %bl,%cl
pinsrw $4,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 40(%rsi),%eax
movzbl %bl,%ebx
movl %edx,36(%rsi)
addb %al,%cl
pinsrw $4,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 44(%rsi),%ebx
movzbl %al,%eax
movl %edx,40(%rsi)
addb %bl,%cl
pinsrw $5,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 48(%rsi),%eax
movzbl %bl,%ebx
movl %edx,44(%rsi)
addb %al,%cl
pinsrw $5,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 52(%rsi),%ebx
movzbl %al,%eax
movl %edx,48(%rsi)
addb %bl,%cl
pinsrw $6,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 56(%rsi),%eax
movzbl %bl,%ebx
movl %edx,52(%rsi)
addb %al,%cl
pinsrw $6,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 60(%rsi),%ebx
movzbl %al,%eax
movl %edx,56(%rsi)
addb %bl,%cl
pinsrw $7,(%rdi,%rax,4),%xmm0
addb $16,%r10b
movdqu (%r12),%xmm2
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movzbl %bl,%ebx
movl %edx,60(%rsi)
leaq (%rdi,%r10,4),%rsi
pinsrw $7,(%rdi,%rbx,4),%xmm1
movl (%rsi),%eax
movq %rcx,%rbx
xorq %rcx,%rcx
subq $16,%r11
movb %bl,%cl
testq $-16,%r11
jnz L$oop16
psllq $8,%xmm1
pxor %xmm0,%xmm2
pxor %xmm1,%xmm2
movdqu %xmm2,(%r13,%r12,1)
leaq 16(%r12),%r12
cmpq $0,%r11
jne L$loop1
jmp L$exit
.p2align 4
L$loop1:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %r11
jnz L$loop1
jmp L$exit
.p2align 4
L$RC4_CHAR:
addb $1,%r10b
movzbl (%rdi,%r10,1),%eax
testq $-8,%r11
jz L$cloop1
jmp L$cloop8
.p2align 4
L$cloop8:
movl (%r12),%r8d
movl 4(%r12),%r9d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne L$cmov0
movq %rax,%rbx
L$cmov0:
addb %al,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne L$cmov1
movq %rbx,%rax
L$cmov1:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne L$cmov2
movq %rax,%rbx
L$cmov2:
addb %al,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne L$cmov3
movq %rbx,%rax
L$cmov3:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne L$cmov4
movq %rax,%rbx
L$cmov4:
addb %al,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne L$cmov5
movq %rbx,%rax
L$cmov5:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne L$cmov6
movq %rax,%rbx
L$cmov6:
addb %al,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne L$cmov7
movq %rbx,%rax
L$cmov7:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
leaq -8(%r11),%r11
movl %r8d,(%r13)
leaq 8(%r12),%r12
movl %r9d,4(%r13)
leaq 8(%r13),%r13
testq $-8,%r11
jnz L$cloop8
cmpq $0,%r11
jne L$cloop1
jmp L$exit
.p2align 4
L$cloop1:
addb %al,%cl
movzbl %cl,%ecx
movzbl (%rdi,%rcx,1),%edx
movb %al,(%rdi,%rcx,1)
movb %dl,(%rdi,%r10,1)
addb %al,%dl
addb $1,%r10b
movzbl %dl,%edx
movzbl %r10b,%r10d
movzbl (%rdi,%rdx,1),%edx
movzbl (%rdi,%r10,1),%eax
xorb (%r12),%dl
leaq 1(%r12),%r12
movb %dl,(%r13)
leaq 1(%r13),%r13
subq $1,%r11
jnz L$cloop1
jmp L$exit
.p2align 4
L$exit:
subb $1,%r10b
movl %r10d,-8(%rdi)
movl %ecx,-4(%rdi)
movq (%rsp),%r13
movq 8(%rsp),%r12
movq 16(%rsp),%rbx
addq $24,%rsp
L$epilogue:
retq
.globl _RC4_set_key
.p2align 4
_RC4_set_key:
leaq 8(%rdi),%rdi
leaq (%rdx,%rsi,1),%rdx
negq %rsi
movq %rsi,%rcx
xorl %eax,%eax
xorq %r9,%r9
xorq %r10,%r10
xorq %r11,%r11
movl _OPENSSL_ia32cap_P(%rip),%r8d
btl $IA32CAP_BIT0_INTELP4,%r8d
jc L$c1stloop
jmp L$w1stloop
.p2align 4
L$w1stloop:
movl %eax,(%rdi,%rax,4)
addb $1,%al
jnc L$w1stloop
xorq %r9,%r9
xorq %r8,%r8
.p2align 4
L$w2ndloop:
movl (%rdi,%r9,4),%r10d
addb (%rdx,%rsi,1),%r8b
addb %r10b,%r8b
addq $1,%rsi
movl (%rdi,%r8,4),%r11d
cmovzq %rcx,%rsi
movl %r10d,(%rdi,%r8,4)
movl %r11d,(%rdi,%r9,4)
addb $1,%r9b
jnc L$w2ndloop
jmp L$exit_key
.p2align 4
L$c1stloop:
movb %al,(%rdi,%rax,1)
addb $1,%al
jnc L$c1stloop
xorq %r9,%r9
xorq %r8,%r8
.p2align 4
L$c2ndloop:
movb (%rdi,%r9,1),%r10b
addb (%rdx,%rsi,1),%r8b
addb %r10b,%r8b
addq $1,%rsi
movb (%rdi,%r8,1),%r11b
jnz L$cnowrap
movq %rcx,%rsi
L$cnowrap:
movb %r10b,(%rdi,%r8,1)
movb %r11b,(%rdi,%r9,1)
addb $1,%r9b
jnc L$c2ndloop
movl $-1,256(%rdi)
.p2align 4
L$exit_key:
xorl %eax,%eax
movl %eax,-8(%rdi)
movl %eax,-4(%rdi)
retq
.globl _RC4_options
.p2align 4
_RC4_options:
leaq L$opts(%rip),%rax
movl _OPENSSL_ia32cap_P(%rip),%edx
btl $IA32CAP_BIT0_INTELP4,%edx
jc L$8xchar
btl $IA32CAP_BIT0_INTEL,%edx
jnc L$done
addq $25,%rax
retq
L$8xchar:
addq $12,%rax
L$done:
retq
.section __DATA,__const
.p2align 6
L$opts:
.byte 114,99,52,40,56,120,44,105,110,116,41,0
.byte 114,99,52,40,56,120,44,99,104,97,114,41,0
.byte 114,99,52,40,49,54,120,44,105,110,116,41,0
.p2align 6
.text


@@ -0,0 +1,723 @@
; 1 "crypto/rc4/rc4-masm-x86_64.S.tmp"
; 1 "<built-in>" 1
; 1 "<built-in>" 3
; 399 "<built-in>" 3
; 1 "<command line>" 1
; 1 "<built-in>" 2
; 1 "crypto/rc4/rc4-masm-x86_64.S.tmp" 2
OPTION DOTNAME
; 1 "./crypto/x86_arch.h" 1
; 16 "./crypto/x86_arch.h"
; 40 "./crypto/x86_arch.h"
; 3 "crypto/rc4/rc4-masm-x86_64.S.tmp" 2
.text$ SEGMENT ALIGN(64) 'CODE'
EXTERN OPENSSL_ia32cap_P:NEAR
PUBLIC RC4
ALIGN 16
RC4 PROC PUBLIC
mov QWORD PTR[8+rsp],rdi ;WIN64 prologue
mov QWORD PTR[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_RC4::
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
mov rcx,r9
endbr64
or rsi,rsi
jne $L$entry
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
mov rsi,QWORD PTR[16+rsp]
DB 0F3h,0C3h ;repret
$L$entry::
push rbx
push r12
push r13
$L$prologue::
mov r11,rsi
mov r12,rdx
mov r13,rcx
xor r10,r10
xor rcx,rcx
lea rdi,QWORD PTR[8+rdi]
mov r10b,BYTE PTR[((-8))+rdi]
mov cl,BYTE PTR[((-4))+rdi]
cmp DWORD PTR[256+rdi],-1
je $L$RC4_CHAR
mov r8d,DWORD PTR[OPENSSL_ia32cap_P]
xor rbx,rbx
inc r10b
sub rbx,r10
sub r13,r12
mov eax,DWORD PTR[r10*4+rdi]
test r11,-16
jz $L$loop1
bt r8d,30
jc $L$intel
and rbx,7
lea rsi,QWORD PTR[1+r10]
jz $L$oop8
sub r11,rbx
$L$oop8_warmup::
add cl,al
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
mov DWORD PTR[r10*4+rdi],edx
add al,dl
inc r10b
mov edx,DWORD PTR[rax*4+rdi]
mov eax,DWORD PTR[r10*4+rdi]
xor dl,BYTE PTR[r12]
mov BYTE PTR[r12*1+r13],dl
lea r12,QWORD PTR[1+r12]
dec rbx
jnz $L$oop8_warmup
lea rsi,QWORD PTR[1+r10]
jmp $L$oop8
ALIGN 16
$L$oop8::
add cl,al
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
mov ebx,DWORD PTR[rsi*4+rdi]
ror r8,8
mov DWORD PTR[r10*4+rdi],edx
add dl,al
mov r8b,BYTE PTR[rdx*4+rdi]
add cl,bl
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],ebx
mov eax,DWORD PTR[4+rsi*4+rdi]
ror r8,8
mov DWORD PTR[4+r10*4+rdi],edx
add dl,bl
mov r8b,BYTE PTR[rdx*4+rdi]
add cl,al
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
mov ebx,DWORD PTR[8+rsi*4+rdi]
ror r8,8
mov DWORD PTR[8+r10*4+rdi],edx
add dl,al
mov r8b,BYTE PTR[rdx*4+rdi]
add cl,bl
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],ebx
mov eax,DWORD PTR[12+rsi*4+rdi]
ror r8,8
mov DWORD PTR[12+r10*4+rdi],edx
add dl,bl
mov r8b,BYTE PTR[rdx*4+rdi]
add cl,al
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
mov ebx,DWORD PTR[16+rsi*4+rdi]
ror r8,8
mov DWORD PTR[16+r10*4+rdi],edx
add dl,al
mov r8b,BYTE PTR[rdx*4+rdi]
add cl,bl
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],ebx
mov eax,DWORD PTR[20+rsi*4+rdi]
ror r8,8
mov DWORD PTR[20+r10*4+rdi],edx
add dl,bl
mov r8b,BYTE PTR[rdx*4+rdi]
add cl,al
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
mov ebx,DWORD PTR[24+rsi*4+rdi]
ror r8,8
mov DWORD PTR[24+r10*4+rdi],edx
add dl,al
mov r8b,BYTE PTR[rdx*4+rdi]
add sil,8
add cl,bl
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],ebx
mov eax,DWORD PTR[((-4))+rsi*4+rdi]
ror r8,8
mov DWORD PTR[28+r10*4+rdi],edx
add dl,bl
mov r8b,BYTE PTR[rdx*4+rdi]
add r10b,8
ror r8,8
sub r11,8
xor r8,QWORD PTR[r12]
mov QWORD PTR[r12*1+r13],r8
lea r12,QWORD PTR[8+r12]
test r11,-8
jnz $L$oop8
cmp r11,0
jne $L$loop1
jmp $L$exit
ALIGN 16
$L$intel::
test r11,-32
jz $L$loop1
and rbx,15
jz $L$oop16_is_hot
sub r11,rbx
$L$oop16_warmup::
add cl,al
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
mov DWORD PTR[r10*4+rdi],edx
add al,dl
inc r10b
mov edx,DWORD PTR[rax*4+rdi]
mov eax,DWORD PTR[r10*4+rdi]
xor dl,BYTE PTR[r12]
mov BYTE PTR[r12*1+r13],dl
lea r12,QWORD PTR[1+r12]
dec rbx
jnz $L$oop16_warmup
mov rbx,rcx
xor rcx,rcx
mov cl,bl
$L$oop16_is_hot::
lea rsi,QWORD PTR[r10*4+rdi]
add cl,al
mov edx,DWORD PTR[rcx*4+rdi]
pxor xmm0,xmm0
mov DWORD PTR[rcx*4+rdi],eax
add al,dl
mov ebx,DWORD PTR[4+rsi]
movzx eax,al
mov DWORD PTR[rsi],edx
add cl,bl
pinsrw xmm0,WORD PTR[rax*4+rdi],0
jmp $L$oop16_enter
ALIGN 16
$L$oop16::
add cl,al
mov edx,DWORD PTR[rcx*4+rdi]
pxor xmm2,xmm0
psllq xmm1,8
pxor xmm0,xmm0
mov DWORD PTR[rcx*4+rdi],eax
add al,dl
mov ebx,DWORD PTR[4+rsi]
movzx eax,al
mov DWORD PTR[rsi],edx
pxor xmm2,xmm1
add cl,bl
pinsrw xmm0,WORD PTR[rax*4+rdi],0
movdqu XMMWORD PTR[r12*1+r13],xmm2
lea r12,QWORD PTR[16+r12]
$L$oop16_enter::
mov edx,DWORD PTR[rcx*4+rdi]
pxor xmm1,xmm1
mov DWORD PTR[rcx*4+rdi],ebx
add bl,dl
mov eax,DWORD PTR[8+rsi]
movzx ebx,bl
mov DWORD PTR[4+rsi],edx
add cl,al
pinsrw xmm1,WORD PTR[rbx*4+rdi],0
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
add al,dl
mov ebx,DWORD PTR[12+rsi]
movzx eax,al
mov DWORD PTR[8+rsi],edx
add cl,bl
pinsrw xmm0,WORD PTR[rax*4+rdi],1
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],ebx
add bl,dl
mov eax,DWORD PTR[16+rsi]
movzx ebx,bl
mov DWORD PTR[12+rsi],edx
add cl,al
pinsrw xmm1,WORD PTR[rbx*4+rdi],1
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
add al,dl
mov ebx,DWORD PTR[20+rsi]
movzx eax,al
mov DWORD PTR[16+rsi],edx
add cl,bl
pinsrw xmm0,WORD PTR[rax*4+rdi],2
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],ebx
add bl,dl
mov eax,DWORD PTR[24+rsi]
movzx ebx,bl
mov DWORD PTR[20+rsi],edx
add cl,al
pinsrw xmm1,WORD PTR[rbx*4+rdi],2
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
add al,dl
mov ebx,DWORD PTR[28+rsi]
movzx eax,al
mov DWORD PTR[24+rsi],edx
add cl,bl
pinsrw xmm0,WORD PTR[rax*4+rdi],3
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],ebx
add bl,dl
mov eax,DWORD PTR[32+rsi]
movzx ebx,bl
mov DWORD PTR[28+rsi],edx
add cl,al
pinsrw xmm1,WORD PTR[rbx*4+rdi],3
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
add al,dl
mov ebx,DWORD PTR[36+rsi]
movzx eax,al
mov DWORD PTR[32+rsi],edx
add cl,bl
pinsrw xmm0,WORD PTR[rax*4+rdi],4
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],ebx
add bl,dl
mov eax,DWORD PTR[40+rsi]
movzx ebx,bl
mov DWORD PTR[36+rsi],edx
add cl,al
pinsrw xmm1,WORD PTR[rbx*4+rdi],4
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
add al,dl
mov ebx,DWORD PTR[44+rsi]
movzx eax,al
mov DWORD PTR[40+rsi],edx
add cl,bl
pinsrw xmm0,WORD PTR[rax*4+rdi],5
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],ebx
add bl,dl
mov eax,DWORD PTR[48+rsi]
movzx ebx,bl
mov DWORD PTR[44+rsi],edx
add cl,al
pinsrw xmm1,WORD PTR[rbx*4+rdi],5
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
add al,dl
mov ebx,DWORD PTR[52+rsi]
movzx eax,al
mov DWORD PTR[48+rsi],edx
add cl,bl
pinsrw xmm0,WORD PTR[rax*4+rdi],6
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],ebx
add bl,dl
mov eax,DWORD PTR[56+rsi]
movzx ebx,bl
mov DWORD PTR[52+rsi],edx
add cl,al
pinsrw xmm1,WORD PTR[rbx*4+rdi],6
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
add al,dl
mov ebx,DWORD PTR[60+rsi]
movzx eax,al
mov DWORD PTR[56+rsi],edx
add cl,bl
pinsrw xmm0,WORD PTR[rax*4+rdi],7
add r10b,16
movdqu xmm2,XMMWORD PTR[r12]
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],ebx
add bl,dl
movzx ebx,bl
mov DWORD PTR[60+rsi],edx
lea rsi,QWORD PTR[r10*4+rdi]
pinsrw xmm1,WORD PTR[rbx*4+rdi],7
mov eax,DWORD PTR[rsi]
mov rbx,rcx
xor rcx,rcx
sub r11,16
mov cl,bl
test r11,-16
jnz $L$oop16
psllq xmm1,8
pxor xmm2,xmm0
pxor xmm2,xmm1
movdqu XMMWORD PTR[r12*1+r13],xmm2
lea r12,QWORD PTR[16+r12]
cmp r11,0
jne $L$loop1
jmp $L$exit
ALIGN 16
$L$loop1::
add cl,al
mov edx,DWORD PTR[rcx*4+rdi]
mov DWORD PTR[rcx*4+rdi],eax
mov DWORD PTR[r10*4+rdi],edx
add al,dl
inc r10b
mov edx,DWORD PTR[rax*4+rdi]
mov eax,DWORD PTR[r10*4+rdi]
xor dl,BYTE PTR[r12]
mov BYTE PTR[r12*1+r13],dl
lea r12,QWORD PTR[1+r12]
dec r11
jnz $L$loop1
jmp $L$exit
ALIGN 16
$L$RC4_CHAR::
add r10b,1
movzx eax,BYTE PTR[r10*1+rdi]
test r11,-8
jz $L$cloop1
jmp $L$cloop8
ALIGN 16
$L$cloop8::
mov r8d,DWORD PTR[r12]
mov r9d,DWORD PTR[4+r12]
add cl,al
lea rsi,QWORD PTR[1+r10]
movzx edx,BYTE PTR[rcx*1+rdi]
movzx esi,sil
movzx ebx,BYTE PTR[rsi*1+rdi]
mov BYTE PTR[rcx*1+rdi],al
cmp rcx,rsi
mov BYTE PTR[r10*1+rdi],dl
jne $L$cmov0
mov rbx,rax
$L$cmov0::
add dl,al
xor r8b,BYTE PTR[rdx*1+rdi]
ror r8d,8
add cl,bl
lea r10,QWORD PTR[1+rsi]
movzx edx,BYTE PTR[rcx*1+rdi]
movzx r10d,r10b
movzx eax,BYTE PTR[r10*1+rdi]
mov BYTE PTR[rcx*1+rdi],bl
cmp rcx,r10
mov BYTE PTR[rsi*1+rdi],dl
jne $L$cmov1
mov rax,rbx
$L$cmov1::
add dl,bl
xor r8b,BYTE PTR[rdx*1+rdi]
ror r8d,8
add cl,al
lea rsi,QWORD PTR[1+r10]
movzx edx,BYTE PTR[rcx*1+rdi]
movzx esi,sil
movzx ebx,BYTE PTR[rsi*1+rdi]
mov BYTE PTR[rcx*1+rdi],al
cmp rcx,rsi
mov BYTE PTR[r10*1+rdi],dl
jne $L$cmov2
mov rbx,rax
$L$cmov2::
add dl,al
xor r8b,BYTE PTR[rdx*1+rdi]
ror r8d,8
add cl,bl
lea r10,QWORD PTR[1+rsi]
movzx edx,BYTE PTR[rcx*1+rdi]
movzx r10d,r10b
movzx eax,BYTE PTR[r10*1+rdi]
mov BYTE PTR[rcx*1+rdi],bl
cmp rcx,r10
mov BYTE PTR[rsi*1+rdi],dl
jne $L$cmov3
mov rax,rbx
$L$cmov3::
add dl,bl
xor r8b,BYTE PTR[rdx*1+rdi]
ror r8d,8
add cl,al
lea rsi,QWORD PTR[1+r10]
movzx edx,BYTE PTR[rcx*1+rdi]
movzx esi,sil
movzx ebx,BYTE PTR[rsi*1+rdi]
mov BYTE PTR[rcx*1+rdi],al
cmp rcx,rsi
mov BYTE PTR[r10*1+rdi],dl
jne $L$cmov4
mov rbx,rax
$L$cmov4::
add dl,al
xor r9b,BYTE PTR[rdx*1+rdi]
ror r9d,8
add cl,bl
lea r10,QWORD PTR[1+rsi]
movzx edx,BYTE PTR[rcx*1+rdi]
movzx r10d,r10b
movzx eax,BYTE PTR[r10*1+rdi]
mov BYTE PTR[rcx*1+rdi],bl
cmp rcx,r10
mov BYTE PTR[rsi*1+rdi],dl
jne $L$cmov5
mov rax,rbx
$L$cmov5::
add dl,bl
xor r9b,BYTE PTR[rdx*1+rdi]
ror r9d,8
add cl,al
lea rsi,QWORD PTR[1+r10]
movzx edx,BYTE PTR[rcx*1+rdi]
movzx esi,sil
movzx ebx,BYTE PTR[rsi*1+rdi]
mov BYTE PTR[rcx*1+rdi],al
cmp rcx,rsi
mov BYTE PTR[r10*1+rdi],dl
jne $L$cmov6
mov rbx,rax
$L$cmov6::
add dl,al
xor r9b,BYTE PTR[rdx*1+rdi]
ror r9d,8
add cl,bl
lea r10,QWORD PTR[1+rsi]
movzx edx,BYTE PTR[rcx*1+rdi]
movzx r10d,r10b
movzx eax,BYTE PTR[r10*1+rdi]
mov BYTE PTR[rcx*1+rdi],bl
cmp rcx,r10
mov BYTE PTR[rsi*1+rdi],dl
jne $L$cmov7
mov rax,rbx
$L$cmov7::
add dl,bl
xor r9b,BYTE PTR[rdx*1+rdi]
ror r9d,8
lea r11,QWORD PTR[((-8))+r11]
mov DWORD PTR[r13],r8d
lea r12,QWORD PTR[8+r12]
mov DWORD PTR[4+r13],r9d
lea r13,QWORD PTR[8+r13]
test r11,-8
jnz $L$cloop8
cmp r11,0
jne $L$cloop1
jmp $L$exit
ALIGN 16
$L$cloop1::
add cl,al
movzx ecx,cl
movzx edx,BYTE PTR[rcx*1+rdi]
mov BYTE PTR[rcx*1+rdi],al
mov BYTE PTR[r10*1+rdi],dl
add dl,al
add r10b,1
movzx edx,dl
movzx r10d,r10b
movzx edx,BYTE PTR[rdx*1+rdi]
movzx eax,BYTE PTR[r10*1+rdi]
xor dl,BYTE PTR[r12]
lea r12,QWORD PTR[1+r12]
mov BYTE PTR[r13],dl
lea r13,QWORD PTR[1+r13]
sub r11,1
jnz $L$cloop1
jmp $L$exit
ALIGN 16
$L$exit::
sub r10b,1
mov DWORD PTR[((-8))+rdi],r10d
mov DWORD PTR[((-4))+rdi],ecx
mov r13,QWORD PTR[rsp]
mov r12,QWORD PTR[8+rsp]
mov rbx,QWORD PTR[16+rsp]
add rsp,24
$L$epilogue::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
mov rsi,QWORD PTR[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_RC4::
RC4 ENDP
PUBLIC RC4_set_key
ALIGN 16
RC4_set_key PROC PUBLIC
mov QWORD PTR[8+rsp],rdi ;WIN64 prologue
mov QWORD PTR[16+rsp],rsi
mov rax,rsp
$L$SEH_begin_RC4_set_key::
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
endbr64
lea rdi,QWORD PTR[8+rdi]
lea rdx,QWORD PTR[rsi*1+rdx]
neg rsi
mov rcx,rsi
xor eax,eax
xor r9,r9
xor r10,r10
xor r11,r11
mov r8d,DWORD PTR[OPENSSL_ia32cap_P]
bt r8d,20
jc $L$c1stloop
jmp $L$w1stloop
ALIGN 16
$L$w1stloop::
mov DWORD PTR[rax*4+rdi],eax
add al,1
jnc $L$w1stloop
xor r9,r9
xor r8,r8
ALIGN 16
$L$w2ndloop::
mov r10d,DWORD PTR[r9*4+rdi]
add r8b,BYTE PTR[rsi*1+rdx]
add r8b,r10b
add rsi,1
mov r11d,DWORD PTR[r8*4+rdi]
cmovz rsi,rcx
mov DWORD PTR[r8*4+rdi],r10d
mov DWORD PTR[r9*4+rdi],r11d
add r9b,1
jnc $L$w2ndloop
jmp $L$exit_key
ALIGN 16
$L$c1stloop::
mov BYTE PTR[rax*1+rdi],al
add al,1
jnc $L$c1stloop
xor r9,r9
xor r8,r8
ALIGN 16
$L$c2ndloop::
mov r10b,BYTE PTR[r9*1+rdi]
add r8b,BYTE PTR[rsi*1+rdx]
add r8b,r10b
add rsi,1
mov r11b,BYTE PTR[r8*1+rdi]
jnz $L$cnowrap
mov rsi,rcx
$L$cnowrap::
mov BYTE PTR[r8*1+rdi],r10b
mov BYTE PTR[r9*1+rdi],r11b
add r9b,1
jnc $L$c2ndloop
mov DWORD PTR[256+rdi],-1
ALIGN 16
$L$exit_key::
xor eax,eax
mov DWORD PTR[((-8))+rdi],eax
mov DWORD PTR[((-4))+rdi],eax
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
mov rsi,QWORD PTR[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_RC4_set_key::
RC4_set_key ENDP
PUBLIC RC4_options
ALIGN 16
RC4_options PROC PUBLIC
endbr64
lea rax,QWORD PTR[$L$opts]
mov edx,DWORD PTR[OPENSSL_ia32cap_P]
bt edx,20
jc $L$8xchar
bt edx,30
jnc $L$done
add rax,25
DB 0F3h,0C3h ;repret
$L$8xchar::
add rax,12
$L$done::
DB 0F3h,0C3h ;repret
.text$ ENDS
.rdata SEGMENT READONLY ALIGN(8)
ALIGN 64
$L$opts::
DB 114,99,52,40,56,120,44,105,110,116,41,0
DB 114,99,52,40,56,120,44,99,104,97,114,41,0
DB 114,99,52,40,49,54,120,44,105,110,116,41,0
ALIGN 64
.rdata ENDS
.text$ SEGMENT ALIGN(64) 'CODE'
RC4_options ENDP
.text$ ENDS
END

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,645 @@
#include "x86_arch.h"
.text
.globl RC4
.def RC4; .scl 2; .type 32; .endef
.p2align 4
RC4:
movq %rdi,8(%rsp)
movq %rsi,16(%rsp)
movq %rsp,%rax
.LSEH_begin_RC4:
movq %rcx,%rdi
movq %rdx,%rsi
movq %r8,%rdx
movq %r9,%rcx
endbr64
orq %rsi,%rsi
jne .Lentry
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
retq
.Lentry:
pushq %rbx
pushq %r12
pushq %r13
.Lprologue:
movq %rsi,%r11
movq %rdx,%r12
movq %rcx,%r13
xorq %r10,%r10
xorq %rcx,%rcx
leaq 8(%rdi),%rdi
movb -8(%rdi),%r10b
movb -4(%rdi),%cl
cmpl $-1,256(%rdi)
je .LRC4_CHAR
movl OPENSSL_ia32cap_P(%rip),%r8d
xorq %rbx,%rbx
incb %r10b
subq %r10,%rbx
subq %r12,%r13
movl (%rdi,%r10,4),%eax
testq $-16,%r11
jz .Lloop1
btl $IA32CAP_BIT0_INTEL,%r8d
jc .Lintel
andq $7,%rbx
leaq 1(%r10),%rsi
jz .Loop8
subq %rbx,%r11
.Loop8_warmup:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %rbx
jnz .Loop8_warmup
leaq 1(%r10),%rsi
jmp .Loop8
.p2align 4
.Loop8:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 0(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,0(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 4(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,4(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 8(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,8(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 12(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,12(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 16(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,16(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 20(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,20(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 24(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,24(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb $8,%sil
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl -4(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,28(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb $8,%r10b
rorq $8,%r8
subq $8,%r11
xorq (%r12),%r8
movq %r8,(%r13,%r12,1)
leaq 8(%r12),%r12
testq $-8,%r11
jnz .Loop8
cmpq $0,%r11
jne .Lloop1
jmp .Lexit
.p2align 4
.Lintel:
testq $-32,%r11
jz .Lloop1
andq $15,%rbx
jz .Loop16_is_hot
subq %rbx,%r11
.Loop16_warmup:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %rbx
jnz .Loop16_warmup
movq %rcx,%rbx
xorq %rcx,%rcx
movb %bl,%cl
.Loop16_is_hot:
leaq (%rdi,%r10,4),%rsi
addb %al,%cl
movl (%rdi,%rcx,4),%edx
pxor %xmm0,%xmm0
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 4(%rsi),%ebx
movzbl %al,%eax
movl %edx,0(%rsi)
addb %bl,%cl
pinsrw $0,(%rdi,%rax,4),%xmm0
jmp .Loop16_enter
.p2align 4
.Loop16:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
pxor %xmm0,%xmm2
psllq $8,%xmm1
pxor %xmm0,%xmm0
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 4(%rsi),%ebx
movzbl %al,%eax
movl %edx,0(%rsi)
pxor %xmm1,%xmm2
addb %bl,%cl
pinsrw $0,(%rdi,%rax,4),%xmm0
movdqu %xmm2,(%r13,%r12,1)
leaq 16(%r12),%r12
.Loop16_enter:
movl (%rdi,%rcx,4),%edx
pxor %xmm1,%xmm1
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 8(%rsi),%eax
movzbl %bl,%ebx
movl %edx,4(%rsi)
addb %al,%cl
pinsrw $0,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 12(%rsi),%ebx
movzbl %al,%eax
movl %edx,8(%rsi)
addb %bl,%cl
pinsrw $1,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 16(%rsi),%eax
movzbl %bl,%ebx
movl %edx,12(%rsi)
addb %al,%cl
pinsrw $1,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 20(%rsi),%ebx
movzbl %al,%eax
movl %edx,16(%rsi)
addb %bl,%cl
pinsrw $2,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 24(%rsi),%eax
movzbl %bl,%ebx
movl %edx,20(%rsi)
addb %al,%cl
pinsrw $2,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 28(%rsi),%ebx
movzbl %al,%eax
movl %edx,24(%rsi)
addb %bl,%cl
pinsrw $3,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 32(%rsi),%eax
movzbl %bl,%ebx
movl %edx,28(%rsi)
addb %al,%cl
pinsrw $3,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 36(%rsi),%ebx
movzbl %al,%eax
movl %edx,32(%rsi)
addb %bl,%cl
pinsrw $4,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 40(%rsi),%eax
movzbl %bl,%ebx
movl %edx,36(%rsi)
addb %al,%cl
pinsrw $4,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 44(%rsi),%ebx
movzbl %al,%eax
movl %edx,40(%rsi)
addb %bl,%cl
pinsrw $5,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 48(%rsi),%eax
movzbl %bl,%ebx
movl %edx,44(%rsi)
addb %al,%cl
pinsrw $5,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 52(%rsi),%ebx
movzbl %al,%eax
movl %edx,48(%rsi)
addb %bl,%cl
pinsrw $6,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 56(%rsi),%eax
movzbl %bl,%ebx
movl %edx,52(%rsi)
addb %al,%cl
pinsrw $6,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 60(%rsi),%ebx
movzbl %al,%eax
movl %edx,56(%rsi)
addb %bl,%cl
pinsrw $7,(%rdi,%rax,4),%xmm0
addb $16,%r10b
movdqu (%r12),%xmm2
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movzbl %bl,%ebx
movl %edx,60(%rsi)
leaq (%rdi,%r10,4),%rsi
pinsrw $7,(%rdi,%rbx,4),%xmm1
movl (%rsi),%eax
movq %rcx,%rbx
xorq %rcx,%rcx
subq $16,%r11
movb %bl,%cl
testq $-16,%r11
jnz .Loop16
psllq $8,%xmm1
pxor %xmm0,%xmm2
pxor %xmm1,%xmm2
movdqu %xmm2,(%r13,%r12,1)
leaq 16(%r12),%r12
cmpq $0,%r11
jne .Lloop1
jmp .Lexit
.p2align 4
.Lloop1:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %r11
jnz .Lloop1
jmp .Lexit
.p2align 4
.LRC4_CHAR:
addb $1,%r10b
movzbl (%rdi,%r10,1),%eax
testq $-8,%r11
jz .Lcloop1
jmp .Lcloop8
.p2align 4
.Lcloop8:
movl (%r12),%r8d
movl 4(%r12),%r9d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov0
movq %rax,%rbx
.Lcmov0:
addb %al,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov1
movq %rbx,%rax
.Lcmov1:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov2
movq %rax,%rbx
.Lcmov2:
addb %al,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov3
movq %rbx,%rax
.Lcmov3:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov4
movq %rax,%rbx
.Lcmov4:
addb %al,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov5
movq %rbx,%rax
.Lcmov5:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov6
movq %rax,%rbx
.Lcmov6:
addb %al,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov7
movq %rbx,%rax
.Lcmov7:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
leaq -8(%r11),%r11
movl %r8d,(%r13)
leaq 8(%r12),%r12
movl %r9d,4(%r13)
leaq 8(%r13),%r13
testq $-8,%r11
jnz .Lcloop8
cmpq $0,%r11
jne .Lcloop1
jmp .Lexit
.p2align 4
.Lcloop1:
addb %al,%cl
movzbl %cl,%ecx
movzbl (%rdi,%rcx,1),%edx
movb %al,(%rdi,%rcx,1)
movb %dl,(%rdi,%r10,1)
addb %al,%dl
addb $1,%r10b
movzbl %dl,%edx
movzbl %r10b,%r10d
movzbl (%rdi,%rdx,1),%edx
movzbl (%rdi,%r10,1),%eax
xorb (%r12),%dl
leaq 1(%r12),%r12
movb %dl,(%r13)
leaq 1(%r13),%r13
subq $1,%r11
jnz .Lcloop1
jmp .Lexit
.p2align 4
.Lexit:
subb $1,%r10b
movl %r10d,-8(%rdi)
movl %ecx,-4(%rdi)
movq (%rsp),%r13
movq 8(%rsp),%r12
movq 16(%rsp),%rbx
addq $24,%rsp
.Lepilogue:
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
retq
.LSEH_end_RC4:
.globl RC4_set_key
.def RC4_set_key; .scl 2; .type 32; .endef
.p2align 4
RC4_set_key:
movq %rdi,8(%rsp)
movq %rsi,16(%rsp)
movq %rsp,%rax
.LSEH_begin_RC4_set_key:
movq %rcx,%rdi
movq %rdx,%rsi
movq %r8,%rdx
endbr64
leaq 8(%rdi),%rdi
leaq (%rdx,%rsi,1),%rdx
negq %rsi
movq %rsi,%rcx
xorl %eax,%eax
xorq %r9,%r9
xorq %r10,%r10
xorq %r11,%r11
movl OPENSSL_ia32cap_P(%rip),%r8d
btl $IA32CAP_BIT0_INTELP4,%r8d
jc .Lc1stloop
jmp .Lw1stloop
.p2align 4
.Lw1stloop:
movl %eax,(%rdi,%rax,4)
addb $1,%al
jnc .Lw1stloop
xorq %r9,%r9
xorq %r8,%r8
.p2align 4
.Lw2ndloop:
movl (%rdi,%r9,4),%r10d
addb (%rdx,%rsi,1),%r8b
addb %r10b,%r8b
addq $1,%rsi
movl (%rdi,%r8,4),%r11d
cmovzq %rcx,%rsi
movl %r10d,(%rdi,%r8,4)
movl %r11d,(%rdi,%r9,4)
addb $1,%r9b
jnc .Lw2ndloop
jmp .Lexit_key
.p2align 4
.Lc1stloop:
movb %al,(%rdi,%rax,1)
addb $1,%al
jnc .Lc1stloop
xorq %r9,%r9
xorq %r8,%r8
.p2align 4
.Lc2ndloop:
movb (%rdi,%r9,1),%r10b
addb (%rdx,%rsi,1),%r8b
addb %r10b,%r8b
addq $1,%rsi
movb (%rdi,%r8,1),%r11b
jnz .Lcnowrap
movq %rcx,%rsi
.Lcnowrap:
movb %r10b,(%rdi,%r8,1)
movb %r11b,(%rdi,%r9,1)
addb $1,%r9b
jnc .Lc2ndloop
movl $-1,256(%rdi)
.p2align 4
.Lexit_key:
xorl %eax,%eax
movl %eax,-8(%rdi)
movl %eax,-4(%rdi)
movq 8(%rsp),%rdi
movq 16(%rsp),%rsi
retq
.LSEH_end_RC4_set_key:
.globl RC4_options
.def RC4_options; .scl 2; .type 32; .endef
.p2align 4
RC4_options:
endbr64
leaq .Lopts(%rip),%rax
movl OPENSSL_ia32cap_P(%rip),%edx
btl $IA32CAP_BIT0_INTELP4,%edx
jc .L8xchar
btl $IA32CAP_BIT0_INTEL,%edx
jnc .Ldone
addq $25,%rax
retq
.L8xchar:
addq $12,%rax
.Ldone:
retq
.section .rodata
.p2align 6
.Lopts:
.byte 114,99,52,40,56,120,44,105,110,116,41,0
.byte 114,99,52,40,56,120,44,99,104,97,114,41,0
.byte 114,99,52,40,49,54,120,44,105,110,116,41,0
.p2align 6
.text

crypto/rc4/rc4_enc.c Normal file

@@ -0,0 +1,254 @@
/* $OpenBSD: rc4_enc.c,v 1.18 2022/11/26 16:08:54 tb Exp $ */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <endian.h>
#include <openssl/rc4.h>
#include "rc4_local.h"
/* RC4 as implemented from a posting from
* Newsgroups: sci.crypt
* From: sterndark@netcom.com (David Sterndark)
* Subject: RC4 Algorithm revealed.
* Message-ID: <sternCvKL4B.Hyy@netcom.com>
* Date: Wed, 14 Sep 1994 06:35:31 GMT
*/
void
RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
unsigned char *outdata)
{
RC4_INT *d;
RC4_INT x, y, tx, ty;
size_t i;
x = key->x;
y = key->y;
d = key->data;
#if defined(RC4_CHUNK)
/*
* The original reason for implementing this(*) was the fact that
* pre-21164a Alpha CPUs don't have byte load/store instructions
* and e.g. a byte store has to be done with 64-bit load, shift,
and, or and finally 64-bit store. Peeking at data and operating
* at natural word size made it possible to reduce amount of
* instructions as well as to perform early read-ahead without
* suffering from RAW (read-after-write) hazard. This resulted
* in ~40%(**) performance improvement on 21064 box with gcc.
* But it's not only Alpha users who win here:-) Thanks to the
* early-n-wide read-ahead this implementation also exhibits
* >40% speed-up on SPARC and 20-30% on 64-bit MIPS (depending
* on sizeof(RC4_INT)).
*
* (*) "this" means code which recognizes the case when input
* and output pointers appear to be aligned at natural CPU
* word boundary
* (**) i.e. according to 'apps/openssl speed rc4' benchmark,
* crypto/rc4/rc4speed.c exhibits almost 70% speed-up...
*
* Caveats.
*
* - RC4_CHUNK="unsigned long long" should be a #1 choice for
* UltraSPARC. Unfortunately gcc generates very slow code
* (2.5-3 times slower than one generated by Sun's WorkShop
* C) and therefore gcc (at least 2.95 and earlier) should
* always be told that RC4_CHUNK="unsigned long".
*
* <appro@fy.chalmers.se>
*/
# define RC4_STEP ( \
x=(x+1) &0xff, \
tx=d[x], \
y=(tx+y)&0xff, \
ty=d[y], \
d[y]=tx, \
d[x]=ty, \
(RC4_CHUNK)d[(tx+ty)&0xff]\
)
if ((((size_t)indata & (sizeof(RC4_CHUNK) - 1)) |
((size_t)outdata & (sizeof(RC4_CHUNK) - 1))) == 0) {
RC4_CHUNK ichunk, otp;
/*
* I reckon we can afford to implement both endian
* cases and to decide which way to take at run-time
* because the machine code appears to be very compact
* and redundant 1-2KB is perfectly tolerable (i.e.
* in case the compiler fails to eliminate it:-). By
* suggestion from Terrel Larson <terr@terralogic.net>.
*
* Special notes.
*
* - compilers (those I've tried) don't seem to have
* problems eliminating either the operators guarded
* by "if (sizeof(RC4_CHUNK)==8)" or the condition
* expressions themselves so I've got 'em to replace
* corresponding #ifdefs from the previous version;
* - I chose to let the redundant switch cases when
* sizeof(RC4_CHUNK)!=8 be (were also #ifdefed
* before);
* - in case you wonder "&(sizeof(RC4_CHUNK)*8-1)" in
* [LB]ESHFT guards against "shift is out of range"
* warnings when sizeof(RC4_CHUNK)!=8
*
* <appro@fy.chalmers.se>
*/
#if BYTE_ORDER == BIG_ENDIAN
# define BESHFT(c) (((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1))
for (; len & (0 - sizeof(RC4_CHUNK)); len -= sizeof(RC4_CHUNK)) {
ichunk = *(RC4_CHUNK *)indata;
otp = RC4_STEP << BESHFT(0);
otp |= RC4_STEP << BESHFT(1);
otp |= RC4_STEP << BESHFT(2);
otp |= RC4_STEP << BESHFT(3);
if (sizeof(RC4_CHUNK) == 8) {
otp |= RC4_STEP << BESHFT(4);
otp |= RC4_STEP << BESHFT(5);
otp |= RC4_STEP << BESHFT(6);
otp |= RC4_STEP << BESHFT(7);
}
*(RC4_CHUNK *)outdata = otp ^ ichunk;
indata += sizeof(RC4_CHUNK);
outdata += sizeof(RC4_CHUNK);
}
#else
# define LESHFT(c) (((c)*8)&(sizeof(RC4_CHUNK)*8-1))
for (; len & (0 - sizeof(RC4_CHUNK)); len -= sizeof(RC4_CHUNK)) {
ichunk = *(RC4_CHUNK *)indata;
otp = RC4_STEP;
otp |= RC4_STEP << 8;
otp |= RC4_STEP << 16;
otp |= RC4_STEP << 24;
if (sizeof(RC4_CHUNK) == 8) {
otp |= RC4_STEP << LESHFT(4);
otp |= RC4_STEP << LESHFT(5);
otp |= RC4_STEP << LESHFT(6);
otp |= RC4_STEP << LESHFT(7);
}
*(RC4_CHUNK *)outdata = otp ^ ichunk;
indata += sizeof(RC4_CHUNK);
outdata += sizeof(RC4_CHUNK);
}
#endif
}
#endif
#define LOOP(in,out) \
x=((x+1)&0xff); \
tx=d[x]; \
y=(tx+y)&0xff; \
d[x]=ty=d[y]; \
d[y]=tx; \
(out) = d[(tx+ty)&0xff] ^ (in);
#ifndef RC4_INDEX
#define RC4_LOOP(a,b,i) LOOP(*((a)++),*((b)++))
#else
#define RC4_LOOP(a,b,i) LOOP(a[i],b[i])
#endif
i = len >> 3;
if (i) {
for (;;) {
RC4_LOOP(indata, outdata, 0);
RC4_LOOP(indata, outdata, 1);
RC4_LOOP(indata, outdata, 2);
RC4_LOOP(indata, outdata, 3);
RC4_LOOP(indata, outdata, 4);
RC4_LOOP(indata, outdata, 5);
RC4_LOOP(indata, outdata, 6);
RC4_LOOP(indata, outdata, 7);
#ifdef RC4_INDEX
indata += 8;
outdata += 8;
#endif
if (--i == 0)
break;
}
}
i = len&0x07;
if (i) {
for (;;) {
RC4_LOOP(indata, outdata, 0);
if (--i == 0)
break;
RC4_LOOP(indata, outdata, 1);
if (--i == 0)
break;
RC4_LOOP(indata, outdata, 2);
if (--i == 0)
break;
RC4_LOOP(indata, outdata, 3);
if (--i == 0)
break;
RC4_LOOP(indata, outdata, 4);
if (--i == 0)
break;
RC4_LOOP(indata, outdata, 5);
if (--i == 0)
break;
RC4_LOOP(indata, outdata, 6);
if (--i == 0)
break;
}
}
key->x = x;
key->y = y;
}
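
For reference, the LOOP macro above is the standard RC4 pseudo-random generation algorithm, producing one keystream byte per iteration. Below is an illustrative plain-C rendering of that byte-at-a-time path, leaving out the RC4_CHUNK and RC4_INDEX variants; the rc4_ref names are hypothetical and not part of the library:

#include <stddef.h>

struct rc4_ref {
	unsigned char s[256];	/* permutation, i.e. key->data */
	unsigned char x, y;	/* stream indices, i.e. key->x, key->y */
};

static void
rc4_ref_crypt(struct rc4_ref *k, size_t len, const unsigned char *in,
    unsigned char *out)
{
	unsigned char x = k->x, y = k->y, tx, ty;
	size_t i;

	for (i = 0; i < len; i++) {
		x = (x + 1) & 0xff;		/* x=((x+1)&0xff) */
		tx = k->s[x];			/* tx=d[x]        */
		y = (tx + y) & 0xff;		/* y=(tx+y)&0xff  */
		ty = k->s[y];
		k->s[x] = ty;			/* d[x]=ty=d[y]   */
		k->s[y] = tx;			/* d[y]=tx        */
		out[i] = k->s[(tx + ty) & 0xff] ^ in[i];
	}
	k->x = x;
	k->y = y;
}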

crypto/rc4/rc4_local.h Normal file

@@ -0,0 +1,5 @@
/* $OpenBSD: rc4_local.h,v 1.1 2022/11/26 16:08:54 tb Exp $ */
#ifndef HEADER_RC4_LOCL_H
#define HEADER_RC4_LOCL_H
#endif

crypto/rc4/rc4_skey.c Normal file

@@ -0,0 +1,115 @@
/* $OpenBSD: rc4_skey.c,v 1.15 2022/11/26 16:08:54 tb Exp $ */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <openssl/rc4.h>
#include "rc4_local.h"
#include <openssl/opensslv.h>
const char *
RC4_options(void)
{
#ifdef RC4_INDEX
if (sizeof(RC4_INT) == 1)
return("rc4(idx,char)");
else
return("rc4(idx,int)");
#else
if (sizeof(RC4_INT) == 1)
return("rc4(ptr,char)");
else
return("rc4(ptr,int)");
#endif
}
/* RC4 as implemented from a posting from
* Newsgroups: sci.crypt
* From: sterndark@netcom.com (David Sterndark)
* Subject: RC4 Algorithm revealed.
* Message-ID: <sternCvKL4B.Hyy@netcom.com>
* Date: Wed, 14 Sep 1994 06:35:31 GMT
*/
void
RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
{
RC4_INT tmp;
int id1, id2;
RC4_INT *d;
unsigned int i;
d = &(key->data[0]);
key->x = 0;
key->y = 0;
id1 = id2 = 0;
#define SK_LOOP(d,n) { \
tmp=d[(n)]; \
id2 = (data[id1] + tmp + id2) & 0xff; \
if (++id1 == len) id1=0; \
d[(n)]=d[id2]; \
d[id2]=tmp; }
for (i = 0; i < 256; i++)
d[i] = i;
for (i = 0; i < 256; i += 4) {
SK_LOOP(d, i + 0);
SK_LOOP(d, i + 1);
SK_LOOP(d, i + 2);
SK_LOOP(d, i + 3);
}
}
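
Since RC4 is a stream cipher, decryption is the same operation as encryption: XORing with the keystream a second time restores the plaintext. A minimal usage sketch of the two entry points defined above, RC4_set_key() and RC4(); the key and message bytes are made up for illustration:

#include <assert.h>
#include <string.h>
#include <openssl/rc4.h>

int
main(void)
{
	static const unsigned char key_bytes[] = "illustrative key";
	unsigned char msg[] = "attack at dawn";
	unsigned char ct[sizeof(msg)], pt[sizeof(msg)];
	RC4_KEY enc, dec;

	RC4_set_key(&enc, (int)(sizeof(key_bytes) - 1), key_bytes);
	RC4_set_key(&dec, (int)(sizeof(key_bytes) - 1), key_bytes);

	RC4(&enc, sizeof(msg), msg, ct);	/* encrypt */
	RC4(&dec, sizeof(msg), ct, pt);		/* decrypt: same operation */

	assert(memcmp(pt, msg, sizeof(msg)) == 0);
	return 0;
}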