This patch has not been tested with NASM or MASM, but I think it should
work.  It has been tested only on Unix with gas (the GNU assembler).

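A new Configure option, SSE2, has been added to enable this code.  The
choice is made at build time: Configure adds -DSSE2 to the C flags, and the
perlasm scripts emit the SSE2 code path in place of the integer one when
they see that flag.  There is no runtime fallback, so a library built this
way should only be run on SSE2-capable processors.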

See http://arctic.org/~dean/crypto/rsa.html for the latest info.

-dean

diff -x 'Makefile*' -x buildinf.h -x opensslconf.h.bak -x '*.s' -ru openssl-0.9.7c/Configure openssl-0.9.7c.sse2/Configure
--- openssl-0.9.7c/Configure	2003-09-28 07:07:01.000000000 -0700
+++ openssl-0.9.7c.sse2/Configure	2003-12-07 16:01:42.000000000 -0800
@@ -90,6 +90,7 @@
 # MD5_ASM	use some extra md5 assember,
 # SHA1_ASM	use some extra sha1 assember, must define L_ENDIAN for x86
 # RMD160_ASM	use some extra ripemd160 assember,
+# SSE2		use SSE2 in x86 assembly
 
 my $x86_gcc_des="DES_PTR DES_RISC1 DES_UNROLL";
 
@@ -628,6 +629,7 @@
 my $sha1_obj="";
 my $rmd160_obj="";
 my $processor="";
+my $sse2=0;
 my $default_ranlib;
 my $perl;
 
@@ -803,6 +805,8 @@
 			}
 		elsif (/^386$/)
 			{ $processor=386; }
+		elsif (/^SSE2$/)
+			{ $sse2=1; }
 		elsif (/^rsaref$/)
 			{
 			# No RSAref support any more since it's not needed.
@@ -1158,6 +1162,7 @@
 #	$rmd160_obj=$rmd160_enc;
 	$cflags.=" -DRMD160_ASM";
 	}
+$cflags.=" -DSSE2" if ($sse2);
 
 # "Stringify" the C flags string.  This permits it to be made part of a string
 # and works as well on command lines.
diff -x 'Makefile*' -x buildinf.h -x opensslconf.h.bak -x '*.s' -ru openssl-0.9.7c/crypto/bn/asm/bn-586.pl openssl-0.9.7c.sse2/crypto/bn/asm/bn-586.pl
--- openssl-0.9.7c/crypto/bn/asm/bn-586.pl	2000-12-06 08:30:23.000000000 -0800
+++ openssl-0.9.7c.sse2/crypto/bn/asm/bn-586.pl	2003-12-07 15:46:30.000000000 -0800
@@ -38,35 +38,108 @@
 	&and("ecx",0xfffffff8);	# num / 8
 	&mov($w,&wparam(3));	#
 
-	&push("ecx");		# Up the stack for a tmp variable
-
-	&jz(&label("maw_finish"));
-
-	&set_label("maw_loop",0);
-
-	&mov(&swtmp(0),"ecx");	#
-
-	for ($i=0; $i<32; $i+=4)
-		{
-		&comment("Round $i");
-
-		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
-		&mul($w);			# *a * w
-		&add("eax",$c);		# L(t)+= *r
-		 &mov($c,&DWP($i,$r,"",0));	# L(t)+= *r
-		&adc("edx",0);			# H(t)+=carry
-		 &add("eax",$c);		# L(t)+=c
-		&adc("edx",0);			# H(t)+=carry
-		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
-		&mov($c,"edx");			# c=  H(t);
-		}
-
-	&comment("");
-	&mov("ecx",&swtmp(0));	#
-	&add($a,32);
-	&add($r,32);
-	&sub("ecx",8);
-	&jnz(&label("maw_loop"));
+	if ($sse2) {
+		&jz(&label("maw_finish"));
+		&movd("mm0",$w);		# mm0 = w
+		&pxor("mm1","mm1");		# mm1 = carry_in
+
+		&set_label("maw_loop",0);
+		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
+		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
+		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
+		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
+		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
+		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
+		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
+		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
+		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
+		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
+		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
+		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
+		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
+		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
+		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
+		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
+		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
+		&movd(&DWP(0,$r,"",0),"mm1");
+		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
+		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
+		&psrlq("mm1",32);		# mm1 = carry0
+		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
+		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
+		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
+		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
+		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
+		&movd(&DWP(4,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry1
+		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
+		&add($a,32);
+		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
+		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
+		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
+		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
+		&movd(&DWP(8,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry2
+		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
+		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
+		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
+		&movd(&DWP(12,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry3
+		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
+		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
+		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
+		&movd(&DWP(16,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry4
+		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
+		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
+		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
+		&movd(&DWP(20,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry5
+		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
+		&movd(&DWP(24,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry6
+		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
+		&movd(&DWP(28,$r,"",0),"mm1");
+		&add($r,32);
+		&psrlq("mm1",32);		# mm1 = carry_out
+
+		&sub("ecx",8);
+		&jnz(&label("maw_loop"));
+
+		&movd($c,"mm1");		# c = carry_out
+		&emms();
+	}
+	else {
+		&push("ecx");		# Up the stack for a tmp variable
+
+		&jz(&label("maw_finish"));
+
+		&set_label("maw_loop",0);
+
+		&mov(&swtmp(0),"ecx");	#
+
+		for ($i=0; $i<32; $i+=4)
+			{
+			&comment("Round $i");
+
+			&mov("eax",&DWP($i,$a,"",0)); 	# *a
+			&mul($w);			# *a * w
+			&add("eax",$c);		# L(t)+=c
+			&mov($c,&DWP($i,$r,"",0));	# load *r
+			&adc("edx",0);			# H(t)+=carry
+			&add("eax",$c);		# L(t)+= *r
+			&adc("edx",0);			# H(t)+=carry
+			&mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
+			&mov($c,"edx");			# c=  H(t);
+			}
+
+		&comment("");
+		&mov("ecx",&swtmp(0));	#
+		&add($a,32);
+		&add($r,32);
+		&sub("ecx",8);
+		&jnz(&label("maw_loop"));
+	}
 
 	&set_label("maw_finish",0);
 	&mov("ecx",&wparam(2));	# get num
@@ -93,7 +166,9 @@
 	&set_label("maw_end",0);
 	&mov("eax",$c);
 
-	&pop("ecx");	# clear variable from
+	if (!$sse2) {
+		&pop("ecx");	# clear tmp variable from the stack
+	}
 
 	&function_end($name);
 	}
diff -x 'Makefile*' -x buildinf.h -x opensslconf.h.bak -x '*.s' -ru openssl-0.9.7c/crypto/perlasm/x86asm.pl openssl-0.9.7c.sse2/crypto/perlasm/x86asm.pl
--- openssl-0.9.7c/crypto/perlasm/x86asm.pl	2003-01-12 08:35:27.000000000 -0800
+++ openssl-0.9.7c.sse2/crypto/perlasm/x86asm.pl	2003-12-07 15:46:26.000000000 -0800
@@ -48,7 +48,11 @@
 		}
 
 	$pic=0;
-	for (@ARGV) {	$pic=1 if (/\-[fK]PIC/i);	}
+	$sse2=0;
+	for (@ARGV) {
+		$pic=1 if (/\-[fK]PIC/i);
+		$sse2=1 if (/\-DSSE2/);
+	}
 
 	&asm_init_output();
 
diff -x 'Makefile*' -x buildinf.h -x opensslconf.h.bak -x '*.s' -ru openssl-0.9.7c/crypto/perlasm/x86ms.pl openssl-0.9.7c.sse2/crypto/perlasm/x86ms.pl
--- openssl-0.9.7c/crypto/perlasm/x86ms.pl	2003-09-27 15:14:47.000000000 -0700
+++ openssl-0.9.7c.sse2/crypto/perlasm/x86ms.pl	2003-12-07 15:59:09.000000000 -0800
@@ -161,6 +161,13 @@
 sub main'ret	{ &out0("ret"); }
 sub main'nop	{ &out0("nop"); }
 
+sub main'emms	{ &out0("emms"); }
+sub main'movd	{ &out2("movd",@_); }
+sub main'paddq	{ &out2("paddq",@_); }
+sub main'pmuludq{ &out2("pmuludq",@_); }
+sub main'psrlq	{ &out2("psrlq",@_); }
+sub main'pxor	{ &out2("pxor",@_); }
+
 sub out2
 	{
 	local($name,$p1,$p2)=@_;
diff -x 'Makefile*' -x buildinf.h -x opensslconf.h.bak -x '*.s' -ru openssl-0.9.7c/crypto/perlasm/x86nasm.pl openssl-0.9.7c.sse2/crypto/perlasm/x86nasm.pl
--- openssl-0.9.7c/crypto/perlasm/x86nasm.pl	2003-09-27 15:14:47.000000000 -0700
+++ openssl-0.9.7c.sse2/crypto/perlasm/x86nasm.pl	2003-12-07 15:58:52.000000000 -0800
@@ -170,6 +170,13 @@
 sub main'ret	{ &out0("ret"); }
 sub main'nop	{ &out0("nop"); }
 
+sub main'emms	{ &out0("emms"); }
+sub main'movd	{ &out2("movd",@_); }
+sub main'paddq	{ &out2("paddq",@_); }
+sub main'pmuludq{ &out2("pmuludq",@_); }
+sub main'psrlq	{ &out2("psrlq",@_); }
+sub main'pxor	{ &out2("pxor",@_); }
+
 sub out2
 	{
 	my($name,$p1,$p2)=@_;
diff -x 'Makefile*' -x buildinf.h -x opensslconf.h.bak -x '*.s' -ru openssl-0.9.7c/crypto/perlasm/x86unix.pl openssl-0.9.7c.sse2/crypto/perlasm/x86unix.pl
--- openssl-0.9.7c/crypto/perlasm/x86unix.pl	2003-09-27 15:14:47.000000000 -0700
+++ openssl-0.9.7c.sse2/crypto/perlasm/x86unix.pl	2003-12-07 14:15:25.000000000 -0800
@@ -51,6 +51,14 @@
 	'edi',	'%edi',
 	'ebp',	'%ebp',
 	'esp',	'%esp',
+	'mm0',	'%mm0',
+	'mm1',	'%mm1',
+	'mm2',	'%mm2',
+	'mm3',	'%mm3',
+	'mm4',	'%mm4',
+	'mm5',	'%mm5',
+	'mm6',	'%mm6',
+	'mm7',	'%mm7',
 	);
 
 %reg_val=(
@@ -174,6 +182,13 @@
 sub main'ret	{ &out0("ret"); }
 sub main'nop	{ &out0("nop"); }
 
+sub main'emms	{ &out0("emms"); }
+sub main'movd	{ &out2("movd",@_); }
+sub main'paddq	{ &out2("paddq",@_); }
+sub main'pmuludq{ &out2("pmuludq",@_); }
+sub main'psrlq	{ &out2("psrlq",@_); }
+sub main'pxor	{ &out2("pxor",@_); }
+
 # The bswapl instruction is new for the 486. Emulate if i386.
 sub main'bswap
 	{
