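x86/lib: Use non-temporal stores for faster page clearing

Replace the three clear_page() variants (the "rep stosq" default, the
"rep stosb" ERMS variant and the unrolled movq loop) with a single loop
of movnti stores that clears 128 bytes per iteration and ends with an
sfence.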

Signed-off-by: Daniel J Blueman <daniel@numascale.com>

diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index a2fe51b..ea6b272 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -14,25 +14,12 @@
  * %rdi	- page
  */
 ENTRY(clear_page)
-
-	ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp clear_page_c_e", X86_FEATURE_ERMS
-
-	movl $4096/8,%ecx
-	xorl %eax,%eax
-	rep stosq
-	ret
-ENDPROC(clear_page)
-
-ENTRY(clear_page_orig)
-
 	xorl   %eax,%eax
-	movl   $4096/64,%ecx
-	.p2align 4
-.Lloop:
-	decl	%ecx
-#define PUT(x) movq %rax,x*8(%rdi)
-	movq %rax,(%rdi)
+	movl   $4096/128,%ecx	/* loop count: 128 bytes are cleared per pass */
+	movl   $128,%edx	/* per-pass stride; note this clobbers %rdx */
+.Lloop:			/* .L prefix keeps the label out of the symbol table */
+#define PUT(x) movnti %rax,x*8(%rdi)
+	PUT(0)			/* PUT(n): non-temporal store of %rax (zero) at n*8(%rdi) */
 	PUT(1)
 	PUT(2)
 	PUT(3)
@@ -40,15 +27,18 @@ ENTRY(clear_page_orig)
 	PUT(5)
 	PUT(6)
 	PUT(7)
-	leaq	64(%rdi),%rdi
-	jnz	.Lloop
-	nop
+	PUT(8)
+	PUT(9)
+	PUT(10)
+	PUT(11)
+	PUT(12)
+	PUT(13)
+	PUT(14)
+	PUT(15)
+	addq    %rdx,%rdi	/* advance to the next 128-byte chunk */
+	decl	%ecx		/* sets ZF when the last chunk is done */
+	jnz	.Lloop
+	sfence			/* order the weakly-ordered movnti stores before returning */
 	ret
-ENDPROC(clear_page_orig)
+ENDPROC(clear_page)
 
-ENTRY(clear_page_c_e)
-	movl $4096,%ecx
-	xorl %eax,%eax
-	rep stosb
-	ret
-ENDPROC(clear_page_c_e)
