-
November 20th, 2013, 07:33 AM
#1
optimizing an algorithm or two
Hello all,
Same project as Ive been posting about. After the replies I got from my last post, Ive been spending some time redoing parts of my library. When I got to the assembly code and read about optimizing it I implemented some of the ideas and doubled how fast the algorithms are. comparing my code to Microsofts BigInteger library, I am running neck to neck in the division and modular reduction algorithms, about 5 times faster in my multiplication algorithm, and ten to fifteen times faster in my exponentiation and modular expontiation algorithms, however, in my addition algorithm I am running between 10 and 20 times slower than Microsoft, and about 5 to 10 times slower in my subtraction algorithm and for the life of me I am not seeing how they do it.
My tests are run from managed code (I am running Microsofts BigInteger library from the System.Numerics namespace in managed code) I generated two arrays one of Microsofts BigIntegers, the other of my BI library with a few thousand elements in each. The arrays are built up before running the tests and once in the tests the program simply feeds random entries from the arrays and measures the time it takes to run a predefined number of iterations for each algorithm.
The addition algorithm:
Code:
AAdd64 PROC aval1:PTR qword, aval2:PTR qword
;The add alg is pretty standard, find the size of the larger array and create and copy the larger array _one larger for the result
;then add the smaller val to it cell by cell till you reach the final cell in the smaller array
;Then the alg will adc the result array until there carry bit clears
;The final part checks to make sure n_one of the high order bytes are _zero, if they are it dec the array size till it hit a non-_zero cell
;save the registers we change
push rdi
push rsi
push r8
push r11
push r12
ENTERF ;this sets up the stack for us
mov r11, rcx ;move val1 into r11
mov r12, rdx ;move val2 into rr12
mov rcx, qword ptr[r11] ;move the element count of val1 into rcx
cmp rcx, qword ptr[r12] ;compare it to the element count of val2
ja V1Big ;if val1 has more elements then val2 jump to V1Big
xchg r11, r12 ;otherwise swap val1 and val2
mov rcx, qword ptr[r11] ;move the element count of val1 into rcx
V1Big:
add rcx, 5 ;add some padding for the return value
shl rcx, 3 ;multiply it by 8
MEMGET ;call our getmem routine
cmp rax, 0 ;see if there was an error getting our memory
je MallocError ;if there was jump to mallocerror
mov r8, rax ;move the pointer into r8
mov rdi, r8 ;and move it into rdi
mov rsi, r11 ;move the pointer to val1 into rsi
mov rcx, qword ptr[rsi] ;move the element count into rcx
inc rcx ;increment the element count so that it also copies the element count
mov qword ptr[r8+rcx*8], 0 ;zero out the byte above our most significant byte in r8
pushf ;save the flags register
cld ;clear the direction flag so it increments
rep movsq ;copy val1 into our return value
popf ;restore the flags register
mov rdi, 1 ;point to the first element in the arrays
mov rcx, qword ptr[r12] ;move the element count of val2 into rcx
clc ;clear the carry flag
AddLoop:
mov rax, qword ptr[r12+rdi*8] ;move the element pointed to by rdu into rax
adc qword ptr[r8+rdi*8], rax ;add with carry rax to the element pointed to by rdi in the return val
inc rdi ;increment rdi
loop AddLoop ;loop until we reach the last element in val2
jnc GoOut ;if there wasnt a carry on the last addition then we are done
dec rdi ;ootherwise decrement rdi
cmp rdi, qword ptr[r8] ;compare it to our element count in the return value
ja GoOut ;if it is above then we are done
inc rdi ;restore rdi to the original value
mov rcx, qword ptr[r8] ;move the element count in our return value into rcx
sub rcx, qword ptr[r12] ;subtract the element count in val2 (this gives us how many times we may have to loop)
cmovz rcx, _one ;if the subtraction result was zero then move one into rcx since we want this to run at least one time since there was a carry (and if it is zero then when it hits the loop instruction then rcx goes to 0xffffffff and runs on)
stc ;set the carry flag since we couldnt have got here without a carry from above
AddLoop2:
jnc GoOut ;if there is no carry then go out
adc qword ptr[r8+rdi*8], 0 ;all we are dealing with are the bytes in the return val that are above the most significant byte in val2 so we add with carry 0
inc rdi ;increment rdi to point to the next element
loop AddLoop2 ;loop until we reach the last element
GoOut:
mov rdi, qword ptr[r8] ;move the element count into rdi
inc rdi ;increment it by one to see if we changed the byte above the current MSB to one
cmp qword ptr[r8+rdi*8], 0
je Keep ;if it is zero then keep the element count we have
inc qword ptr[r8] ;otherwise increment the element count by one
Keep:
mov rcx, qword ptr[r8] ;move the element count in the return val into rcx
CheckLoop:
mov qword ptr[r8], rcx ;move the current count into the return val
cmp qword ptr[r8+rcx*8], 0 ;compare the current element to 0
ja CheckOut ;if its not zero then jump out
loop CheckLoop ;otherwise loop until rcx=0
CheckOut:
mov rax, r8 ;move our return value into rax
EXITF ;restore the stack
;restore the registers we used
pop r12
pop r11
pop r8
pop rsi
pop rdi
RET
MallocError:
;if there was an error requesting memory try to get the error code, store it in the return value and send it back to managed code
mov rcx, 32
MEMGET ;call our getmem routine
cmp rax, 0
je ErrorOut
mov r8, rax
call GetLastError
mov qword ptr[r8], 0
mov qword ptr[r8+8], 0
mov qword ptr[r8+16], rax
ErrorOut:
mov rax, rax
jmp GoOut
AAdd64 ENDP
and the subtraction algorithm:
Code:
ASub64 PROC sval1:PTR qword, sval2:PTR qword
;This alg is a standard subtraction alg
;it creates an answer array the size of _val1
;then subtracts the arrays cell by cell till it reaches the final cell of the smaller array
;at which time it continues doing subtractions with carry on the larger till all of the bytes are transfered
;the final part does the _zero check
;save the registers we use
push rdi
push rsi
push r8
push r11
push r12
ENTERF ;This macro sets up the stack
mov r11, rcx ;move val1 into r11
mov r12, rdx ;move val2 into r12
ItsAllGood:
mov rcx, qword ptr[r11] ;move the element count of val1 into rcx
add rcx, 10 ;give the return value some padding
shl rcx,3 ;multiply the result from above by 8
MEMGET ;call our getmem routine
mov r8, rax ;move the returned pointer into r8
cmp r8, 0 ;check if r8 is zero
je MallocError ;if it is then the memory manager errored out
mov rsi, r11 ;move r11 into rsi
mov rdi, r8 ;move r8 into rdi
mov rcx, qword ptr[r11] ;get the element count of val1
inc rcx ;increment it by one so it saves the element county too
mov qword ptr[r8+rcx*8], 0 ;move zero into the byte above the last expected byte to be returned (this is done so we get the correct count at the end of the alg)
pushf ;save the flags register
cld ;clear the direction flag so it increments
rep movsq ;copy val1 into our return val
popf ;restore the flags register
inc qword ptr[r8] ;increment the element count in r8 (so it points to 0)
mov rdi, 1 ;set rdi to point to the first element
mov rcx, qword ptr[r12] ;set rcx to the element count in val2
clc ;clear the carry flag
SubLoop:
mov rax, qword ptr[r12+rdi*8] ;move the element pointed to by rdi into rax
sbb qword ptr[r8+rdi*8], rax ;subtract with borrow rax from the element pointed to by rdi
inc rdi ;increment rdi
loop SubLoop ;loop until we reach the last element in val2
jnc CheckForZ ;if the carry flag is clear we are done
cmp rdi, qword ptr[r8] ;if rdi equals the element count in r8 or is bigger then we are done
jnb CheckForZ
mov rcx, qword ptr[r8] ;otherwise move r8's element count into rcx
sub rcx, qword ptr[r12] ;and subtract the element count in val2 from it (this gives us how many elements in val1 are left to deasl with)
stc ;set the carry flag since we couldnt have got here unless there was a borrow from the last subtract=ion above
CarryLoop:
jnc CheckForZ ;once the carry flag is clear we are done
sbb qword ptr[r8+rdi*8], 0 ;at this point we are only dealing with the elements in val1 and the borrow so we subtracts with borrow 0
inc rdi ;increment rdi to point to the next element
loop CarryLoop ;loop until we reach the last element in val1
CheckForZ:
mov rcx, qword ptr[r8] ;move the element count in r8 into rcx
CheckLoop:
mov qword ptr[r8], rcx ;move the element count into r8
cmp qword ptr[r8+rcx*8], 0 ;if r8 does not equal zero we are done
ja GoOut
loop CheckLoop ;loop until rcx=0
jmp GoOut ;go out
PrepSwitchEm:
popf
xchg r11, r12
jmp ItsAllGood
PrepReturnZero:
popf
ReturnZero:
mov rcx, 32
MEMGET ;call our getmem routine
cmp rax, 0
je MallocError
mov qword ptr[rax], 1
mov qword ptr[rax+8], 0
mov r8, rax
jmp GoOut
MallocError:
;if there was an error requesting memory try to get the error code, store it in the return value and send it back to managed code
mov rcx, 32
MEMGET ;call our getmem routine
cmp rax, 0
je ErrorOut
mov r8, rax
call GetLastError
mov qword ptr[r8], 0
mov qword ptr[r8+8], 0
mov qword ptr[r8+16], rax
ErrorOut:
mov rax, rax
GoOut:
mov rax, r8 ;move our return value into rax
EXITF ;restore the stack
pop r12
pop r11
pop r8
pop rsi
pop rdi
RET
ASub64 ENDP
I managed to get the algorithms changed around to take advantage of the adc and sbb instructions which got rid of a lot of ugly code but I cant for the life of me figure out how to squeeze any more out of these algorithms.
Thanks
Posting Permissions
- You may not post new threads
- You may not post replies
- You may not post attachments
- You may not edit your posts
-
Forum Rules
|
Click Here to Expand Forum to Full Width
|