diff options
| -rw-r--r-- | Math_Ext.jai | 47 | ||||
| -rw-r--r-- | Math_Test.jai | 57 |
2 files changed, 64 insertions, 40 deletions
diff --git a/Math_Ext.jai b/Math_Ext.jai index 500962f..368779f 100644 --- a/Math_Ext.jai +++ b/Math_Ext.jai @@ -74,35 +74,53 @@ add :: (x: $Tx, y: $Ty, $USE_GENERIC: bool = false) -> result: $Tr, saturated: b result: Tr = ---; saturated: bool = ---; + S_ADD_ASM :: #string DONE #asm { - // Calculate limit based on x's sign. - mov limit: gpr, MAX; - mov sign: gpr, x; - shr.SIZE sign, BITS; - add.SIZE limit, sign; // If sign is 1, then limit will overflow from MAX to MIN. - mov result, x; - add.SIZE result, y; - seto saturated; - cmovo result, limit; + // Performance + // s8 | s16 | s32 | s64 + // 1.243 | 1.242 | 1.215 | 1.210 + // + mov result, -1; // Pre-set result with signed maximum: set all ones. + shr.SIZE result, 1; // Pre-set result with signed maximum: insert zero on MSB. + bt x, SHIFT; // Test signal bit (affect CF). + adc result, 0; // Overflow signed maximum to signed minimum if CF is set. + + add.SIZE x, y; // Add values (affect OF). + seto saturated; // Set saturated flat if OF. + cmovno result, x; // Move add-result to result if NOT OF. + + // Performance - a bit of improvement... with some more code. + // s8 | s16 | s32 | s64 + // 1.336 | 1.305 | 1.217 | 1.210 + // + // mov sign: gpr, x; // Copy x value to sign variable. + // mov limit: gpr, MAX; // Pre-set limit with signed maximum. + // shr.SIZE sign, SHIFT; // Get sign of x value. + // add.SIZE limit, sign; // If sign is 1, overflow from signed maximum to signed minimum. + // + // mov result, x; // Copy x value to result. + // add.SIZE result, y; // Add values (affect OF). + // seto saturated; // Set saturated flag if OF. + // cmovo result, limit; // Move limit to result if OF. } DONE #if Tr == s8 - #insert #run replace(replace(replace(S_ADD_ASM, ".SIZE", ".b"), "MAX", "127"), "BITS", "7"); + #insert #run replace(replace(S_ADD_ASM, ".SIZE", ".b"), "SHIFT", "7"); // , "MAX", "127") #if Tr == s16 - #insert #run replace(replace(replace(S_ADD_ASM, ".SIZE", ".w"), "MAX", "32767"), "BITS", "15"); + #insert #run replace(replace(S_ADD_ASM, ".SIZE", ".w"), "SHIFT", "15"); // , "MAX", "32767") #if Tr == s32 - #insert #run replace(replace(replace(S_ADD_ASM, ".SIZE", ".d"), "MAX", "2147483647"), "BITS", "31"); + #insert #run replace(replace(S_ADD_ASM, ".SIZE", ".d"), "SHIFT", "31"); // , "MAX", "2147483647") #if Tr == s64 - #insert #run replace(replace(replace(S_ADD_ASM, ".SIZE", ".q"), "MAX", "9223372036854775807"), "BITS", "63"); + #insert #run replace(replace(S_ADD_ASM, ".SIZE", ".q"), "SHIFT", "63"); // , "MAX", "9223372036854775807") U_ADD_ASM :: #string DONE #asm { - add.SIZE x, y; // Add values. mov result, -1; // Pre-set result with unsigned maximum. + add.SIZE x, y; // Add values (affect CF). setc saturated; // Set saturated flag if CF. cmovnc result, x; // Move add-result to result if NOT CF. } @@ -152,6 +170,7 @@ sub :: (x: $Tx, y: $Ty, $USE_GENERIC: bool = false) -> result: $Tr, overflow: bo result: Tr = ---; saturated: bool = ---; + S_SUB_ASM :: #string DONE #asm { // Calculate limit based on x's sign. diff --git a/Math_Test.jai b/Math_Test.jai index d03bd32..bd9990b 100644 --- a/Math_Test.jai +++ b/Math_Test.jai @@ -167,26 +167,23 @@ main :: () { performance_test :: ($operation: string, $type: Type, print_result: bool = true) -> ops_per_us_gen: float, ops_per_us_asm: float { NUM_TESTS :: 5000; - // DATA_SIZE :: 32768; - DATA_SIZE_BITS :: 96*1024*8; + DATA_SIZE_BITS :: 64*1024*8/2; #if type == s8 || type == u8 then - DATA_SIZE :: DATA_SIZE_BITS/3/8; + DATA_SIZE :: DATA_SIZE_BITS/8; else #if type == s16 || type == u16 then - DATA_SIZE :: DATA_SIZE_BITS/3/16; + DATA_SIZE :: DATA_SIZE_BITS/16; else #if type == s32 || type == u32 then - DATA_SIZE :: DATA_SIZE_BITS/3/32; + DATA_SIZE :: DATA_SIZE_BITS/32; else #if type == s64 || type == u64 then - DATA_SIZE :: DATA_SIZE_BITS/3/64; + DATA_SIZE :: DATA_SIZE_BITS/64; best_gen := 0.0; best_asm := 0.0; numbers_xgen: [..] type; numbers_ygen: [..] type; - numbers_zgen: [DATA_SIZE] type; numbers_xasm: [..] type; numbers_yasm: [..] type; - numbers_zasm: [DATA_SIZE] type; array_reserve(*numbers_xgen, DATA_SIZE); array_reserve(*numbers_ygen, DATA_SIZE); array_reserve(*numbers_xasm, DATA_SIZE); @@ -212,18 +209,21 @@ main :: () { array_add(*numbers_xasm, x); array_add(*numbers_yasm, y); } - + for 0..NUM_TESTS-1 { + r_gen: type = 0; + r_asm: type = 0; + time_gen := current_time_monotonic(); - for 0..DATA_SIZE-1 #insert #run replace("numbers_zgen[it] = OP(numbers_xgen[it], numbers_ygen[it], true);", "OP", operation); + for 0..DATA_SIZE-1 #insert #run replace("r_gen ^= OP(numbers_xgen[it], numbers_ygen[it], true);", "OP", operation); time_gen = current_time_monotonic() - time_gen; time_asm := current_time_monotonic(); - for 0..DATA_SIZE-1 #insert #run replace("numbers_zasm[it] = OP(numbers_xasm[it], numbers_yasm[it]);", "OP", operation); + for 0..DATA_SIZE-1 #insert #run replace("r_asm ^= OP(numbers_xasm[it], numbers_yasm[it]);", "OP", operation); time_asm = current_time_monotonic() - time_asm; - for 0..DATA_SIZE-1 assert(numbers_zgen[it] == numbers_zasm[it]); + assert(r_gen == r_asm); perf_gen := cast(float)DATA_SIZE/cast(float)to_nanoseconds(time_gen); perf_asm := cast(float)DATA_SIZE/cast(float)to_nanoseconds(time_asm); @@ -231,26 +231,31 @@ main :: () { best_asm = max(best_asm, perf_asm); } - if print_result { - if type == s8 || type == u8 write_string(" "); - print("% | % | % | %\n", type, best_gen, best_asm, DATA_SIZE); + tmp_context := context; + push_context tmp_context { + ff := *context.print_style.default_format_float; + ff.zero_removal = .NO; + ff.width = 7; + ff.trailing_width = 3; + + fi := *context.print_style.default_format_int; + fi.minimum_digits = 3; + + if print_result { + if type == s8 || type == u8 write_string(" "); + print("% | % | % | %\n", type, best_gen, best_asm, cast(int)(100*best_asm/best_gen)); + } } return best_gen, best_asm; } - ff := context.print_style.default_format_float; - ff.zero_removal = .NO; - ff.width = 7; - ff.trailing_width = 3; - context.print_style.default_format_float = ff; - write_strings( " | (ops / nsec) |\n", - " | generic | x64 asm |\n" + " T | generic | x64 asm | %\n" ); write_strings( - "--- | ----------------- |\n", + "--- | ----------------- | ---\n", " | add |\n" ); performance_test("add", u8); @@ -263,7 +268,7 @@ main :: () { performance_test("add", s64); write_strings( - "--- | ----------------- |\n", + "--- | ----------------- | ---\n", " | sub |\n" ); performance_test("sub", u8); @@ -276,7 +281,7 @@ main :: () { performance_test("sub", s64); write_strings( - "--- | ----------------- |\n", + "--- | ----------------- | ---\n", " | mul |\n" ); performance_test("mul", u8); @@ -289,7 +294,7 @@ main :: () { performance_test("mul", s64); write_strings( - "--- | ----------------- |\n", + "--- | ----------------- | ---\n", " | div |\n" ); performance_test("div", u8); |
